5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
154 #define STATIC static
157 /* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
161 typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
166 bool in_gosub; /* this or an outer frame is for GOSUB */
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
173 /* Certain characters are output as a sequence with the first being a
175 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
178 struct RExC_state_t {
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
181 char *precomp; /* uncompiled string. */
182 char *precomp_end; /* pointer to end of uncompiled string. */
183 REGEXP *rx_sv; /* The SV that is the regexp. */
184 regexp *rx; /* perl core regexp structure */
185 regexp_internal *rxi; /* internal data for regexp object
187 char *start; /* Start of input for compile */
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
196 SSize_t whilem_seen; /* number of WHILEM in this expr */
197 regnode *emit_start; /* Start of emitted-code area */
198 regnode_offset emit; /* Code-emit pointer */
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
201 SSize_t size; /* Number of regnode equivalents in
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
228 I32 nestroot; /* root parens we are in - used by
231 regnode *end_op; /* END node in program */
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
240 I32 recurse_count; /* Number of recurse regops we have generated */
241 regnode **recurse; /* Recurse regops */
242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
248 I32 override_recoding;
249 I32 recode_x_to_native;
250 I32 in_multi_char_class;
251 int code_index; /* next code_blocks[] slot */
252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
254 SSize_t maxlen; /* mininum possible number of chars in string to match */
255 scan_frame *frame_head;
256 scan_frame *frame_last;
260 SV *runtime_code_qr; /* qr with the runtime code blocks */
262 const char *lastparse;
264 U32 study_chunk_recursed_count;
265 AV *paren_name_list; /* idx -> name */
269 #define RExC_lastparse (pRExC_state->lastparse)
270 #define RExC_lastnum (pRExC_state->lastnum)
271 #define RExC_paren_name_list (pRExC_state->paren_name_list)
272 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
273 #define RExC_mysv (pRExC_state->mysv1)
274 #define RExC_mysv1 (pRExC_state->mysv1)
275 #define RExC_mysv2 (pRExC_state->mysv2)
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
287 #define RExC_flags (pRExC_state->flags)
288 #define RExC_pm_flags (pRExC_state->pm_flags)
289 #define RExC_precomp (pRExC_state->precomp)
290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
292 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
293 #define RExC_precomp_end (pRExC_state->precomp_end)
294 #define RExC_rx_sv (pRExC_state->rx_sv)
295 #define RExC_rx (pRExC_state->rx)
296 #define RExC_rxi (pRExC_state->rxi)
297 #define RExC_start (pRExC_state->start)
298 #define RExC_end (pRExC_state->end)
299 #define RExC_parse (pRExC_state->parse)
300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
301 #define RExC_whilem_seen (pRExC_state->whilem_seen)
302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
305 #ifdef RE_TRACK_PATTERN_OFFSETS
306 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
309 #define RExC_emit (pRExC_state->emit)
310 #define RExC_emit_start (pRExC_state->emit_start)
311 #define RExC_sawback (pRExC_state->sawback)
312 #define RExC_seen (pRExC_state->seen)
313 #define RExC_size (pRExC_state->size)
314 #define RExC_maxlen (pRExC_state->maxlen)
315 #define RExC_npar (pRExC_state->npar)
316 #define RExC_total_parens (pRExC_state->total_par)
317 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
318 #define RExC_nestroot (pRExC_state->nestroot)
319 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
320 #define RExC_utf8 (pRExC_state->utf8)
321 #define RExC_uni_semantics (pRExC_state->uni_semantics)
322 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
323 #define RExC_open_parens (pRExC_state->open_parens)
324 #define RExC_close_parens (pRExC_state->close_parens)
325 #define RExC_end_op (pRExC_state->end_op)
326 #define RExC_paren_names (pRExC_state->paren_names)
327 #define RExC_recurse (pRExC_state->recurse)
328 #define RExC_recurse_count (pRExC_state->recurse_count)
329 #define RExC_sets_depth (pRExC_state->sets_depth)
330 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
331 #define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
333 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
334 #define RExC_in_lookahead (pRExC_state->in_lookahead)
335 #define RExC_contains_locale (pRExC_state->contains_locale)
336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
339 # define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
342 # define SET_recode_x_to_native(x) NOOP
345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
346 #define RExC_frame_head (pRExC_state->frame_head)
347 #define RExC_frame_last (pRExC_state->frame_last)
348 #define RExC_frame_count (pRExC_state->frame_count)
349 #define RExC_strict (pRExC_state->strict)
350 #define RExC_study_started (pRExC_state->study_started)
351 #define RExC_warn_text (pRExC_state->warn_text)
352 #define RExC_in_script_run (pRExC_state->in_script_run)
353 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
363 #define RExC_naughty (pRExC_state->naughty)
364 #define TOO_NAUGHTY (10)
365 #define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368 #define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
372 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373 #define ISMULT2(s) (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
376 * Flags to be passed up and down.
378 #define HASWIDTH 0x01 /* Known to not match null strings, could match
380 #define SIMPLE 0x02 /* Exactly one character wide */
381 /* (or LNBREAK as a special case) */
382 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
383 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
384 #define RESTART_PARSE 0x20 /* Need to redo the parse */
385 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
386 calcuate sizes as UTF-8 */
388 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
390 /* whether trie related optimizations are enabled */
391 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
392 #define TRIE_STUDY_OPT
393 #define FULL_TRIE_STUDY
399 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
400 #define PBITVAL(paren) (1 << ((paren) & 7))
401 #define PAREN_OFFSET(depth) \
402 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
403 #define PAREN_TEST(depth, paren) \
404 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
405 #define PAREN_SET(depth, paren) \
406 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
407 #define PAREN_UNSET(depth, paren) \
408 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
410 #define REQUIRE_UTF8(flagp) STMT_START { \
412 *flagp = RESTART_PARSE|NEED_UTF8; \
417 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
418 * a flag that indicates we need to override /d with /u as a result of
419 * something in the pattern. It should only be used in regards to calling
420 * set_regex_charset() or get_regex_charset() */
421 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
423 if (DEPENDS_SEMANTICS) { \
424 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
425 RExC_uni_semantics = 1; \
426 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
427 /* No need to restart the parse if we haven't seen \
428 * anything that differs between /u and /d, and no need \
429 * to restart immediately if we're going to reparse \
430 * anyway to count parens */ \
431 *flagp |= RESTART_PARSE; \
432 return restart_retval; \
437 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
439 RExC_use_BRANCHJ = 1; \
440 *flagp |= RESTART_PARSE; \
441 return restart_retval; \
444 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
445 * less. After that, it must always be positive, because the whole re is
446 * considered to be surrounded by virtual parens. Setting it to negative
447 * indicates there is some construct that needs to know the actual number of
448 * parens to be properly handled. And that means an extra pass will be
449 * required after we've counted them all */
450 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
451 #define REQUIRE_PARENS_PASS \
452 STMT_START { /* No-op if have completed a pass */ \
453 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
455 #define IN_PARENS_PASS (RExC_total_parens < 0)
458 /* This is used to return failure (zero) early from the calling function if
459 * various flags in 'flags' are set. Two flags always cause a return:
460 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
461 * additional flags that should cause a return; 0 if none. If the return will
462 * be done, '*flagp' is first set to be all of the flags that caused the
464 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
466 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
467 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
472 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
474 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
475 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
476 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
477 if (MUST_RESTART(*(flagp))) return 0
479 /* This converts the named class defined in regcomp.h to its equivalent class
480 * number defined in handy.h. */
481 #define namedclass_to_classnum(class) ((int) ((class) / 2))
482 #define classnum_to_namedclass(classnum) ((classnum) * 2)
484 #define _invlist_union_complement_2nd(a, b, output) \
485 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
486 #define _invlist_intersection_complement_2nd(a, b, output) \
487 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
489 /* We add a marker if we are deferring expansion of a property that is both
490 * 1) potentiallly user-defined; and
491 * 2) could also be an official Unicode property.
493 * Without this marker, any deferred expansion can only be for a user-defined
494 * one. This marker shouldn't conflict with any that could be in a legal name,
495 * and is appended to its name to indicate this. There is a string and
497 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
498 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
500 /* What is infinity for optimization purposes */
501 #define OPTIMIZE_INFTY SSize_t_MAX
503 /* About scan_data_t.
505 During optimisation we recurse through the regexp program performing
506 various inplace (keyhole style) optimisations. In addition study_chunk
507 and scan_commit populate this data structure with information about
508 what strings MUST appear in the pattern. We look for the longest
509 string that must appear at a fixed location, and we look for the
510 longest string that may appear at a floating location. So for instance
515 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
516 strings (because they follow a .* construct). study_chunk will identify
517 both FOO and BAR as being the longest fixed and floating strings respectively.
519 The strings can be composites, for instance
523 will result in a composite fixed substring 'foo'.
525 For each string some basic information is maintained:
528 This is the position the string must appear at, or not before.
529 It also implicitly (when combined with minlenp) tells us how many
530 characters must match before the string we are searching for.
531 Likewise when combined with minlenp and the length of the string it
532 tells us how many characters must appear after the string we have
536 Only used for floating strings. This is the rightmost point that
537 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
538 string can occur infinitely far to the right.
539 For fixed strings, it is equal to min_offset.
542 A pointer to the minimum number of characters of the pattern that the
543 string was found inside. This is important as in the case of positive
544 lookahead or positive lookbehind we can have multiple patterns
549 The minimum length of the pattern overall is 3, the minimum length
550 of the lookahead part is 3, but the minimum length of the part that
551 will actually match is 1. So 'FOO's minimum length is 3, but the
552 minimum length for the F is 1. This is important as the minimum length
553 is used to determine offsets in front of and behind the string being
554 looked for. Since strings can be composites this is the length of the
555 pattern at the time it was committed with a scan_commit. Note that
556 the length is calculated by study_chunk, so that the minimum lengths
557 are not known until the full pattern has been compiled, thus the
558 pointer to the value.
562 In the case of lookbehind the string being searched for can be
563 offset past the start point of the final matching string.
564 If this value was just blithely removed from the min_offset it would
565 invalidate some of the calculations for how many chars must match
566 before or after (as they are derived from min_offset and minlen and
567 the length of the string being searched for).
568 When the final pattern is compiled and the data is moved from the
569 scan_data_t structure into the regexp structure the information
570 about lookbehind is factored in, with the information that would
571 have been lost precalculated in the end_shift field for the
574 The fields pos_min and pos_delta are used to store the minimum offset
575 and the delta to the maximum offset at the current point in the pattern.
579 struct scan_data_substrs {
580 SV *str; /* longest substring found in pattern */
581 SSize_t min_offset; /* earliest point in string it can appear */
582 SSize_t max_offset; /* latest point in string it can appear */
583 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
584 SSize_t lookbehind; /* is the pos of the string modified by LB */
585 I32 flags; /* per substring SF_* and SCF_* flags */
588 typedef struct scan_data_t {
589 /*I32 len_min; unused */
590 /*I32 len_delta; unused */
594 SSize_t last_end; /* min value, <0 unless valid. */
595 SSize_t last_start_min;
596 SSize_t last_start_max;
597 U8 cur_is_floating; /* whether the last_* values should be set as
598 * the next fixed (0) or floating (1)
601 /* [0] is longest fixed substring so far, [1] is longest float so far */
602 struct scan_data_substrs substrs[2];
604 I32 flags; /* common SF_* and SCF_* flags */
606 SSize_t *last_closep;
607 regnode_ssc *start_class;
611 * Forward declarations for pregcomp()'s friends.
614 static const scan_data_t zero_scan_data = {
615 0, 0, NULL, 0, 0, 0, 0,
617 { NULL, 0, 0, 0, 0, 0 },
618 { NULL, 0, 0, 0, 0, 0 },
625 #define SF_BEFORE_SEOL 0x0001
626 #define SF_BEFORE_MEOL 0x0002
627 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
629 #define SF_IS_INF 0x0040
630 #define SF_HAS_PAR 0x0080
631 #define SF_IN_PAR 0x0100
632 #define SF_HAS_EVAL 0x0200
635 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
636 * longest substring in the pattern. When it is not set the optimiser keeps
637 * track of position, but does not keep track of the actual strings seen,
639 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
642 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
643 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
644 * turned off because of the alternation (BRANCH). */
645 #define SCF_DO_SUBSTR 0x0400
647 #define SCF_DO_STCLASS_AND 0x0800
648 #define SCF_DO_STCLASS_OR 0x1000
649 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
650 #define SCF_WHILEM_VISITED_POS 0x2000
652 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
653 #define SCF_SEEN_ACCEPT 0x8000
654 #define SCF_TRIE_DOING_RESTUDY 0x10000
655 #define SCF_IN_DEFINE 0x20000
660 #define UTF cBOOL(RExC_utf8)
662 /* The enums for all these are ordered so things work out correctly */
663 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
664 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
665 == REGEX_DEPENDS_CHARSET)
666 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
667 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
668 >= REGEX_UNICODE_CHARSET)
669 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
670 == REGEX_ASCII_RESTRICTED_CHARSET)
671 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
672 >= REGEX_ASCII_RESTRICTED_CHARSET)
673 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
674 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
676 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
678 /* For programs that want to be strictly Unicode compatible by dying if any
679 * attempt is made to match a non-Unicode code point against a Unicode
681 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
683 #define OOB_NAMEDCLASS -1
685 /* There is no code point that is out-of-bounds, so this is problematic. But
686 * its only current use is to initialize a variable that is always set before
688 #define OOB_UNICODE 0xDEADBEEF
690 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
693 /* length of regex to show in messages that don't mark a position within */
694 #define RegexLengthToShowInErrorMessages 127
697 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
698 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
699 * op/pragma/warn/regcomp.
701 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
702 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
704 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
705 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
707 /* The code in this file in places uses one level of recursion with parsing
708 * rebased to an alternate string constructed by us in memory. This can take
709 * the form of something that is completely different from the input, or
710 * something that uses the input as part of the alternate. In the first case,
711 * there should be no possibility of an error, as we are in complete control of
712 * the alternate string. But in the second case we don't completely control
713 * the input portion, so there may be errors in that. Here's an example:
715 * is handled specially because \x{df} folds to a sequence of more than one
716 * character: 'ss'. What is done is to create and parse an alternate string,
717 * which looks like this:
718 * /(?:\x{DF}|[abc\x{DF}def])/ui
719 * where it uses the input unchanged in the middle of something it constructs,
720 * which is a branch for the DF outside the character class, and clustering
721 * parens around the whole thing. (It knows enough to skip the DF inside the
722 * class while in this substitute parse.) 'abc' and 'def' may have errors that
723 * need to be reported. The general situation looks like this:
725 * |<------- identical ------>|
727 * Input: ---------------------------------------------------------------
728 * Constructed: ---------------------------------------------------
730 * |<------- identical ------>|
732 * sI..eI is the portion of the input pattern we are concerned with here.
733 * sC..EC is the constructed substitute parse string.
734 * sC..tC is constructed by us
735 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
736 * In the diagram, these are vertically aligned.
737 * eC..EC is also constructed by us.
738 * xC is the position in the substitute parse string where we found a
740 * xI is the position in the original pattern corresponding to xC.
742 * We want to display a message showing the real input string. Thus we need to
743 * translate from xC to xI. We know that xC >= tC, since the portion of the
744 * string sC..tC has been constructed by us, and so shouldn't have errors. We
746 * xI = tI + (xC - tC)
748 * When the substitute parse is constructed, the code needs to set:
751 * RExC_copy_start_in_input (tI)
752 * RExC_copy_start_in_constructed (tC)
753 * and restore them when done.
755 * During normal processing of the input pattern, both
756 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
757 * sI, so that xC equals xI.
760 #define sI RExC_precomp
761 #define eI RExC_precomp_end
762 #define sC RExC_start
764 #define tI RExC_copy_start_in_input
765 #define tC RExC_copy_start_in_constructed
766 #define xI(xC) (tI + (xC - tC))
767 #define xI_offset(xC) (xI(xC) - sI)
769 #define REPORT_LOCATION_ARGS(xC) \
771 (xI(xC) > eI) /* Don't run off end */ \
772 ? eI - sI /* Length before the <--HERE */ \
773 : ((xI_offset(xC) >= 0) \
775 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
776 IVdf " trying to output message for " \
778 __FILE__, __LINE__, (IV) xI_offset(xC), \
779 ((int) (eC - sC)), sC), 0)), \
780 sI), /* The input pattern printed up to the <--HERE */ \
782 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
783 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
785 /* Used to point after bad bytes for an error message, but avoid skipping
786 * past a nul byte. */
787 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
789 /* Set up to clean up after our imminent demise */
790 #define PREPARE_TO_DIE \
793 SAVEFREESV(RExC_rx_sv); \
794 if (RExC_open_parens) \
795 SAVEFREEPV(RExC_open_parens); \
796 if (RExC_close_parens) \
797 SAVEFREEPV(RExC_close_parens); \
801 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
802 * arg. Show regex, up to a maximum length. If it's too long, chop and add
805 #define _FAIL(code) STMT_START { \
806 const char *ellipses = ""; \
807 IV len = RExC_precomp_end - RExC_precomp; \
810 if (len > RegexLengthToShowInErrorMessages) { \
811 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
812 len = RegexLengthToShowInErrorMessages - 10; \
818 #define FAIL(msg) _FAIL( \
819 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
820 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
822 #define FAIL2(msg,arg) _FAIL( \
823 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
824 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
826 #define FAIL3(msg,arg1,arg2) _FAIL( \
827 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
828 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
831 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
833 #define Simple_vFAIL(m) STMT_START { \
834 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
835 m, REPORT_LOCATION_ARGS(RExC_parse)); \
839 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
841 #define vFAIL(m) STMT_START { \
847 * Like Simple_vFAIL(), but accepts two arguments.
849 #define Simple_vFAIL2(m,a1) STMT_START { \
850 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
851 REPORT_LOCATION_ARGS(RExC_parse)); \
855 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
857 #define vFAIL2(m,a1) STMT_START { \
859 Simple_vFAIL2(m, a1); \
864 * Like Simple_vFAIL(), but accepts three arguments.
866 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
867 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
868 REPORT_LOCATION_ARGS(RExC_parse)); \
872 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
874 #define vFAIL3(m,a1,a2) STMT_START { \
876 Simple_vFAIL3(m, a1, a2); \
880 * Like Simple_vFAIL(), but accepts four arguments.
882 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
883 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
884 REPORT_LOCATION_ARGS(RExC_parse)); \
887 #define vFAIL4(m,a1,a2,a3) STMT_START { \
889 Simple_vFAIL4(m, a1, a2, a3); \
892 /* A specialized version of vFAIL2 that works with UTF8f */
893 #define vFAIL2utf8f(m, a1) STMT_START { \
895 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
896 REPORT_LOCATION_ARGS(RExC_parse)); \
899 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
901 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
902 REPORT_LOCATION_ARGS(RExC_parse)); \
905 /* Setting this to NULL is a signal to not output warnings */
906 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
908 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
909 RExC_copy_start_in_constructed = NULL; \
911 #define RESTORE_WARNINGS \
912 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
914 /* Since a warning can be generated multiple times as the input is reparsed, we
915 * output it the first time we come to that point in the parse, but suppress it
916 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
917 * generate any warnings */
918 #define TO_OUTPUT_WARNINGS(loc) \
919 ( RExC_copy_start_in_constructed \
920 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
922 /* After we've emitted a warning, we save the position in the input so we don't
924 #define UPDATE_WARNINGS_LOC(loc) \
926 if (TO_OUTPUT_WARNINGS(loc)) { \
927 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
932 /* 'warns' is the output of the packWARNx macro used in 'code' */
933 #define _WARN_HELPER(loc, warns, code) \
935 if (! RExC_copy_start_in_constructed) { \
936 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
937 " expected at '%s'", \
938 __FILE__, __LINE__, loc); \
940 if (TO_OUTPUT_WARNINGS(loc)) { \
944 UPDATE_WARNINGS_LOC(loc); \
948 /* m is not necessarily a "literal string", in this macro */
949 #define warn_non_literal_string(loc, packed_warn, m) \
950 _WARN_HELPER(loc, packed_warn, \
951 Perl_warner(aTHX_ packed_warn, \
952 "%s" REPORT_LOCATION, \
953 m, REPORT_LOCATION_ARGS(loc)))
954 #define reg_warn_non_literal_string(loc, m) \
955 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
957 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
960 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
961 Newx(format, format_size, char); \
962 my_strlcpy(format, m, format_size); \
963 my_strlcat(format, REPORT_LOCATION, format_size); \
964 SAVEFREEPV(format); \
965 _WARN_HELPER(loc, packwarn, \
966 Perl_ck_warner(aTHX_ packwarn, \
968 a1, REPORT_LOCATION_ARGS(loc))); \
971 #define ckWARNreg(loc,m) \
972 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
973 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
975 REPORT_LOCATION_ARGS(loc)))
977 #define vWARN(loc, m) \
978 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
979 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
981 REPORT_LOCATION_ARGS(loc))) \
983 #define vWARN_dep(loc, m) \
984 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
985 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
987 REPORT_LOCATION_ARGS(loc)))
989 #define ckWARNdep(loc,m) \
990 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
991 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
993 REPORT_LOCATION_ARGS(loc)))
995 #define ckWARNregdep(loc,m) \
996 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
997 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1000 REPORT_LOCATION_ARGS(loc)))
1002 #define ckWARN2reg_d(loc,m, a1) \
1003 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1004 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1005 m REPORT_LOCATION, \
1006 a1, REPORT_LOCATION_ARGS(loc)))
1008 #define ckWARN2reg(loc, m, a1) \
1009 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1010 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1011 m REPORT_LOCATION, \
1012 a1, REPORT_LOCATION_ARGS(loc)))
1014 #define vWARN3(loc, m, a1, a2) \
1015 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1016 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1017 m REPORT_LOCATION, \
1018 a1, a2, REPORT_LOCATION_ARGS(loc)))
1020 #define ckWARN3reg(loc, m, a1, a2) \
1021 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1022 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1023 m REPORT_LOCATION, \
1025 REPORT_LOCATION_ARGS(loc)))
1027 #define vWARN4(loc, m, a1, a2, a3) \
1028 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1029 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1030 m REPORT_LOCATION, \
1032 REPORT_LOCATION_ARGS(loc)))
1034 #define ckWARN4reg(loc, m, a1, a2, a3) \
1035 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1036 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1037 m REPORT_LOCATION, \
1039 REPORT_LOCATION_ARGS(loc)))
1041 #define vWARN5(loc, m, a1, a2, a3, a4) \
1042 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1043 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1044 m REPORT_LOCATION, \
1046 REPORT_LOCATION_ARGS(loc)))
1048 #define ckWARNexperimental(loc, class, m) \
1050 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1051 RExC_warned_ ## class = 1; \
1052 _WARN_HELPER(loc, packWARN(class), \
1053 Perl_ck_warner_d(aTHX_ packWARN(class), \
1054 m REPORT_LOCATION, \
1055 REPORT_LOCATION_ARGS(loc)));\
1059 /* Convert between a pointer to a node and its offset from the beginning of the
1061 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1062 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1064 /* Macros for recording node offsets. 20001227 mjd@plover.com
1065 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1066 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1067 * Element 0 holds the number n.
1068 * Position is 1 indexed.
1070 #ifndef RE_TRACK_PATTERN_OFFSETS
1071 #define Set_Node_Offset_To_R(offset,byte)
1072 #define Set_Node_Offset(node,byte)
1073 #define Set_Cur_Node_Offset
1074 #define Set_Node_Length_To_R(node,len)
1075 #define Set_Node_Length(node,len)
1076 #define Set_Node_Cur_Length(node,start)
1077 #define Node_Offset(n)
1078 #define Node_Length(n)
1079 #define Set_Node_Offset_Length(node,offset,len)
1080 #define ProgLen(ri) ri->u.proglen
1081 #define SetProgLen(ri,x) ri->u.proglen = x
1082 #define Track_Code(code)
1084 #define ProgLen(ri) ri->u.offsets[0]
1085 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1086 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
1087 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
1088 __LINE__, (int)(offset), (int)(byte))); \
1089 if((offset) < 0) { \
1090 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
1093 RExC_offsets[2*(offset)-1] = (byte); \
1097 #define Set_Node_Offset(node,byte) \
1098 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1099 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1101 #define Set_Node_Length_To_R(node,len) STMT_START { \
1102 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1103 __LINE__, (int)(node), (int)(len))); \
1105 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1108 RExC_offsets[2*(node)] = (len); \
1112 #define Set_Node_Length(node,len) \
1113 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1114 #define Set_Node_Cur_Length(node, start) \
1115 Set_Node_Length(node, RExC_parse - start)
1117 /* Get offsets and lengths */
1118 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1119 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1121 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1122 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1123 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1126 #define Track_Code(code) STMT_START { code } STMT_END
1129 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1130 #define EXPERIMENTAL_INPLACESCAN
1131 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1135 Perl_re_printf(pTHX_ const char *fmt, ...)
1139 PerlIO *f= Perl_debug_log;
1140 PERL_ARGS_ASSERT_RE_PRINTF;
1142 result = PerlIO_vprintf(f, fmt, ap);
1148 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1152 PerlIO *f= Perl_debug_log;
1153 PERL_ARGS_ASSERT_RE_INDENTF;
1154 va_start(ap, depth);
1155 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1156 result = PerlIO_vprintf(f, fmt, ap);
1160 #endif /* DEBUGGING */
1162 #define DEBUG_RExC_seen() \
1163 DEBUG_OPTIMISE_MORE_r({ \
1164 Perl_re_printf( aTHX_ "RExC_seen: "); \
1166 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1167 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1169 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1170 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1172 if (RExC_seen & REG_GPOS_SEEN) \
1173 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1175 if (RExC_seen & REG_RECURSE_SEEN) \
1176 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1178 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1179 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1181 if (RExC_seen & REG_VERBARG_SEEN) \
1182 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1184 if (RExC_seen & REG_CUTGROUP_SEEN) \
1185 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1187 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1188 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1190 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1191 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1193 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1194 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1196 Perl_re_printf( aTHX_ "\n"); \
1199 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1200 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1205 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1206 const char *close_str)
1211 Perl_re_printf( aTHX_ "%s", open_str);
1212 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1213 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1214 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1215 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1216 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1217 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1218 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1219 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1221 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1227 Perl_re_printf( aTHX_ "%s", close_str);
1232 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1233 U32 depth, int is_inf)
1235 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1237 DEBUG_OPTIMISE_MORE_r({
1240 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1244 (IV)data->pos_delta,
1248 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1250 Perl_re_printf( aTHX_
1251 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1253 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1254 is_inf ? "INF " : ""
1257 if (data->last_found) {
1259 Perl_re_printf(aTHX_
1260 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1261 SvPVX_const(data->last_found),
1263 (IV)data->last_start_min,
1264 (IV)data->last_start_max
1267 for (i = 0; i < 2; i++) {
1268 Perl_re_printf(aTHX_
1269 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1270 data->cur_is_floating == i ? "*" : "",
1271 i ? "Float" : "Fixed",
1272 SvPVX_const(data->substrs[i].str),
1273 (IV)data->substrs[i].min_offset,
1274 (IV)data->substrs[i].max_offset
1276 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1280 Perl_re_printf( aTHX_ "\n");
1286 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1287 regnode *scan, U32 depth, U32 flags)
1289 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1296 Next = regnext(scan);
1297 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1298 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1301 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1302 Next ? (REG_NODE_NUM(Next)) : 0 );
1303 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1304 Perl_re_printf( aTHX_ "\n");
1309 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1310 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1312 # define DEBUG_PEEP(str, scan, depth, flags) \
1313 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1316 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1317 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1321 /* =========================================================
1322 * BEGIN edit_distance stuff.
1324 * This calculates how many single character changes of any type are needed to
1325 * transform a string into another one. It is taken from version 3.1 of
1327 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1330 /* Our unsorted dictionary linked list. */
1331 /* Note we use UVs, not chars. */
1336 struct dictionary* next;
1338 typedef struct dictionary item;
1341 PERL_STATIC_INLINE item*
1342 push(UV key, item* curr)
1345 Newx(head, 1, item);
1353 PERL_STATIC_INLINE item*
1354 find(item* head, UV key)
1356 item* iterator = head;
1358 if (iterator->key == key){
1361 iterator = iterator->next;
1367 PERL_STATIC_INLINE item*
1368 uniquePush(item* head, UV key)
1370 item* iterator = head;
1373 if (iterator->key == key) {
1376 iterator = iterator->next;
1379 return push(key, head);
1382 PERL_STATIC_INLINE void
1383 dict_free(item* head)
1385 item* iterator = head;
1388 item* temp = iterator;
1389 iterator = iterator->next;
1396 /* End of Dictionary Stuff */
1398 /* All calculations/work are done here */
1400 S_edit_distance(const UV* src,
1402 const STRLEN x, /* length of src[] */
1403 const STRLEN y, /* length of tgt[] */
1404 const SSize_t maxDistance
1408 UV swapCount, swapScore, targetCharCount, i, j;
1410 UV score_ceil = x + y;
1412 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1414 /* intialize matrix start values */
1415 Newx(scores, ( (x + 2) * (y + 2)), UV);
1416 scores[0] = score_ceil;
1417 scores[1 * (y + 2) + 0] = score_ceil;
1418 scores[0 * (y + 2) + 1] = score_ceil;
1419 scores[1 * (y + 2) + 1] = 0;
1420 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1425 for (i=1;i<=x;i++) {
1427 head = uniquePush(head, src[i]);
1428 scores[(i+1) * (y + 2) + 1] = i;
1429 scores[(i+1) * (y + 2) + 0] = score_ceil;
1432 for (j=1;j<=y;j++) {
1435 head = uniquePush(head, tgt[j]);
1436 scores[1 * (y + 2) + (j + 1)] = j;
1437 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1440 targetCharCount = find(head, tgt[j-1])->value;
1441 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1443 if (src[i-1] != tgt[j-1]){
1444 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1448 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1452 find(head, src[i-1])->value = i;
1456 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1459 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1463 /* END of edit_distance() stuff
1464 * ========================================================= */
1466 /* Mark that we cannot extend a found fixed substring at this point.
1467 Update the longest found anchored substring or the longest found
1468 floating substrings if needed. */
1471 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1472 SSize_t *minlenp, int is_inf)
1474 const STRLEN l = CHR_SVLEN(data->last_found);
1475 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1476 const STRLEN old_l = CHR_SVLEN(longest_sv);
1477 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1479 PERL_ARGS_ASSERT_SCAN_COMMIT;
1481 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1482 const U8 i = data->cur_is_floating;
1483 SvSetMagicSV(longest_sv, data->last_found);
1484 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1487 data->substrs[0].max_offset = data->substrs[0].min_offset;
1489 data->substrs[1].max_offset =
1493 ? data->last_start_max
1494 /* temporary underflow guard for 5.32 */
1495 : data->pos_delta < 0 ? OPTIMIZE_INFTY
1496 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1498 : data->pos_min + data->pos_delta));
1501 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1502 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1503 data->substrs[i].minlenp = minlenp;
1504 data->substrs[i].lookbehind = 0;
1507 SvCUR_set(data->last_found, 0);
1509 SV * const sv = data->last_found;
1510 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1511 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1516 data->last_end = -1;
1517 data->flags &= ~SF_BEFORE_EOL;
1518 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1521 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1522 * list that describes which code points it matches */
1525 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1527 /* Set the SSC 'ssc' to match an empty string or any code point */
1529 PERL_ARGS_ASSERT_SSC_ANYTHING;
1531 assert(is_ANYOF_SYNTHETIC(ssc));
1533 /* mortalize so won't leak */
1534 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1535 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1539 S_ssc_is_anything(const regnode_ssc *ssc)
1541 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1542 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1543 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1544 * in any way, so there's no point in using it */
1549 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1551 assert(is_ANYOF_SYNTHETIC(ssc));
1553 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1557 /* See if the list consists solely of the range 0 - Infinity */
1558 invlist_iterinit(ssc->invlist);
1559 ret = invlist_iternext(ssc->invlist, &start, &end)
1563 invlist_iterfinish(ssc->invlist);
1569 /* If e.g., both \w and \W are set, matches everything */
1570 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1572 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1573 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1583 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1585 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1586 * string, any code point, or any posix class under locale */
1588 PERL_ARGS_ASSERT_SSC_INIT;
1590 Zero(ssc, 1, regnode_ssc);
1591 set_ANYOF_SYNTHETIC(ssc);
1592 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1595 /* If any portion of the regex is to operate under locale rules that aren't
1596 * fully known at compile time, initialization includes it. The reason
1597 * this isn't done for all regexes is that the optimizer was written under
1598 * the assumption that locale was all-or-nothing. Given the complexity and
1599 * lack of documentation in the optimizer, and that there are inadequate
1600 * test cases for locale, many parts of it may not work properly, it is
1601 * safest to avoid locale unless necessary. */
1602 if (RExC_contains_locale) {
1603 ANYOF_POSIXL_SETALL(ssc);
1606 ANYOF_POSIXL_ZERO(ssc);
1611 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1612 const regnode_ssc *ssc)
1614 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1615 * to the list of code points matched, and locale posix classes; hence does
1616 * not check its flags) */
1621 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1623 assert(is_ANYOF_SYNTHETIC(ssc));
1625 invlist_iterinit(ssc->invlist);
1626 ret = invlist_iternext(ssc->invlist, &start, &end)
1630 invlist_iterfinish(ssc->invlist);
1636 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1643 #define INVLIST_INDEX 0
1644 #define ONLY_LOCALE_MATCHES_INDEX 1
1645 #define DEFERRED_USER_DEFINED_INDEX 2
1648 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1649 const regnode_charclass* const node)
1651 /* Returns a mortal inversion list defining which code points are matched
1652 * by 'node', which is of type ANYOF. Handles complementing the result if
1653 * appropriate. If some code points aren't knowable at this time, the
1654 * returned list must, and will, contain every code point that is a
1658 SV* only_utf8_locale_invlist = NULL;
1660 const U32 n = ARG(node);
1661 bool new_node_has_latin1 = FALSE;
1662 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1664 : ANYOF_FLAGS(node);
1666 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1668 /* Look at the data structure created by S_set_ANYOF_arg() */
1669 if (n != ANYOF_ONLY_HAS_BITMAP) {
1670 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1671 AV * const av = MUTABLE_AV(SvRV(rv));
1672 SV **const ary = AvARRAY(av);
1673 assert(RExC_rxi->data->what[n] == 's');
1675 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1677 /* Here there are things that won't be known until runtime -- we
1678 * have to assume it could be anything */
1679 invlist = sv_2mortal(_new_invlist(1));
1680 return _add_range_to_invlist(invlist, 0, UV_MAX);
1682 else if (ary[INVLIST_INDEX]) {
1684 /* Use the node's inversion list */
1685 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1688 /* Get the code points valid only under UTF-8 locales */
1689 if ( (flags & ANYOFL_FOLD)
1690 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1692 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1697 invlist = sv_2mortal(_new_invlist(0));
1700 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1701 * code points, and an inversion list for the others, but if there are code
1702 * points that should match only conditionally on the target string being
1703 * UTF-8, those are placed in the inversion list, and not the bitmap.
1704 * Since there are circumstances under which they could match, they are
1705 * included in the SSC. But if the ANYOF node is to be inverted, we have
1706 * to exclude them here, so that when we invert below, the end result
1707 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1708 * have to do this here before we add the unconditionally matched code
1710 if (flags & ANYOF_INVERT) {
1711 _invlist_intersection_complement_2nd(invlist,
1716 /* Add in the points from the bit map */
1717 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1718 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1719 if (ANYOF_BITMAP_TEST(node, i)) {
1720 unsigned int start = i++;
1722 for (; i < NUM_ANYOF_CODE_POINTS
1723 && ANYOF_BITMAP_TEST(node, i); ++i)
1727 invlist = _add_range_to_invlist(invlist, start, i-1);
1728 new_node_has_latin1 = TRUE;
1733 /* If this can match all upper Latin1 code points, have to add them
1734 * as well. But don't add them if inverting, as when that gets done below,
1735 * it would exclude all these characters, including the ones it shouldn't
1736 * that were added just above */
1737 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1738 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1740 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1743 /* Similarly for these */
1744 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1745 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1748 if (flags & ANYOF_INVERT) {
1749 _invlist_invert(invlist);
1751 else if (flags & ANYOFL_FOLD) {
1752 if (new_node_has_latin1) {
1754 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1755 * the locale. We can skip this if there are no 0-255 at all. */
1756 _invlist_union(invlist, PL_Latin1, &invlist);
1758 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1759 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1762 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1763 invlist = add_cp_to_invlist(invlist, 'I');
1765 if (_invlist_contains_cp(invlist,
1766 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1768 invlist = add_cp_to_invlist(invlist, 'i');
1773 /* Similarly add the UTF-8 locale possible matches. These have to be
1774 * deferred until after the non-UTF-8 locale ones are taken care of just
1775 * above, or it leads to wrong results under ANYOF_INVERT */
1776 if (only_utf8_locale_invlist) {
1777 _invlist_union_maybe_complement_2nd(invlist,
1778 only_utf8_locale_invlist,
1779 flags & ANYOF_INVERT,
1786 /* These two functions currently do the exact same thing */
1787 #define ssc_init_zero ssc_init
1789 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1790 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1792 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1793 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1794 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1797 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1798 const regnode_charclass *and_with)
1800 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1801 * another SSC or a regular ANYOF class. Can create false positives. */
1804 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1806 : ANYOF_FLAGS(and_with);
1809 PERL_ARGS_ASSERT_SSC_AND;
1811 assert(is_ANYOF_SYNTHETIC(ssc));
1813 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1814 * the code point inversion list and just the relevant flags */
1815 if (is_ANYOF_SYNTHETIC(and_with)) {
1816 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1817 anded_flags = and_with_flags;
1819 /* XXX This is a kludge around what appears to be deficiencies in the
1820 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1821 * there are paths through the optimizer where it doesn't get weeded
1822 * out when it should. And if we don't make some extra provision for
1823 * it like the code just below, it doesn't get added when it should.
1824 * This solution is to add it only when AND'ing, which is here, and
1825 * only when what is being AND'ed is the pristine, original node
1826 * matching anything. Thus it is like adding it to ssc_anything() but
1827 * only when the result is to be AND'ed. Probably the same solution
1828 * could be adopted for the same problem we have with /l matching,
1829 * which is solved differently in S_ssc_init(), and that would lead to
1830 * fewer false positives than that solution has. But if this solution
1831 * creates bugs, the consequences are only that a warning isn't raised
1832 * that should be; while the consequences for having /l bugs is
1833 * incorrect matches */
1834 if (ssc_is_anything((regnode_ssc *)and_with)) {
1835 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1839 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1840 if (OP(and_with) == ANYOFD) {
1841 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1844 anded_flags = and_with_flags
1845 &( ANYOF_COMMON_FLAGS
1846 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1847 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1848 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1850 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1855 ANYOF_FLAGS(ssc) &= anded_flags;
1857 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1858 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1859 * 'and_with' may be inverted. When not inverted, we have the situation of
1861 * (C1 | P1) & (C2 | P2)
1862 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1863 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1864 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1865 * <= ((C1 & C2) | P1 | P2)
1866 * Alternatively, the last few steps could be:
1867 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1868 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1869 * <= (C1 | C2 | (P1 & P2))
1870 * We favor the second approach if either P1 or P2 is non-empty. This is
1871 * because these components are a barrier to doing optimizations, as what
1872 * they match cannot be known until the moment of matching as they are
1873 * dependent on the current locale, 'AND"ing them likely will reduce or
1875 * But we can do better if we know that C1,P1 are in their initial state (a
1876 * frequent occurrence), each matching everything:
1877 * (<everything>) & (C2 | P2) = C2 | P2
1878 * Similarly, if C2,P2 are in their initial state (again a frequent
1879 * occurrence), the result is a no-op
1880 * (C1 | P1) & (<everything>) = C1 | P1
1883 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1884 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1885 * <= (C1 & ~C2) | (P1 & ~P2)
1888 if ((and_with_flags & ANYOF_INVERT)
1889 && ! is_ANYOF_SYNTHETIC(and_with))
1893 ssc_intersection(ssc,
1895 FALSE /* Has already been inverted */
1898 /* If either P1 or P2 is empty, the intersection will be also; can skip
1900 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1901 ANYOF_POSIXL_ZERO(ssc);
1903 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1905 /* Note that the Posix class component P from 'and_with' actually
1907 * P = Pa | Pb | ... | Pn
1908 * where each component is one posix class, such as in [\w\s].
1910 * ~P = ~(Pa | Pb | ... | Pn)
1911 * = ~Pa & ~Pb & ... & ~Pn
1912 * <= ~Pa | ~Pb | ... | ~Pn
1913 * The last is something we can easily calculate, but unfortunately
1914 * is likely to have many false positives. We could do better
1915 * in some (but certainly not all) instances if two classes in
1916 * P have known relationships. For example
1917 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1919 * :lower: & :print: = :lower:
1920 * And similarly for classes that must be disjoint. For example,
1921 * since \s and \w can have no elements in common based on rules in
1922 * the POSIX standard,
1923 * \w & ^\S = nothing
1924 * Unfortunately, some vendor locales do not meet the Posix
1925 * standard, in particular almost everything by Microsoft.
1926 * The loop below just changes e.g., \w into \W and vice versa */
1928 regnode_charclass_posixl temp;
1929 int add = 1; /* To calculate the index of the complement */
1931 Zero(&temp, 1, regnode_charclass_posixl);
1932 ANYOF_POSIXL_ZERO(&temp);
1933 for (i = 0; i < ANYOF_MAX; i++) {
1935 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1936 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1938 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1939 ANYOF_POSIXL_SET(&temp, i + add);
1941 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1943 ANYOF_POSIXL_AND(&temp, ssc);
1945 } /* else ssc already has no posixes */
1946 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1947 in its initial state */
1948 else if (! is_ANYOF_SYNTHETIC(and_with)
1949 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1951 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1952 * copy it over 'ssc' */
1953 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1954 if (is_ANYOF_SYNTHETIC(and_with)) {
1955 StructCopy(and_with, ssc, regnode_ssc);
1958 ssc->invlist = anded_cp_list;
1959 ANYOF_POSIXL_ZERO(ssc);
1960 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1961 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1965 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1966 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1968 /* One or the other of P1, P2 is non-empty. */
1969 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1970 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1972 ssc_union(ssc, anded_cp_list, FALSE);
1974 else { /* P1 = P2 = empty */
1975 ssc_intersection(ssc, anded_cp_list, FALSE);
1981 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1982 const regnode_charclass *or_with)
1984 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1985 * another SSC or a regular ANYOF class. Can create false positives if
1986 * 'or_with' is to be inverted. */
1990 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1992 : ANYOF_FLAGS(or_with);
1994 PERL_ARGS_ASSERT_SSC_OR;
1996 assert(is_ANYOF_SYNTHETIC(ssc));
1998 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1999 * the code point inversion list and just the relevant flags */
2000 if (is_ANYOF_SYNTHETIC(or_with)) {
2001 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2002 ored_flags = or_with_flags;
2005 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2006 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2007 if (OP(or_with) != ANYOFD) {
2010 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2011 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2012 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2014 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2019 ANYOF_FLAGS(ssc) |= ored_flags;
2021 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2022 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2023 * 'or_with' may be inverted. When not inverted, we have the simple
2024 * situation of computing:
2025 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2026 * If P1|P2 yields a situation with both a class and its complement are
2027 * set, like having both \w and \W, this matches all code points, and we
2028 * can delete these from the P component of the ssc going forward. XXX We
2029 * might be able to delete all the P components, but I (khw) am not certain
2030 * about this, and it is better to be safe.
2033 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2034 * <= (C1 | P1) | ~C2
2035 * <= (C1 | ~C2) | P1
2036 * (which results in actually simpler code than the non-inverted case)
2039 if ((or_with_flags & ANYOF_INVERT)
2040 && ! is_ANYOF_SYNTHETIC(or_with))
2042 /* We ignore P2, leaving P1 going forward */
2043 } /* else Not inverted */
2044 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2045 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2046 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2048 for (i = 0; i < ANYOF_MAX; i += 2) {
2049 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2051 ssc_match_all_cp(ssc);
2052 ANYOF_POSIXL_CLEAR(ssc, i);
2053 ANYOF_POSIXL_CLEAR(ssc, i+1);
2061 FALSE /* Already has been inverted */
2066 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2068 PERL_ARGS_ASSERT_SSC_UNION;
2070 assert(is_ANYOF_SYNTHETIC(ssc));
2072 _invlist_union_maybe_complement_2nd(ssc->invlist,
2079 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2081 const bool invert2nd)
2083 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2085 assert(is_ANYOF_SYNTHETIC(ssc));
2087 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2094 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2096 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2098 assert(is_ANYOF_SYNTHETIC(ssc));
2100 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2104 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2106 /* AND just the single code point 'cp' into the SSC 'ssc' */
2108 SV* cp_list = _new_invlist(2);
2110 PERL_ARGS_ASSERT_SSC_CP_AND;
2112 assert(is_ANYOF_SYNTHETIC(ssc));
2114 cp_list = add_cp_to_invlist(cp_list, cp);
2115 ssc_intersection(ssc, cp_list,
2116 FALSE /* Not inverted */
2118 SvREFCNT_dec_NN(cp_list);
2122 S_ssc_clear_locale(regnode_ssc *ssc)
2124 /* Set the SSC 'ssc' to not match any locale things */
2125 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2127 assert(is_ANYOF_SYNTHETIC(ssc));
2129 ANYOF_POSIXL_ZERO(ssc);
2130 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2133 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2136 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2138 /* The synthetic start class is used to hopefully quickly winnow down
2139 * places where a pattern could start a match in the target string. If it
2140 * doesn't really narrow things down that much, there isn't much point to
2141 * having the overhead of using it. This function uses some very crude
2142 * heuristics to decide if to use the ssc or not.
2144 * It returns TRUE if 'ssc' rules out more than half what it considers to
2145 * be the "likely" possible matches, but of course it doesn't know what the
2146 * actual things being matched are going to be; these are only guesses
2148 * For /l matches, it assumes that the only likely matches are going to be
2149 * in the 0-255 range, uniformly distributed, so half of that is 127
2150 * For /a and /d matches, it assumes that the likely matches will be just
2151 * the ASCII range, so half of that is 63
2152 * For /u and there isn't anything matching above the Latin1 range, it
2153 * assumes that that is the only range likely to be matched, and uses
2154 * half that as the cut-off: 127. If anything matches above Latin1,
2155 * it assumes that all of Unicode could match (uniformly), except for
2156 * non-Unicode code points and things in the General Category "Other"
2157 * (unassigned, private use, surrogates, controls and formats). This
2158 * is a much large number. */
2160 U32 count = 0; /* Running total of number of code points matched by
2162 UV start, end; /* Start and end points of current range in inversion
2163 XXX outdated. UTF-8 locales are common, what about invert? list */
2164 const U32 max_code_points = (LOC)
2166 : (( ! UNI_SEMANTICS
2167 || invlist_highest(ssc->invlist) < 256)
2170 const U32 max_match = max_code_points / 2;
2172 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2174 invlist_iterinit(ssc->invlist);
2175 while (invlist_iternext(ssc->invlist, &start, &end)) {
2176 if (start >= max_code_points) {
2179 end = MIN(end, max_code_points - 1);
2180 count += end - start + 1;
2181 if (count >= max_match) {
2182 invlist_iterfinish(ssc->invlist);
2192 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2194 /* The inversion list in the SSC is marked mortal; now we need a more
2195 * permanent copy, which is stored the same way that is done in a regular
2196 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2199 SV* invlist = invlist_clone(ssc->invlist, NULL);
2201 PERL_ARGS_ASSERT_SSC_FINALIZE;
2203 assert(is_ANYOF_SYNTHETIC(ssc));
2205 /* The code in this file assumes that all but these flags aren't relevant
2206 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2207 * by the time we reach here */
2208 assert(! (ANYOF_FLAGS(ssc)
2209 & ~( ANYOF_COMMON_FLAGS
2210 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2211 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2213 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2215 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2216 SvREFCNT_dec(invlist);
2218 /* Make sure is clone-safe */
2219 ssc->invlist = NULL;
2221 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2222 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2223 OP(ssc) = ANYOFPOSIXL;
2225 else if (RExC_contains_locale) {
2229 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2232 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2233 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2234 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2235 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2236 ? (TRIE_LIST_CUR( idx ) - 1) \
2242 dump_trie(trie,widecharmap,revcharmap)
2243 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2244 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2246 These routines dump out a trie in a somewhat readable format.
2247 The _interim_ variants are used for debugging the interim
2248 tables that are used to generate the final compressed
2249 representation which is what dump_trie expects.
2251 Part of the reason for their existence is to provide a form
2252 of documentation as to how the different representations function.
2257 Dumps the final compressed table form of the trie to Perl_debug_log.
2258 Used for debugging make_trie().
2262 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2263 AV *revcharmap, U32 depth)
2266 SV *sv=sv_newmortal();
2267 int colwidth= widecharmap ? 6 : 4;
2269 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2271 PERL_ARGS_ASSERT_DUMP_TRIE;
2273 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2274 depth+1, "Match","Base","Ofs" );
2276 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2277 SV ** const tmp = av_fetch( revcharmap, state, 0);
2279 Perl_re_printf( aTHX_ "%*s",
2281 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2282 PL_colors[0], PL_colors[1],
2283 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2284 PERL_PV_ESCAPE_FIRSTCHAR
2289 Perl_re_printf( aTHX_ "\n");
2290 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2292 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2293 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2294 Perl_re_printf( aTHX_ "\n");
2296 for( state = 1 ; state < trie->statecount ; state++ ) {
2297 const U32 base = trie->states[ state ].trans.base;
2299 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2301 if ( trie->states[ state ].wordnum ) {
2302 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2304 Perl_re_printf( aTHX_ "%6s", "" );
2307 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2312 while( ( base + ofs < trie->uniquecharcount ) ||
2313 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2314 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2318 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2320 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2321 if ( ( base + ofs >= trie->uniquecharcount )
2322 && ( base + ofs - trie->uniquecharcount
2324 && trie->trans[ base + ofs
2325 - trie->uniquecharcount ].check == state )
2327 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2328 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2331 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2335 Perl_re_printf( aTHX_ "]");
2338 Perl_re_printf( aTHX_ "\n" );
2340 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2342 for (word=1; word <= trie->wordcount; word++) {
2343 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2344 (int)word, (int)(trie->wordinfo[word].prev),
2345 (int)(trie->wordinfo[word].len));
2347 Perl_re_printf( aTHX_ "\n" );
2350 Dumps a fully constructed but uncompressed trie in list form.
2351 List tries normally only are used for construction when the number of
2352 possible chars (trie->uniquecharcount) is very high.
2353 Used for debugging make_trie().
2356 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2357 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2361 SV *sv=sv_newmortal();
2362 int colwidth= widecharmap ? 6 : 4;
2363 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2365 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2367 /* print out the table precompression. */
2368 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2370 Perl_re_indentf( aTHX_ "%s",
2371 depth+1, "------:-----+-----------------\n" );
2373 for( state=1 ; state < next_alloc ; state ++ ) {
2376 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2377 depth+1, (UV)state );
2378 if ( ! trie->states[ state ].wordnum ) {
2379 Perl_re_printf( aTHX_ "%5s| ","");
2381 Perl_re_printf( aTHX_ "W%4x| ",
2382 trie->states[ state ].wordnum
2385 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2386 SV ** const tmp = av_fetch( revcharmap,
2387 TRIE_LIST_ITEM(state, charid).forid, 0);
2389 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2391 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2393 PL_colors[0], PL_colors[1],
2394 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2395 | PERL_PV_ESCAPE_FIRSTCHAR
2397 TRIE_LIST_ITEM(state, charid).forid,
2398 (UV)TRIE_LIST_ITEM(state, charid).newstate
2401 Perl_re_printf( aTHX_ "\n%*s| ",
2402 (int)((depth * 2) + 14), "");
2405 Perl_re_printf( aTHX_ "\n");
2410 Dumps a fully constructed but uncompressed trie in table form.
2411 This is the normal DFA style state transition table, with a few
2412 twists to facilitate compression later.
2413 Used for debugging make_trie().
2416 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2417 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2422 SV *sv=sv_newmortal();
2423 int colwidth= widecharmap ? 6 : 4;
2424 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2426 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2429 print out the table precompression so that we can do a visual check
2430 that they are identical.
2433 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2435 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2436 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2438 Perl_re_printf( aTHX_ "%*s",
2440 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2441 PL_colors[0], PL_colors[1],
2442 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2443 PERL_PV_ESCAPE_FIRSTCHAR
2449 Perl_re_printf( aTHX_ "\n");
2450 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2452 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2453 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2456 Perl_re_printf( aTHX_ "\n" );
2458 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2460 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2462 (UV)TRIE_NODENUM( state ) );
2464 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2465 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2467 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2469 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2471 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2472 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2473 (UV)trie->trans[ state ].check );
2475 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2476 (UV)trie->trans[ state ].check,
2477 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2485 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2486 startbranch: the first branch in the whole branch sequence
2487 first : start branch of sequence of branch-exact nodes.
2488 May be the same as startbranch
2489 last : Thing following the last branch.
2490 May be the same as tail.
2491 tail : item following the branch sequence
2492 count : words in the sequence
2493 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2494 depth : indent depth
2496 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2498 A trie is an N'ary tree where the branches are determined by digital
2499 decomposition of the key. IE, at the root node you look up the 1st character and
2500 follow that branch repeat until you find the end of the branches. Nodes can be
2501 marked as "accepting" meaning they represent a complete word. Eg:
2505 would convert into the following structure. Numbers represent states, letters
2506 following numbers represent valid transitions on the letter from that state, if
2507 the number is in square brackets it represents an accepting state, otherwise it
2508 will be in parenthesis.
2510 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2514 (1) +-i->(6)-+-s->[7]
2516 +-s->(3)-+-h->(4)-+-e->[5]
2518 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2520 This shows that when matching against the string 'hers' we will begin at state 1
2521 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2522 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2523 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2524 single traverse. We store a mapping from accepting to state to which word was
2525 matched, and then when we have multiple possibilities we try to complete the
2526 rest of the regex in the order in which they occurred in the alternation.
2528 The only prior NFA like behaviour that would be changed by the TRIE support is
2529 the silent ignoring of duplicate alternations which are of the form:
2531 / (DUPE|DUPE) X? (?{ ... }) Y /x
2533 Thus EVAL blocks following a trie may be called a different number of times with
2534 and without the optimisation. With the optimisations dupes will be silently
2535 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2536 the following demonstrates:
2538 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2540 which prints out 'word' three times, but
2542 'words'=~/(word|word|word)(?{ print $1 })S/
2544 which doesnt print it out at all. This is due to other optimisations kicking in.
2546 Example of what happens on a structural level:
2548 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2550 1: CURLYM[1] {1,32767}(18)
2561 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2562 and should turn into:
2564 1: CURLYM[1] {1,32767}(18)
2566 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2574 Cases where tail != last would be like /(?foo|bar)baz/:
2584 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2585 and would end up looking like:
2588 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2595 d = uvchr_to_utf8_flags(d, uv, 0);
2597 is the recommended Unicode-aware way of saying
2602 #define TRIE_STORE_REVCHAR(val) \
2605 SV *zlopp = newSV(UTF8_MAXBYTES); \
2606 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2607 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2609 SvCUR_set(zlopp, kapow - flrbbbbb); \
2612 av_push(revcharmap, zlopp); \
2614 char ooooff = (char)val; \
2615 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2619 /* This gets the next character from the input, folding it if not already
2621 #define TRIE_READ_CHAR STMT_START { \
2624 /* if it is UTF then it is either already folded, or does not need \
2626 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2628 else if (folder == PL_fold_latin1) { \
2629 /* This folder implies Unicode rules, which in the range expressible \
2630 * by not UTF is the lower case, with the two exceptions, one of \
2631 * which should have been taken care of before calling this */ \
2632 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2633 uvc = toLOWER_L1(*uc); \
2634 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2637 /* raw data, will be folded later if needed */ \
2645 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2646 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2647 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2648 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2649 TRIE_LIST_LEN( state ) = ging; \
2651 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2652 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2653 TRIE_LIST_CUR( state )++; \
2656 #define TRIE_LIST_NEW(state) STMT_START { \
2657 Newx( trie->states[ state ].trans.list, \
2658 4, reg_trie_trans_le ); \
2659 TRIE_LIST_CUR( state ) = 1; \
2660 TRIE_LIST_LEN( state ) = 4; \
2663 #define TRIE_HANDLE_WORD(state) STMT_START { \
2664 U16 dupe= trie->states[ state ].wordnum; \
2665 regnode * const noper_next = regnext( noper ); \
2668 /* store the word for dumping */ \
2670 if (OP(noper) != NOTHING) \
2671 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2673 tmp = newSVpvn_utf8( "", 0, UTF ); \
2674 av_push( trie_words, tmp ); \
2678 trie->wordinfo[curword].prev = 0; \
2679 trie->wordinfo[curword].len = wordlen; \
2680 trie->wordinfo[curword].accept = state; \
2682 if ( noper_next < tail ) { \
2684 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2686 trie->jump[curword] = (U16)(noper_next - convert); \
2688 jumper = noper_next; \
2690 nextbranch= regnext(cur); \
2694 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2695 /* chain, so that when the bits of chain are later */\
2696 /* linked together, the dups appear in the chain */\
2697 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2698 trie->wordinfo[dupe].prev = curword; \
2700 /* we haven't inserted this word yet. */ \
2701 trie->states[ state ].wordnum = curword; \
2706 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2707 ( ( base + charid >= ucharcount \
2708 && base + charid < ubound \
2709 && state == trie->trans[ base - ucharcount + charid ].check \
2710 && trie->trans[ base - ucharcount + charid ].next ) \
2711 ? trie->trans[ base - ucharcount + charid ].next \
2712 : ( state==1 ? special : 0 ) \
2715 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2717 TRIE_BITMAP_SET(trie, uvc); \
2718 /* store the folded codepoint */ \
2720 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2723 /* store first byte of utf8 representation of */ \
2724 /* variant codepoints */ \
2725 if (! UVCHR_IS_INVARIANT(uvc)) { \
2726 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2731 #define MADE_JUMP_TRIE 2
2732 #define MADE_EXACT_TRIE 4
2735 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2736 regnode *first, regnode *last, regnode *tail,
2737 U32 word_count, U32 flags, U32 depth)
2739 /* first pass, loop through and scan words */
2740 reg_trie_data *trie;
2741 HV *widecharmap = NULL;
2742 AV *revcharmap = newAV();
2748 regnode *jumper = NULL;
2749 regnode *nextbranch = NULL;
2750 regnode *convert = NULL;
2751 U32 *prev_states; /* temp array mapping each state to previous one */
2752 /* we just use folder as a flag in utf8 */
2753 const U8 * folder = NULL;
2755 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2756 * which stands for one trie structure, one hash, optionally followed
2759 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2760 AV *trie_words = NULL;
2761 /* along with revcharmap, this only used during construction but both are
2762 * useful during debugging so we store them in the struct when debugging.
2765 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2766 STRLEN trie_charcount=0;
2768 SV *re_trie_maxbuff;
2769 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2771 PERL_ARGS_ASSERT_MAKE_TRIE;
2773 PERL_UNUSED_ARG(depth);
2777 case EXACT: case EXACT_REQ8: case EXACTL: break;
2781 case EXACTFLU8: folder = PL_fold_latin1; break;
2782 case EXACTF: folder = PL_fold; break;
2783 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2786 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2788 trie->startstate = 1;
2789 trie->wordcount = word_count;
2790 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2791 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2792 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2793 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2794 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2795 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2798 trie_words = newAV();
2801 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2802 assert(re_trie_maxbuff);
2803 if (!SvIOK(re_trie_maxbuff)) {
2804 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2806 DEBUG_TRIE_COMPILE_r({
2807 Perl_re_indentf( aTHX_
2808 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2810 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2811 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2814 /* Find the node we are going to overwrite */
2815 if ( first == startbranch && OP( last ) != BRANCH ) {
2816 /* whole branch chain */
2819 /* branch sub-chain */
2820 convert = NEXTOPER( first );
2823 /* -- First loop and Setup --
2825 We first traverse the branches and scan each word to determine if it
2826 contains widechars, and how many unique chars there are, this is
2827 important as we have to build a table with at least as many columns as we
2830 We use an array of integers to represent the character codes 0..255
2831 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2832 the native representation of the character value as the key and IV's for
2835 *TODO* If we keep track of how many times each character is used we can
2836 remap the columns so that the table compression later on is more
2837 efficient in terms of memory by ensuring the most common value is in the
2838 middle and the least common are on the outside. IMO this would be better
2839 than a most to least common mapping as theres a decent chance the most
2840 common letter will share a node with the least common, meaning the node
2841 will not be compressible. With a middle is most common approach the worst
2842 case is when we have the least common nodes twice.
2846 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2847 regnode *noper = NEXTOPER( cur );
2851 U32 wordlen = 0; /* required init */
2852 STRLEN minchars = 0;
2853 STRLEN maxchars = 0;
2854 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2857 if (OP(noper) == NOTHING) {
2858 /* skip past a NOTHING at the start of an alternation
2859 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2861 * If the next node is not something we are supposed to process
2862 * we will just ignore it due to the condition guarding the
2866 regnode *noper_next= regnext(noper);
2867 if (noper_next < tail)
2872 && ( OP(noper) == flags
2873 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2874 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2875 || OP(noper) == EXACTFUP))))
2877 uc= (U8*)STRING(noper);
2878 e= uc + STR_LEN(noper);
2885 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2886 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2887 regardless of encoding */
2888 if (OP( noper ) == EXACTFUP) {
2889 /* false positives are ok, so just set this */
2890 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2894 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2896 TRIE_CHARCOUNT(trie)++;
2899 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2900 * is in effect. Under /i, this character can match itself, or
2901 * anything that folds to it. If not under /i, it can match just
2902 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2903 * all fold to k, and all are single characters. But some folds
2904 * expand to more than one character, so for example LATIN SMALL
2905 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2906 * the string beginning at 'uc' is 'ffi', it could be matched by
2907 * three characters, or just by the one ligature character. (It
2908 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2909 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2910 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2911 * match.) The trie needs to know the minimum and maximum number
2912 * of characters that could match so that it can use size alone to
2913 * quickly reject many match attempts. The max is simple: it is
2914 * the number of folded characters in this branch (since a fold is
2915 * never shorter than what folds to it. */
2919 /* And the min is equal to the max if not under /i (indicated by
2920 * 'folder' being NULL), or there are no multi-character folds. If
2921 * there is a multi-character fold, the min is incremented just
2922 * once, for the character that folds to the sequence. Each
2923 * character in the sequence needs to be added to the list below of
2924 * characters in the trie, but we count only the first towards the
2925 * min number of characters needed. This is done through the
2926 * variable 'foldlen', which is returned by the macros that look
2927 * for these sequences as the number of bytes the sequence
2928 * occupies. Each time through the loop, we decrement 'foldlen' by
2929 * how many bytes the current char occupies. Only when it reaches
2930 * 0 do we increment 'minchars' or look for another multi-character
2932 if (folder == NULL) {
2935 else if (foldlen > 0) {
2936 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2941 /* See if *uc is the beginning of a multi-character fold. If
2942 * so, we decrement the length remaining to look at, to account
2943 * for the current character this iteration. (We can use 'uc'
2944 * instead of the fold returned by TRIE_READ_CHAR because the
2945 * macro is smart enough to account for any unfolded
2948 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2949 foldlen -= UTF8SKIP(uc);
2952 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2957 /* The current character (and any potential folds) should be added
2958 * to the possible matching characters for this position in this
2962 U8 folded= folder[ (U8) uvc ];
2963 if ( !trie->charmap[ folded ] ) {
2964 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2965 TRIE_STORE_REVCHAR( folded );
2968 if ( !trie->charmap[ uvc ] ) {
2969 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2970 TRIE_STORE_REVCHAR( uvc );
2973 /* store the codepoint in the bitmap, and its folded
2975 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2976 set_bit = 0; /* We've done our bit :-) */
2980 /* XXX We could come up with the list of code points that fold
2981 * to this using PL_utf8_foldclosures, except not for
2982 * multi-char folds, as there may be multiple combinations
2983 * there that could work, which needs to wait until runtime to
2984 * resolve (The comment about LIGATURE FFI above is such an
2989 widecharmap = newHV();
2991 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2994 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2996 if ( !SvTRUE( *svpp ) ) {
2997 sv_setiv( *svpp, ++trie->uniquecharcount );
2998 TRIE_STORE_REVCHAR(uvc);
3001 } /* end loop through characters in this branch of the trie */
3003 /* We take the min and max for this branch and combine to find the min
3004 * and max for all branches processed so far */
3005 if( cur == first ) {
3006 trie->minlen = minchars;
3007 trie->maxlen = maxchars;
3008 } else if (minchars < trie->minlen) {
3009 trie->minlen = minchars;
3010 } else if (maxchars > trie->maxlen) {
3011 trie->maxlen = maxchars;
3013 } /* end first pass */
3014 DEBUG_TRIE_COMPILE_r(
3015 Perl_re_indentf( aTHX_
3016 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3018 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3019 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3020 (int)trie->minlen, (int)trie->maxlen )
3024 We now know what we are dealing with in terms of unique chars and
3025 string sizes so we can calculate how much memory a naive
3026 representation using a flat table will take. If it's over a reasonable
3027 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3028 conservative but potentially much slower representation using an array
3031 At the end we convert both representations into the same compressed
3032 form that will be used in regexec.c for matching with. The latter
3033 is a form that cannot be used to construct with but has memory
3034 properties similar to the list form and access properties similar
3035 to the table form making it both suitable for fast searches and
3036 small enough that its feasable to store for the duration of a program.
3038 See the comment in the code where the compressed table is produced
3039 inplace from the flat tabe representation for an explanation of how
3040 the compression works.
3045 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3048 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3049 > SvIV(re_trie_maxbuff) )
3052 Second Pass -- Array Of Lists Representation
3054 Each state will be represented by a list of charid:state records
3055 (reg_trie_trans_le) the first such element holds the CUR and LEN
3056 points of the allocated array. (See defines above).
3058 We build the initial structure using the lists, and then convert
3059 it into the compressed table form which allows faster lookups
3060 (but cant be modified once converted).
3063 STRLEN transcount = 1;
3065 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3068 trie->states = (reg_trie_state *)
3069 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3070 sizeof(reg_trie_state) );
3074 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3076 regnode *noper = NEXTOPER( cur );
3077 U32 state = 1; /* required init */
3078 U16 charid = 0; /* sanity init */
3079 U32 wordlen = 0; /* required init */
3081 if (OP(noper) == NOTHING) {
3082 regnode *noper_next= regnext(noper);
3083 if (noper_next < tail)
3085 /* we will undo this assignment if noper does not
3086 * point at a trieable type in the else clause of
3087 * the following statement. */
3091 && ( OP(noper) == flags
3092 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3093 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3094 || OP(noper) == EXACTFUP))))
3096 const U8 *uc= (U8*)STRING(noper);
3097 const U8 *e= uc + STR_LEN(noper);
3099 for ( ; uc < e ; uc += len ) {
3104 charid = trie->charmap[ uvc ];
3106 SV** const svpp = hv_fetch( widecharmap,
3113 charid=(U16)SvIV( *svpp );
3116 /* charid is now 0 if we dont know the char read, or
3117 * nonzero if we do */
3124 if ( !trie->states[ state ].trans.list ) {
3125 TRIE_LIST_NEW( state );
3128 check <= TRIE_LIST_USED( state );
3131 if ( TRIE_LIST_ITEM( state, check ).forid
3134 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3139 newstate = next_alloc++;
3140 prev_states[newstate] = state;
3141 TRIE_LIST_PUSH( state, charid, newstate );
3146 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3150 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3151 * on a trieable type. So we need to reset noper back to point at the first regop
3152 * in the branch before we call TRIE_HANDLE_WORD()
3154 noper= NEXTOPER(cur);
3156 TRIE_HANDLE_WORD(state);
3158 } /* end second pass */
3160 /* next alloc is the NEXT state to be allocated */
3161 trie->statecount = next_alloc;
3162 trie->states = (reg_trie_state *)
3163 PerlMemShared_realloc( trie->states,
3165 * sizeof(reg_trie_state) );
3167 /* and now dump it out before we compress it */
3168 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3169 revcharmap, next_alloc,
3173 trie->trans = (reg_trie_trans *)
3174 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3181 for( state=1 ; state < next_alloc ; state ++ ) {
3185 DEBUG_TRIE_COMPILE_MORE_r(
3186 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3190 if (trie->states[state].trans.list) {
3191 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3195 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3196 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3197 if ( forid < minid ) {
3199 } else if ( forid > maxid ) {
3203 if ( transcount < tp + maxid - minid + 1) {
3205 trie->trans = (reg_trie_trans *)
3206 PerlMemShared_realloc( trie->trans,
3208 * sizeof(reg_trie_trans) );
3209 Zero( trie->trans + (transcount / 2),
3213 base = trie->uniquecharcount + tp - minid;
3214 if ( maxid == minid ) {
3216 for ( ; zp < tp ; zp++ ) {
3217 if ( ! trie->trans[ zp ].next ) {
3218 base = trie->uniquecharcount + zp - minid;
3219 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3221 trie->trans[ zp ].check = state;
3227 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3229 trie->trans[ tp ].check = state;
3234 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3235 const U32 tid = base
3236 - trie->uniquecharcount
3237 + TRIE_LIST_ITEM( state, idx ).forid;
3238 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3240 trie->trans[ tid ].check = state;
3242 tp += ( maxid - minid + 1 );
3244 Safefree(trie->states[ state ].trans.list);
3247 DEBUG_TRIE_COMPILE_MORE_r(
3248 Perl_re_printf( aTHX_ " base: %d\n",base);
3251 trie->states[ state ].trans.base=base;
3253 trie->lasttrans = tp + 1;
3257 Second Pass -- Flat Table Representation.
3259 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3260 each. We know that we will need Charcount+1 trans at most to store
3261 the data (one row per char at worst case) So we preallocate both
3262 structures assuming worst case.
3264 We then construct the trie using only the .next slots of the entry
3267 We use the .check field of the first entry of the node temporarily
3268 to make compression both faster and easier by keeping track of how
3269 many non zero fields are in the node.
3271 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3274 There are two terms at use here: state as a TRIE_NODEIDX() which is
3275 a number representing the first entry of the node, and state as a
3276 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3277 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3278 if there are 2 entrys per node. eg:
3286 The table is internally in the right hand, idx form. However as we
3287 also have to deal with the states array which is indexed by nodenum
3288 we have to use TRIE_NODENUM() to convert.
3291 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3294 trie->trans = (reg_trie_trans *)
3295 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3296 * trie->uniquecharcount + 1,
3297 sizeof(reg_trie_trans) );
3298 trie->states = (reg_trie_state *)
3299 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3300 sizeof(reg_trie_state) );
3301 next_alloc = trie->uniquecharcount + 1;
3304 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3306 regnode *noper = NEXTOPER( cur );
3308 U32 state = 1; /* required init */
3310 U16 charid = 0; /* sanity init */
3311 U32 accept_state = 0; /* sanity init */
3313 U32 wordlen = 0; /* required init */
3315 if (OP(noper) == NOTHING) {
3316 regnode *noper_next= regnext(noper);
3317 if (noper_next < tail)
3319 /* we will undo this assignment if noper does not
3320 * point at a trieable type in the else clause of
3321 * the following statement. */
3325 && ( OP(noper) == flags
3326 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3327 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3328 || OP(noper) == EXACTFUP))))
3330 const U8 *uc= (U8*)STRING(noper);
3331 const U8 *e= uc + STR_LEN(noper);
3333 for ( ; uc < e ; uc += len ) {
3338 charid = trie->charmap[ uvc ];
3340 SV* const * const svpp = hv_fetch( widecharmap,
3344 charid = svpp ? (U16)SvIV(*svpp) : 0;
3348 if ( !trie->trans[ state + charid ].next ) {
3349 trie->trans[ state + charid ].next = next_alloc;
3350 trie->trans[ state ].check++;
3351 prev_states[TRIE_NODENUM(next_alloc)]
3352 = TRIE_NODENUM(state);
3353 next_alloc += trie->uniquecharcount;
3355 state = trie->trans[ state + charid ].next;
3357 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3359 /* charid is now 0 if we dont know the char read, or
3360 * nonzero if we do */
3363 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3364 * on a trieable type. So we need to reset noper back to point at the first regop
3365 * in the branch before we call TRIE_HANDLE_WORD().
3367 noper= NEXTOPER(cur);
3369 accept_state = TRIE_NODENUM( state );
3370 TRIE_HANDLE_WORD(accept_state);
3372 } /* end second pass */
3374 /* and now dump it out before we compress it */
3375 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3377 next_alloc, depth+1));
3381 * Inplace compress the table.*
3383 For sparse data sets the table constructed by the trie algorithm will
3384 be mostly 0/FAIL transitions or to put it another way mostly empty.
3385 (Note that leaf nodes will not contain any transitions.)
3387 This algorithm compresses the tables by eliminating most such
3388 transitions, at the cost of a modest bit of extra work during lookup:
3390 - Each states[] entry contains a .base field which indicates the
3391 index in the state[] array wheres its transition data is stored.
3393 - If .base is 0 there are no valid transitions from that node.
3395 - If .base is nonzero then charid is added to it to find an entry in
3398 -If trans[states[state].base+charid].check!=state then the
3399 transition is taken to be a 0/Fail transition. Thus if there are fail
3400 transitions at the front of the node then the .base offset will point
3401 somewhere inside the previous nodes data (or maybe even into a node
3402 even earlier), but the .check field determines if the transition is
3406 The following process inplace converts the table to the compressed
3407 table: We first do not compress the root node 1,and mark all its
3408 .check pointers as 1 and set its .base pointer as 1 as well. This
3409 allows us to do a DFA construction from the compressed table later,
3410 and ensures that any .base pointers we calculate later are greater
3413 - We set 'pos' to indicate the first entry of the second node.
3415 - We then iterate over the columns of the node, finding the first and
3416 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3417 and set the .check pointers accordingly, and advance pos
3418 appropriately and repreat for the next node. Note that when we copy
3419 the next pointers we have to convert them from the original
3420 NODEIDX form to NODENUM form as the former is not valid post
3423 - If a node has no transitions used we mark its base as 0 and do not
3424 advance the pos pointer.
3426 - If a node only has one transition we use a second pointer into the
3427 structure to fill in allocated fail transitions from other states.
3428 This pointer is independent of the main pointer and scans forward
3429 looking for null transitions that are allocated to a state. When it
3430 finds one it writes the single transition into the "hole". If the
3431 pointer doesnt find one the single transition is appended as normal.
3433 - Once compressed we can Renew/realloc the structures to release the
3436 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3437 specifically Fig 3.47 and the associated pseudocode.
3441 const U32 laststate = TRIE_NODENUM( next_alloc );
3444 trie->statecount = laststate;
3446 for ( state = 1 ; state < laststate ; state++ ) {
3448 const U32 stateidx = TRIE_NODEIDX( state );
3449 const U32 o_used = trie->trans[ stateidx ].check;
3450 U32 used = trie->trans[ stateidx ].check;
3451 trie->trans[ stateidx ].check = 0;
3454 used && charid < trie->uniquecharcount;
3457 if ( flag || trie->trans[ stateidx + charid ].next ) {
3458 if ( trie->trans[ stateidx + charid ].next ) {
3460 for ( ; zp < pos ; zp++ ) {
3461 if ( ! trie->trans[ zp ].next ) {
3465 trie->states[ state ].trans.base
3467 + trie->uniquecharcount
3469 trie->trans[ zp ].next
3470 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3472 trie->trans[ zp ].check = state;
3473 if ( ++zp > pos ) pos = zp;
3480 trie->states[ state ].trans.base
3481 = pos + trie->uniquecharcount - charid ;
3483 trie->trans[ pos ].next
3484 = SAFE_TRIE_NODENUM(
3485 trie->trans[ stateidx + charid ].next );
3486 trie->trans[ pos ].check = state;
3491 trie->lasttrans = pos + 1;
3492 trie->states = (reg_trie_state *)
3493 PerlMemShared_realloc( trie->states, laststate
3494 * sizeof(reg_trie_state) );
3495 DEBUG_TRIE_COMPILE_MORE_r(
3496 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3498 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3502 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3505 } /* end table compress */
3507 DEBUG_TRIE_COMPILE_MORE_r(
3508 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3510 (UV)trie->statecount,
3511 (UV)trie->lasttrans)
3513 /* resize the trans array to remove unused space */
3514 trie->trans = (reg_trie_trans *)
3515 PerlMemShared_realloc( trie->trans, trie->lasttrans
3516 * sizeof(reg_trie_trans) );
3518 { /* Modify the program and insert the new TRIE node */
3519 U8 nodetype =(U8)(flags & 0xFF);
3523 regnode *optimize = NULL;
3524 #ifdef RE_TRACK_PATTERN_OFFSETS
3527 U32 mjd_nodelen = 0;
3528 #endif /* RE_TRACK_PATTERN_OFFSETS */
3529 #endif /* DEBUGGING */
3531 This means we convert either the first branch or the first Exact,
3532 depending on whether the thing following (in 'last') is a branch
3533 or not and whther first is the startbranch (ie is it a sub part of
3534 the alternation or is it the whole thing.)
3535 Assuming its a sub part we convert the EXACT otherwise we convert
3536 the whole branch sequence, including the first.
3538 /* Find the node we are going to overwrite */
3539 if ( first != startbranch || OP( last ) == BRANCH ) {
3540 /* branch sub-chain */
3541 NEXT_OFF( first ) = (U16)(last - first);
3542 #ifdef RE_TRACK_PATTERN_OFFSETS
3544 mjd_offset= Node_Offset((convert));
3545 mjd_nodelen= Node_Length((convert));
3548 /* whole branch chain */
3550 #ifdef RE_TRACK_PATTERN_OFFSETS
3553 const regnode *nop = NEXTOPER( convert );
3554 mjd_offset= Node_Offset((nop));
3555 mjd_nodelen= Node_Length((nop));
3559 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3561 (UV)mjd_offset, (UV)mjd_nodelen)
3564 /* But first we check to see if there is a common prefix we can
3565 split out as an EXACT and put in front of the TRIE node. */
3566 trie->startstate= 1;
3567 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3568 /* we want to find the first state that has more than
3569 * one transition, if that state is not the first state
3570 * then we have a common prefix which we can remove.
3573 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3575 I32 first_ofs = -1; /* keeps track of the ofs of the first
3576 transition, -1 means none */
3578 const U32 base = trie->states[ state ].trans.base;
3580 /* does this state terminate an alternation? */
3581 if ( trie->states[state].wordnum )
3584 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3585 if ( ( base + ofs >= trie->uniquecharcount ) &&
3586 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3587 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3589 if ( ++count > 1 ) {
3590 /* we have more than one transition */
3593 /* if this is the first state there is no common prefix
3594 * to extract, so we can exit */
3595 if ( state == 1 ) break;
3596 tmp = av_fetch( revcharmap, ofs, 0);
3597 ch = (U8*)SvPV_nolen_const( *tmp );
3599 /* if we are on count 2 then we need to initialize the
3600 * bitmap, and store the previous char if there was one
3603 /* clear the bitmap */
3604 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3606 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3609 if (first_ofs >= 0) {
3610 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3611 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3613 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3615 Perl_re_printf( aTHX_ "%s", (char*)ch)
3619 /* store the current firstchar in the bitmap */
3620 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3621 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3627 /* This state has only one transition, its transition is part
3628 * of a common prefix - we need to concatenate the char it
3629 * represents to what we have so far. */
3630 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3632 char *ch = SvPV( *tmp, len );
3634 SV *sv=sv_newmortal();
3635 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3637 (UV)state, (UV)first_ofs,
3638 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3639 PL_colors[0], PL_colors[1],
3640 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3641 PERL_PV_ESCAPE_FIRSTCHAR
3646 OP( convert ) = nodetype;
3647 str=STRING(convert);
3648 setSTR_LEN(convert, 0);
3650 assert( ( STR_LEN(convert) + len ) < 256 );
3651 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3657 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3662 trie->prefixlen = (state-1);
3664 regnode *n = convert+NODE_SZ_STR(convert);
3665 assert( NODE_SZ_STR(convert) <= U16_MAX );
3666 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3667 trie->startstate = state;
3668 trie->minlen -= (state - 1);
3669 trie->maxlen -= (state - 1);
3671 /* At least the UNICOS C compiler choked on this
3672 * being argument to DEBUG_r(), so let's just have
3675 #ifdef PERL_EXT_RE_BUILD
3681 regnode *fix = convert;
3682 U32 word = trie->wordcount;
3683 #ifdef RE_TRACK_PATTERN_OFFSETS
3686 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3687 while( ++fix < n ) {
3688 Set_Node_Offset_Length(fix, 0, 0);
3691 SV ** const tmp = av_fetch( trie_words, word, 0 );
3693 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3694 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3696 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3704 NEXT_OFF(convert) = (U16)(tail - convert);
3705 DEBUG_r(optimize= n);
3711 if ( trie->maxlen ) {
3712 NEXT_OFF( convert ) = (U16)(tail - convert);
3713 ARG_SET( convert, data_slot );
3714 /* Store the offset to the first unabsorbed branch in
3715 jump[0], which is otherwise unused by the jump logic.
3716 We use this when dumping a trie and during optimisation. */
3718 trie->jump[0] = (U16)(nextbranch - convert);
3720 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3721 * and there is a bitmap
3722 * and the first "jump target" node we found leaves enough room
3723 * then convert the TRIE node into a TRIEC node, with the bitmap
3724 * embedded inline in the opcode - this is hypothetically faster.
3726 if ( !trie->states[trie->startstate].wordnum
3728 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3730 OP( convert ) = TRIEC;
3731 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3732 PerlMemShared_free(trie->bitmap);
3735 OP( convert ) = TRIE;
3737 /* store the type in the flags */
3738 convert->flags = nodetype;
3742 + regarglen[ OP( convert ) ];
3744 /* XXX We really should free up the resource in trie now,
3745 as we won't use them - (which resources?) dmq */
3747 /* needed for dumping*/
3748 DEBUG_r(if (optimize) {
3749 regnode *opt = convert;
3751 while ( ++opt < optimize) {
3752 Set_Node_Offset_Length(opt, 0, 0);
3755 Try to clean up some of the debris left after the
3758 while( optimize < jumper ) {
3759 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3760 OP( optimize ) = OPTIMIZED;
3761 Set_Node_Offset_Length(optimize, 0, 0);
3764 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3766 } /* end node insert */
3768 /* Finish populating the prev field of the wordinfo array. Walk back
3769 * from each accept state until we find another accept state, and if
3770 * so, point the first word's .prev field at the second word. If the
3771 * second already has a .prev field set, stop now. This will be the
3772 * case either if we've already processed that word's accept state,
3773 * or that state had multiple words, and the overspill words were
3774 * already linked up earlier.
3781 for (word=1; word <= trie->wordcount; word++) {
3783 if (trie->wordinfo[word].prev)
3785 state = trie->wordinfo[word].accept;
3787 state = prev_states[state];
3790 prev = trie->states[state].wordnum;
3794 trie->wordinfo[word].prev = prev;
3796 Safefree(prev_states);
3800 /* and now dump out the compressed format */
3801 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3803 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3805 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3806 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3808 SvREFCNT_dec_NN(revcharmap);
3812 : trie->startstate>1
3818 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3820 /* The Trie is constructed and compressed now so we can build a fail array if
3823 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3825 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3829 We find the fail state for each state in the trie, this state is the longest
3830 proper suffix of the current state's 'word' that is also a proper prefix of
3831 another word in our trie. State 1 represents the word '' and is thus the
3832 default fail state. This allows the DFA not to have to restart after its
3833 tried and failed a word at a given point, it simply continues as though it
3834 had been matching the other word in the first place.
3836 'abcdgu'=~/abcdefg|cdgu/
3837 When we get to 'd' we are still matching the first word, we would encounter
3838 'g' which would fail, which would bring us to the state representing 'd' in
3839 the second word where we would try 'g' and succeed, proceeding to match
3842 /* add a fail transition */
3843 const U32 trie_offset = ARG(source);
3844 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3846 const U32 ucharcount = trie->uniquecharcount;
3847 const U32 numstates = trie->statecount;
3848 const U32 ubound = trie->lasttrans + ucharcount;
3852 U32 base = trie->states[ 1 ].trans.base;
3855 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3857 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3859 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3860 PERL_UNUSED_CONTEXT;
3862 PERL_UNUSED_ARG(depth);
3865 if ( OP(source) == TRIE ) {
3866 struct regnode_1 *op = (struct regnode_1 *)
3867 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3868 StructCopy(source, op, struct regnode_1);
3869 stclass = (regnode *)op;
3871 struct regnode_charclass *op = (struct regnode_charclass *)
3872 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3873 StructCopy(source, op, struct regnode_charclass);
3874 stclass = (regnode *)op;
3876 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3878 ARG_SET( stclass, data_slot );
3879 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3880 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3881 aho->trie=trie_offset;
3882 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3883 Copy( trie->states, aho->states, numstates, reg_trie_state );
3884 Newx( q, numstates, U32);
3885 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3888 /* initialize fail[0..1] to be 1 so that we always have
3889 a valid final fail state */
3890 fail[ 0 ] = fail[ 1 ] = 1;
3892 for ( charid = 0; charid < ucharcount ; charid++ ) {
3893 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3895 q[ q_write ] = newstate;
3896 /* set to point at the root */
3897 fail[ q[ q_write++ ] ]=1;
3900 while ( q_read < q_write) {
3901 const U32 cur = q[ q_read++ % numstates ];
3902 base = trie->states[ cur ].trans.base;
3904 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3905 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3907 U32 fail_state = cur;
3910 fail_state = fail[ fail_state ];
3911 fail_base = aho->states[ fail_state ].trans.base;
3912 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3914 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3915 fail[ ch_state ] = fail_state;
3916 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3918 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3920 q[ q_write++ % numstates] = ch_state;
3924 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3925 when we fail in state 1, this allows us to use the
3926 charclass scan to find a valid start char. This is based on the principle
3927 that theres a good chance the string being searched contains lots of stuff
3928 that cant be a start char.
3930 fail[ 0 ] = fail[ 1 ] = 0;
3931 DEBUG_TRIE_COMPILE_r({
3932 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3933 depth, (UV)numstates
3935 for( q_read=1; q_read<numstates; q_read++ ) {
3936 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3938 Perl_re_printf( aTHX_ "\n");
3941 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3946 /* The below joins as many adjacent EXACTish nodes as possible into a single
3947 * one. The regop may be changed if the node(s) contain certain sequences that
3948 * require special handling. The joining is only done if:
3949 * 1) there is room in the current conglomerated node to entirely contain the
3951 * 2) they are compatible node types
3953 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3954 * these get optimized out
3956 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3957 * as possible, even if that means splitting an existing node so that its first
3958 * part is moved to the preceeding node. This would maximise the efficiency of
3959 * memEQ during matching.
3961 * If a node is to match under /i (folded), the number of characters it matches
3962 * can be different than its character length if it contains a multi-character
3963 * fold. *min_subtract is set to the total delta number of characters of the
3966 * And *unfolded_multi_char is set to indicate whether or not the node contains
3967 * an unfolded multi-char fold. This happens when it won't be known until
3968 * runtime whether the fold is valid or not; namely
3969 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3970 * target string being matched against turns out to be UTF-8 is that fold
3972 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3974 * (Multi-char folds whose components are all above the Latin1 range are not
3975 * run-time locale dependent, and have already been folded by the time this
3976 * function is called.)
3978 * This is as good a place as any to discuss the design of handling these
3979 * multi-character fold sequences. It's been wrong in Perl for a very long
3980 * time. There are three code points in Unicode whose multi-character folds
3981 * were long ago discovered to mess things up. The previous designs for
3982 * dealing with these involved assigning a special node for them. This
3983 * approach doesn't always work, as evidenced by this example:
3984 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3985 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3986 * would match just the \xDF, it won't be able to handle the case where a
3987 * successful match would have to cross the node's boundary. The new approach
3988 * that hopefully generally solves the problem generates an EXACTFUP node
3989 * that is "sss" in this case.
3991 * It turns out that there are problems with all multi-character folds, and not
3992 * just these three. Now the code is general, for all such cases. The
3993 * approach taken is:
3994 * 1) This routine examines each EXACTFish node that could contain multi-
3995 * character folded sequences. Since a single character can fold into
3996 * such a sequence, the minimum match length for this node is less than
3997 * the number of characters in the node. This routine returns in
3998 * *min_subtract how many characters to subtract from the actual
3999 * length of the string to get a real minimum match length; it is 0 if
4000 * there are no multi-char foldeds. This delta is used by the caller to
4001 * adjust the min length of the match, and the delta between min and max,
4002 * so that the optimizer doesn't reject these possibilities based on size
4005 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4006 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4007 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4008 * EXACTFU nodes. The node type of such nodes is then changed to
4009 * EXACTFUP, indicating it is problematic, and needs careful handling.
4010 * (The procedures in step 1) above are sufficient to handle this case in
4011 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4012 * the only case where there is a possible fold length change in non-UTF-8
4013 * patterns. By reserving a special node type for problematic cases, the
4014 * far more common regular EXACTFU nodes can be processed faster.
4015 * regexec.c takes advantage of this.
4017 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4018 * problematic cases. These all only occur when the pattern is not
4019 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4020 * length change, it handles the situation where the string cannot be
4021 * entirely folded. The strings in an EXACTFish node are folded as much
4022 * as possible during compilation in regcomp.c. This saves effort in
4023 * regex matching. By using an EXACTFUP node when it is not possible to
4024 * fully fold at compile time, regexec.c can know that everything in an
4025 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4026 * case where folding in EXACTFU nodes can't be done at compile time is
4027 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4028 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4029 * handle two very different cases. Alternatively, there could have been
4030 * a node type where there are length changes, one for unfolded, and one
4031 * for both. If yet another special case needed to be created, the number
4032 * of required node types would have to go to 7. khw figures that even
4033 * though there are plenty of node types to spare, that the maintenance
4034 * cost wasn't worth the small speedup of doing it that way, especially
4035 * since he thinks the MICRO SIGN is rarely encountered in practice.
4037 * There are other cases where folding isn't done at compile time, but
4038 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4039 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4040 * changes. Some folds in EXACTF depend on if the runtime target string
4041 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4042 * when no fold in it depends on the UTF-8ness of the target string.)
4044 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4045 * validity of the fold won't be known until runtime, and so must remain
4046 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4047 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4048 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4049 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4050 * The reason this is a problem is that the optimizer part of regexec.c
4051 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4052 * that a character in the pattern corresponds to at most a single
4053 * character in the target string. (And I do mean character, and not byte
4054 * here, unlike other parts of the documentation that have never been
4055 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4056 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4057 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4058 * EXACTFL nodes, violate the assumption, and they are the only instances
4059 * where it is violated. I'm reluctant to try to change the assumption,
4060 * as the code involved is impenetrable to me (khw), so instead the code
4061 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4062 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4063 * boolean indicating whether or not the node contains such a fold. When
4064 * it is true, the caller sets a flag that later causes the optimizer in
4065 * this file to not set values for the floating and fixed string lengths,
4066 * and thus avoids the optimizer code in regexec.c that makes the invalid
4067 * assumption. Thus, there is no optimization based on string lengths for
4068 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4069 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4070 * assumption is wrong only in these cases is that all other non-UTF-8
4071 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4072 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4073 * EXACTF nodes because we don't know at compile time if it actually
4074 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4075 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4076 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4077 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4078 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4079 * string would require the pattern to be forced into UTF-8, the overhead
4080 * of which we want to avoid. Similarly the unfolded multi-char folds in
4081 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4084 * Similarly, the code that generates tries doesn't currently handle
4085 * not-already-folded multi-char folds, and it looks like a pain to change
4086 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4087 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4088 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4089 * using /iaa matching will be doing so almost entirely with ASCII
4090 * strings, so this should rarely be encountered in practice */
4093 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4094 UV *min_subtract, bool *unfolded_multi_char,
4095 U32 flags, regnode *val, U32 depth)
4097 /* Merge several consecutive EXACTish nodes into one. */
4099 regnode *n = regnext(scan);
4101 regnode *next = scan + NODE_SZ_STR(scan);
4105 regnode *stop = scan;
4106 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4108 PERL_UNUSED_ARG(depth);
4111 PERL_ARGS_ASSERT_JOIN_EXACT;
4112 #ifndef EXPERIMENTAL_INPLACESCAN
4113 PERL_UNUSED_ARG(flags);
4114 PERL_UNUSED_ARG(val);
4116 DEBUG_PEEP("join", scan, depth, 0);
4118 assert(PL_regkind[OP(scan)] == EXACT);
4120 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4121 * EXACT ones that are mergeable to the current one. */
4123 && ( PL_regkind[OP(n)] == NOTHING
4124 || (stringok && PL_regkind[OP(n)] == EXACT))
4126 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4129 if (OP(n) == TAIL || n > next)
4131 if (PL_regkind[OP(n)] == NOTHING) {
4132 DEBUG_PEEP("skip:", n, depth, 0);
4133 NEXT_OFF(scan) += NEXT_OFF(n);
4134 next = n + NODE_STEP_REGNODE;
4141 else if (stringok) {
4142 const unsigned int oldl = STR_LEN(scan);
4143 regnode * const nnext = regnext(n);
4145 /* XXX I (khw) kind of doubt that this works on platforms (should
4146 * Perl ever run on one) where U8_MAX is above 255 because of lots
4147 * of other assumptions */
4148 /* Don't join if the sum can't fit into a single node */
4149 if (oldl + STR_LEN(n) > U8_MAX)
4152 /* Joining something that requires UTF-8 with something that
4153 * doesn't, means the result requires UTF-8. */
4154 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4155 OP(scan) = EXACT_REQ8;
4157 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4158 ; /* join is compatible, no need to change OP */
4160 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4161 OP(scan) = EXACTFU_REQ8;
4163 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4164 ; /* join is compatible, no need to change OP */
4166 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4167 ; /* join is compatible, no need to change OP */
4169 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4171 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4172 * which can join with EXACTFU ones. We check for this case
4173 * here. These need to be resolved to either EXACTFU or
4174 * EXACTF at joining time. They have nothing in them that
4175 * would forbid them from being the more desirable EXACTFU
4176 * nodes except that they begin and/or end with a single [Ss].
4177 * The reason this is problematic is because they could be
4178 * joined in this loop with an adjacent node that ends and/or
4179 * begins with [Ss] which would then form the sequence 'ss',
4180 * which matches differently under /di than /ui, in which case
4181 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4182 * formed, the nodes get absorbed into any adjacent EXACTFU
4183 * node. And if the only adjacent node is EXACTF, they get
4184 * absorbed into that, under the theory that a longer node is
4185 * better than two shorter ones, even if one is EXACTFU. Note
4186 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4187 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4189 if (STRING(n)[STR_LEN(n)-1] == 's') {
4191 /* Here the joined node would end with 's'. If the node
4192 * following the combination is an EXACTF one, it's better to
4193 * join this trailing edge 's' node with that one, leaving the
4194 * current one in 'scan' be the more desirable EXACTFU */
4195 if (OP(nnext) == EXACTF) {
4199 OP(scan) = EXACTFU_S_EDGE;
4201 } /* Otherwise, the beginning 's' of the 2nd node just
4202 becomes an interior 's' in 'scan' */
4204 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4205 ; /* join is compatible, no need to change OP */
4207 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4209 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4210 * nodes. But the latter nodes can be also joined with EXACTFU
4211 * ones, and that is a better outcome, so if the node following
4212 * 'n' is EXACTFU, quit now so that those two can be joined
4214 if (OP(nnext) == EXACTFU) {
4218 /* The join is compatible, and the combined node will be
4219 * EXACTF. (These don't care if they begin or end with 's' */
4221 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4222 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4223 && STRING(n)[0] == 's')
4225 /* When combined, we have the sequence 'ss', which means we
4226 * have to remain /di */
4230 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4231 if (STRING(n)[0] == 's') {
4232 ; /* Here the join is compatible and the combined node
4233 starts with 's', no need to change OP */
4235 else { /* Now the trailing 's' is in the interior */
4239 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4241 /* The join is compatible, and the combined node will be
4242 * EXACTF. (These don't care if they begin or end with 's' */
4245 else if (OP(scan) != OP(n)) {
4247 /* The only other compatible joinings are the same node type */
4251 DEBUG_PEEP("merg", n, depth, 0);
4254 NEXT_OFF(scan) += NEXT_OFF(n);
4255 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4256 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4257 next = n + NODE_SZ_STR(n);
4258 /* Now we can overwrite *n : */
4259 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4267 #ifdef EXPERIMENTAL_INPLACESCAN
4268 if (flags && !NEXT_OFF(n)) {
4269 DEBUG_PEEP("atch", val, depth, 0);
4270 if (reg_off_by_arg[OP(n)]) {
4271 ARG_SET(n, val - n);
4274 NEXT_OFF(n) = val - n;
4281 /* This temporary node can now be turned into EXACTFU, and must, as
4282 * regexec.c doesn't handle it */
4283 if (OP(scan) == EXACTFU_S_EDGE) {
4288 *unfolded_multi_char = FALSE;
4290 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4291 * can now analyze for sequences of problematic code points. (Prior to
4292 * this final joining, sequences could have been split over boundaries, and
4293 * hence missed). The sequences only happen in folding, hence for any
4294 * non-EXACT EXACTish node */
4295 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4296 U8* s0 = (U8*) STRING(scan);
4298 U8* s_end = s0 + STR_LEN(scan);
4300 int total_count_delta = 0; /* Total delta number of characters that
4301 multi-char folds expand to */
4303 /* One pass is made over the node's string looking for all the
4304 * possibilities. To avoid some tests in the loop, there are two main
4305 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4310 if (OP(scan) == EXACTFL) {
4313 /* An EXACTFL node would already have been changed to another
4314 * node type unless there is at least one character in it that
4315 * is problematic; likely a character whose fold definition
4316 * won't be known until runtime, and so has yet to be folded.
4317 * For all but the UTF-8 locale, folds are 1-1 in length, but
4318 * to handle the UTF-8 case, we need to create a temporary
4319 * folded copy using UTF-8 locale rules in order to analyze it.
4320 * This is because our macros that look to see if a sequence is
4321 * a multi-char fold assume everything is folded (otherwise the
4322 * tests in those macros would be too complicated and slow).
4323 * Note that here, the non-problematic folds will have already
4324 * been done, so we can just copy such characters. We actually
4325 * don't completely fold the EXACTFL string. We skip the
4326 * unfolded multi-char folds, as that would just create work
4327 * below to figure out the size they already are */
4329 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4332 STRLEN s_len = UTF8SKIP(s);
4333 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4334 Copy(s, d, s_len, U8);
4337 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4338 *unfolded_multi_char = TRUE;
4339 Copy(s, d, s_len, U8);
4342 else if (isASCII(*s)) {
4343 *(d++) = toFOLD(*s);
4347 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4353 /* Point the remainder of the routine to look at our temporary
4357 } /* End of creating folded copy of EXACTFL string */
4359 /* Examine the string for a multi-character fold sequence. UTF-8
4360 * patterns have all characters pre-folded by the time this code is
4362 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4363 length sequence we are looking for is 2 */
4365 int count = 0; /* How many characters in a multi-char fold */
4366 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4367 if (! len) { /* Not a multi-char fold: get next char */
4372 { /* Here is a generic multi-char fold. */
4373 U8* multi_end = s + len;
4375 /* Count how many characters are in it. In the case of
4376 * /aa, no folds which contain ASCII code points are
4377 * allowed, so check for those, and skip if found. */
4378 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4379 count = utf8_length(s, multi_end);
4383 while (s < multi_end) {
4386 goto next_iteration;
4396 /* The delta is how long the sequence is minus 1 (1 is how long
4397 * the character that folds to the sequence is) */
4398 total_count_delta += count - 1;
4402 /* We created a temporary folded copy of the string in EXACTFL
4403 * nodes. Therefore we need to be sure it doesn't go below zero,
4404 * as the real string could be shorter */
4405 if (OP(scan) == EXACTFL) {
4406 int total_chars = utf8_length((U8*) STRING(scan),
4407 (U8*) STRING(scan) + STR_LEN(scan));
4408 if (total_count_delta > total_chars) {
4409 total_count_delta = total_chars;
4413 *min_subtract += total_count_delta;
4416 else if (OP(scan) == EXACTFAA) {
4418 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4419 * fold to the ASCII range (and there are no existing ones in the
4420 * upper latin1 range). But, as outlined in the comments preceding
4421 * this function, we need to flag any occurrences of the sharp s.
4422 * This character forbids trie formation (because of added
4424 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4425 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4426 || UNICODE_DOT_DOT_VERSION > 0)
4428 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4429 OP(scan) = EXACTFAA_NO_TRIE;
4430 *unfolded_multi_char = TRUE;
4436 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4438 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4439 * folds that are all Latin1. As explained in the comments
4440 * preceding this function, we look also for the sharp s in EXACTF
4441 * and EXACTFL nodes; it can be in the final position. Otherwise
4442 * we can stop looking 1 byte earlier because have to find at least
4443 * two characters for a multi-fold */
4444 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4449 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4450 if (! len) { /* Not a multi-char fold. */
4451 if (*s == LATIN_SMALL_LETTER_SHARP_S
4452 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4454 *unfolded_multi_char = TRUE;
4461 && isALPHA_FOLD_EQ(*s, 's')
4462 && isALPHA_FOLD_EQ(*(s+1), 's'))
4465 /* EXACTF nodes need to know that the minimum length
4466 * changed so that a sharp s in the string can match this
4467 * ss in the pattern, but they remain EXACTF nodes, as they
4468 * won't match this unless the target string is in UTF-8,
4469 * which we don't know until runtime. EXACTFL nodes can't
4470 * transform into EXACTFU nodes */
4471 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4472 OP(scan) = EXACTFUP;
4476 *min_subtract += len - 1;
4484 /* Allow dumping but overwriting the collection of skipped
4485 * ops and/or strings with fake optimized ops */
4486 n = scan + NODE_SZ_STR(scan);
4494 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4498 /* REx optimizer. Converts nodes into quicker variants "in place".
4499 Finds fixed substrings. */
4501 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4502 to the position after last scanned or to NULL. */
4504 #define INIT_AND_WITHP \
4505 assert(!and_withp); \
4506 Newx(and_withp, 1, regnode_ssc); \
4507 SAVEFREEPV(and_withp)
4511 S_unwind_scan_frames(pTHX_ const void *p)
4513 scan_frame *f= (scan_frame *)p;
4515 scan_frame *n= f->next_frame;
4521 /* Follow the next-chain of the current node and optimize away
4522 all the NOTHINGs from it.
4525 S_rck_elide_nothing(pTHX_ regnode *node)
4527 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4529 if (OP(node) != CURLYX) {
4530 const int max = (reg_off_by_arg[OP(node)]
4532 /* I32 may be smaller than U16 on CRAYs! */
4533 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4534 int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4538 /* Skip NOTHING and LONGJMP. */
4542 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4543 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4549 if (reg_off_by_arg[OP(node)])
4552 NEXT_OFF(node) = off;
4557 /* the return from this sub is the minimum length that could possibly match */
4559 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4560 SSize_t *minlenp, SSize_t *deltap,
4565 regnode_ssc *and_withp,
4566 U32 flags, U32 depth, bool was_mutate_ok)
4567 /* scanp: Start here (read-write). */
4568 /* deltap: Write maxlen-minlen here. */
4569 /* last: Stop before this one. */
4570 /* data: string data about the pattern */
4571 /* stopparen: treat close N as END */
4572 /* recursed: which subroutines have we recursed into */
4573 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4575 SSize_t final_minlen;
4576 /* There must be at least this number of characters to match */
4579 regnode *scan = *scanp, *next;
4581 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4582 int is_inf_internal = 0; /* The studied chunk is infinite */
4583 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4584 scan_data_t data_fake;
4585 SV *re_trie_maxbuff = NULL;
4586 regnode *first_non_open = scan;
4587 SSize_t stopmin = OPTIMIZE_INFTY;
4588 scan_frame *frame = NULL;
4589 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4591 PERL_ARGS_ASSERT_STUDY_CHUNK;
4592 RExC_study_started= 1;
4594 Zero(&data_fake, 1, scan_data_t);
4597 while (first_non_open && OP(first_non_open) == OPEN)
4598 first_non_open=regnext(first_non_open);
4604 RExC_study_chunk_recursed_count++;
4606 DEBUG_OPTIMISE_MORE_r(
4608 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4609 depth, (long)stopparen,
4610 (unsigned long)RExC_study_chunk_recursed_count,
4611 (unsigned long)depth, (unsigned long)recursed_depth,
4614 if (recursed_depth) {
4617 for ( j = 0 ; j < recursed_depth ; j++ ) {
4618 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4619 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4620 Perl_re_printf( aTHX_ " %d",(int)i);
4624 if ( j + 1 < recursed_depth ) {
4625 Perl_re_printf( aTHX_ ",");
4629 Perl_re_printf( aTHX_ "\n");
4632 while ( scan && OP(scan) != END && scan < last ){
4633 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4634 node length to get a real minimum (because
4635 the folded version may be shorter) */
4636 bool unfolded_multi_char = FALSE;
4637 /* avoid mutating ops if we are anywhere within the recursed or
4638 * enframed handling for a GOSUB: the outermost level will handle it.
4640 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4641 /* Peephole optimizer: */
4642 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4643 DEBUG_PEEP("Peep", scan, depth, flags);
4646 /* The reason we do this here is that we need to deal with things like
4647 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4648 * parsing code, as each (?:..) is handled by a different invocation of
4651 if (PL_regkind[OP(scan)] == EXACT
4652 && OP(scan) != LEXACT
4653 && OP(scan) != LEXACT_REQ8
4656 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4657 0, NULL, depth + 1);
4660 /* Follow the next-chain of the current node and optimize
4661 away all the NOTHINGs from it.
4663 rck_elide_nothing(scan);
4665 /* The principal pseudo-switch. Cannot be a switch, since we look into
4666 * several different things. */
4667 if ( OP(scan) == DEFINEP ) {
4669 SSize_t deltanext = 0;
4670 SSize_t fake_last_close = 0;
4671 I32 f = SCF_IN_DEFINE;
4673 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4674 scan = regnext(scan);
4675 assert( OP(scan) == IFTHEN );
4676 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4678 data_fake.last_closep= &fake_last_close;
4680 next = regnext(scan);
4681 scan = NEXTOPER(NEXTOPER(scan));
4682 DEBUG_PEEP("scan", scan, depth, flags);
4683 DEBUG_PEEP("next", next, depth, flags);
4685 /* we suppose the run is continuous, last=next...
4686 * NOTE we dont use the return here! */
4687 /* DEFINEP study_chunk() recursion */
4688 (void)study_chunk(pRExC_state, &scan, &minlen,
4689 &deltanext, next, &data_fake, stopparen,
4690 recursed_depth, NULL, f, depth+1, mutate_ok);
4695 OP(scan) == BRANCH ||
4696 OP(scan) == BRANCHJ ||
4699 next = regnext(scan);
4702 /* The op(next)==code check below is to see if we
4703 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4704 * IFTHEN is special as it might not appear in pairs.
4705 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4706 * we dont handle it cleanly. */
4707 if (OP(next) == code || code == IFTHEN) {
4708 /* NOTE - There is similar code to this block below for
4709 * handling TRIE nodes on a re-study. If you change stuff here
4710 * check there too. */
4711 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4713 regnode * const startbranch=scan;
4715 if (flags & SCF_DO_SUBSTR) {
4716 /* Cannot merge strings after this. */
4717 scan_commit(pRExC_state, data, minlenp, is_inf);
4720 if (flags & SCF_DO_STCLASS)
4721 ssc_init_zero(pRExC_state, &accum);
4723 while (OP(scan) == code) {
4724 SSize_t deltanext, minnext, fake;
4726 regnode_ssc this_class;
4728 DEBUG_PEEP("Branch", scan, depth, flags);
4731 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4733 data_fake.whilem_c = data->whilem_c;
4734 data_fake.last_closep = data->last_closep;
4737 data_fake.last_closep = &fake;
4739 data_fake.pos_delta = delta;
4740 next = regnext(scan);
4742 scan = NEXTOPER(scan); /* everything */
4743 if (code != BRANCH) /* everything but BRANCH */
4744 scan = NEXTOPER(scan);
4746 if (flags & SCF_DO_STCLASS) {
4747 ssc_init(pRExC_state, &this_class);
4748 data_fake.start_class = &this_class;
4749 f = SCF_DO_STCLASS_AND;
4751 if (flags & SCF_WHILEM_VISITED_POS)
4752 f |= SCF_WHILEM_VISITED_POS;
4754 /* we suppose the run is continuous, last=next...*/
4755 /* recurse study_chunk() for each BRANCH in an alternation */
4756 minnext = study_chunk(pRExC_state, &scan, minlenp,
4757 &deltanext, next, &data_fake, stopparen,
4758 recursed_depth, NULL, f, depth+1,
4763 if (deltanext == OPTIMIZE_INFTY) {
4764 is_inf = is_inf_internal = 1;
4765 max1 = OPTIMIZE_INFTY;
4766 } else if (max1 < minnext + deltanext)
4767 max1 = minnext + deltanext;
4769 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4771 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4772 if ( stopmin > minnext)
4773 stopmin = min + min1;
4774 flags &= ~SCF_DO_SUBSTR;
4776 data->flags |= SCF_SEEN_ACCEPT;
4779 if (data_fake.flags & SF_HAS_EVAL)
4780 data->flags |= SF_HAS_EVAL;
4781 data->whilem_c = data_fake.whilem_c;
4783 if (flags & SCF_DO_STCLASS)
4784 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4786 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4788 if (flags & SCF_DO_SUBSTR) {
4789 data->pos_min += min1;
4790 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4791 data->pos_delta = OPTIMIZE_INFTY;
4793 data->pos_delta += max1 - min1;
4794 if (max1 != min1 || is_inf)
4795 data->cur_is_floating = 1;
4798 if (delta == OPTIMIZE_INFTY
4799 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4800 delta = OPTIMIZE_INFTY;
4802 delta += max1 - min1;
4803 if (flags & SCF_DO_STCLASS_OR) {
4804 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4806 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4807 flags &= ~SCF_DO_STCLASS;
4810 else if (flags & SCF_DO_STCLASS_AND) {
4812 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4813 flags &= ~SCF_DO_STCLASS;
4816 /* Switch to OR mode: cache the old value of
4817 * data->start_class */
4819 StructCopy(data->start_class, and_withp, regnode_ssc);
4820 flags &= ~SCF_DO_STCLASS_AND;
4821 StructCopy(&accum, data->start_class, regnode_ssc);
4822 flags |= SCF_DO_STCLASS_OR;
4826 if (PERL_ENABLE_TRIE_OPTIMISATION
4827 && OP(startbranch) == BRANCH
4832 Assuming this was/is a branch we are dealing with: 'scan'
4833 now points at the item that follows the branch sequence,
4834 whatever it is. We now start at the beginning of the
4835 sequence and look for subsequences of
4841 which would be constructed from a pattern like
4844 If we can find such a subsequence we need to turn the first
4845 element into a trie and then add the subsequent branch exact
4846 strings to the trie.
4850 1. patterns where the whole set of branches can be
4853 2. patterns where only a subset can be converted.
4855 In case 1 we can replace the whole set with a single regop
4856 for the trie. In case 2 we need to keep the start and end
4859 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4860 becomes BRANCH TRIE; BRANCH X;
4862 There is an additional case, that being where there is a
4863 common prefix, which gets split out into an EXACT like node
4864 preceding the TRIE node.
4866 If x(1..n)==tail then we can do a simple trie, if not we make
4867 a "jump" trie, such that when we match the appropriate word
4868 we "jump" to the appropriate tail node. Essentially we turn
4869 a nested if into a case structure of sorts.
4874 if (!re_trie_maxbuff) {
4875 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4876 if (!SvIOK(re_trie_maxbuff))
4877 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4879 if ( SvIV(re_trie_maxbuff)>=0 ) {
4881 regnode *first = (regnode *)NULL;
4882 regnode *prev = (regnode *)NULL;
4883 regnode *tail = scan;
4887 /* var tail is used because there may be a TAIL
4888 regop in the way. Ie, the exacts will point to the
4889 thing following the TAIL, but the last branch will
4890 point at the TAIL. So we advance tail. If we
4891 have nested (?:) we may have to move through several
4895 while ( OP( tail ) == TAIL ) {
4896 /* this is the TAIL generated by (?:) */
4897 tail = regnext( tail );
4901 DEBUG_TRIE_COMPILE_r({
4902 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4903 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4905 "Looking for TRIE'able sequences. Tail node is ",
4906 (UV) REGNODE_OFFSET(tail),
4907 SvPV_nolen_const( RExC_mysv )
4913 Step through the branches
4914 cur represents each branch,
4915 noper is the first thing to be matched as part
4917 noper_next is the regnext() of that node.
4919 We normally handle a case like this
4920 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4921 support building with NOJUMPTRIE, which restricts
4922 the trie logic to structures like /FOO|BAR/.
4924 If noper is a trieable nodetype then the branch is
4925 a possible optimization target. If we are building
4926 under NOJUMPTRIE then we require that noper_next is
4927 the same as scan (our current position in the regex
4930 Once we have two or more consecutive such branches
4931 we can create a trie of the EXACT's contents and
4932 stitch it in place into the program.
4934 If the sequence represents all of the branches in
4935 the alternation we replace the entire thing with a
4938 Otherwise when it is a subsequence we need to
4939 stitch it in place and replace only the relevant
4940 branches. This means the first branch has to remain
4941 as it is used by the alternation logic, and its
4942 next pointer, and needs to be repointed at the item
4943 on the branch chain following the last branch we
4944 have optimized away.
4946 This could be either a BRANCH, in which case the
4947 subsequence is internal, or it could be the item
4948 following the branch sequence in which case the
4949 subsequence is at the end (which does not
4950 necessarily mean the first node is the start of the
4953 TRIE_TYPE(X) is a define which maps the optype to a
4957 ----------------+-----------
4962 EXACTFU_REQ8 | EXACTFU
4966 EXACTFLU8 | EXACTFLU8
4970 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4972 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4974 : ( EXACTFU == (X) \
4975 || EXACTFU_REQ8 == (X) \
4976 || EXACTFUP == (X) ) \
4978 : ( EXACTFAA == (X) ) \
4980 : ( EXACTL == (X) ) \
4982 : ( EXACTFLU8 == (X) ) \
4986 /* dont use tail as the end marker for this traverse */
4987 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4988 regnode * const noper = NEXTOPER( cur );
4989 U8 noper_type = OP( noper );
4990 U8 noper_trietype = TRIE_TYPE( noper_type );
4991 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4992 regnode * const noper_next = regnext( noper );
4993 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4994 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4997 DEBUG_TRIE_COMPILE_r({
4998 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4999 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
5001 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5003 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5004 Perl_re_printf( aTHX_ " -> %d:%s",
5005 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5008 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5009 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5010 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5012 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5013 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5014 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5018 /* Is noper a trieable nodetype that can be merged
5019 * with the current trie (if there is one)? */
5023 ( noper_trietype == NOTHING )
5024 || ( trietype == NOTHING )
5025 || ( trietype == noper_trietype )
5028 && noper_next >= tail
5032 /* Handle mergable triable node Either we are
5033 * the first node in a new trieable sequence,
5034 * in which case we do some bookkeeping,
5035 * otherwise we update the end pointer. */
5038 if ( noper_trietype == NOTHING ) {
5039 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5040 regnode * const noper_next = regnext( noper );
5041 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5042 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5045 if ( noper_next_trietype ) {
5046 trietype = noper_next_trietype;
5047 } else if (noper_next_type) {
5048 /* a NOTHING regop is 1 regop wide.
5049 * We need at least two for a trie
5050 * so we can't merge this in */
5054 trietype = noper_trietype;
5057 if ( trietype == NOTHING )
5058 trietype = noper_trietype;
5063 } /* end handle mergable triable node */
5065 /* handle unmergable node -
5066 * noper may either be a triable node which can
5067 * not be tried together with the current trie,
5068 * or a non triable node */
5070 /* If last is set and trietype is not
5071 * NOTHING then we have found at least two
5072 * triable branch sequences in a row of a
5073 * similar trietype so we can turn them
5074 * into a trie. If/when we allow NOTHING to
5075 * start a trie sequence this condition
5076 * will be required, and it isn't expensive
5077 * so we leave it in for now. */
5078 if ( trietype && trietype != NOTHING )
5079 make_trie( pRExC_state,
5080 startbranch, first, cur, tail,
5081 count, trietype, depth+1 );
5082 prev = NULL; /* note: we clear/update
5083 first, trietype etc below,
5084 so we dont do it here */
5088 && noper_next >= tail
5091 /* noper is triable, so we can start a new
5095 trietype = noper_trietype;
5097 /* if we already saw a first but the
5098 * current node is not triable then we have
5099 * to reset the first information. */
5104 } /* end handle unmergable node */
5105 } /* loop over branches */
5106 DEBUG_TRIE_COMPILE_r({
5107 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5108 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5109 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5110 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5111 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5112 PL_reg_name[trietype]
5116 if ( prev && trietype ) {
5117 if ( trietype != NOTHING ) {
5118 /* the last branch of the sequence was part of
5119 * a trie, so we have to construct it here
5120 * outside of the loop */
5121 made= make_trie( pRExC_state, startbranch,
5122 first, scan, tail, count,
5123 trietype, depth+1 );
5124 #ifdef TRIE_STUDY_OPT
5125 if ( ((made == MADE_EXACT_TRIE &&
5126 startbranch == first)
5127 || ( first_non_open == first )) &&
5129 flags |= SCF_TRIE_RESTUDY;
5130 if ( startbranch == first
5133 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5138 /* at this point we know whatever we have is a
5139 * NOTHING sequence/branch AND if 'startbranch'
5140 * is 'first' then we can turn the whole thing
5143 if ( startbranch == first ) {
5145 /* the entire thing is a NOTHING sequence,
5146 * something like this: (?:|) So we can
5147 * turn it into a plain NOTHING op. */
5148 DEBUG_TRIE_COMPILE_r({
5149 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5150 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5152 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5155 OP(startbranch)= NOTHING;
5156 NEXT_OFF(startbranch)= tail - startbranch;
5157 for ( opt= startbranch + 1; opt < tail ; opt++ )
5161 } /* end if ( prev) */
5162 } /* TRIE_MAXBUF is non zero */
5166 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5167 scan = NEXTOPER(NEXTOPER(scan));
5168 } else /* single branch is optimized. */
5169 scan = NEXTOPER(scan);
5171 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5173 regnode *start = NULL;
5174 regnode *end = NULL;
5175 U32 my_recursed_depth= recursed_depth;
5177 if (OP(scan) != SUSPEND) { /* GOSUB */
5178 /* Do setup, note this code has side effects beyond
5179 * the rest of this block. Specifically setting
5180 * RExC_recurse[] must happen at least once during
5183 RExC_recurse[ARG2L(scan)] = scan;
5184 start = REGNODE_p(RExC_open_parens[paren]);
5185 end = REGNODE_p(RExC_close_parens[paren]);
5187 /* NOTE we MUST always execute the above code, even
5188 * if we do nothing with a GOSUB */
5190 ( flags & SCF_IN_DEFINE )
5193 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5195 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5198 /* no need to do anything here if we are in a define. */
5199 /* or we are after some kind of infinite construct
5200 * so we can skip recursing into this item.
5201 * Since it is infinite we will not change the maxlen
5202 * or delta, and if we miss something that might raise
5203 * the minlen it will merely pessimise a little.
5205 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5206 * might result in a minlen of 1 and not of 4,
5207 * but this doesn't make us mismatch, just try a bit
5208 * harder than we should.
5210 * However we must assume this GOSUB is infinite, to
5211 * avoid wrongly applying other optimizations in the
5212 * enclosing scope - see GH 18096, for example.
5214 is_inf = is_inf_internal = 1;
5215 scan= regnext(scan);
5221 || !PAREN_TEST(recursed_depth - 1, paren)
5223 /* it is quite possible that there are more efficient ways
5224 * to do this. We maintain a bitmap per level of recursion
5225 * of which patterns we have entered so we can detect if a
5226 * pattern creates a possible infinite loop. When we
5227 * recurse down a level we copy the previous levels bitmap
5228 * down. When we are at recursion level 0 we zero the top
5229 * level bitmap. It would be nice to implement a different
5230 * more efficient way of doing this. In particular the top
5231 * level bitmap may be unnecessary.
5233 if (!recursed_depth) {
5234 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5236 Copy(PAREN_OFFSET(recursed_depth - 1),
5237 PAREN_OFFSET(recursed_depth),
5238 RExC_study_chunk_recursed_bytes, U8);
5240 /* we havent recursed into this paren yet, so recurse into it */
5241 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5242 PAREN_SET(recursed_depth, paren);
5243 my_recursed_depth= recursed_depth + 1;
5245 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5246 /* some form of infinite recursion, assume infinite length
5248 if (flags & SCF_DO_SUBSTR) {
5249 scan_commit(pRExC_state, data, minlenp, is_inf);
5250 data->cur_is_floating = 1;
5252 is_inf = is_inf_internal = 1;
5253 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5254 ssc_anything(data->start_class);
5255 flags &= ~SCF_DO_STCLASS;
5257 start= NULL; /* reset start so we dont recurse later on. */
5262 end = regnext(scan);
5265 scan_frame *newframe;
5267 if (!RExC_frame_last) {
5268 Newxz(newframe, 1, scan_frame);
5269 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5270 RExC_frame_head= newframe;
5272 } else if (!RExC_frame_last->next_frame) {
5273 Newxz(newframe, 1, scan_frame);
5274 RExC_frame_last->next_frame= newframe;
5275 newframe->prev_frame= RExC_frame_last;
5278 newframe= RExC_frame_last->next_frame;
5280 RExC_frame_last= newframe;
5282 newframe->next_regnode = regnext(scan);
5283 newframe->last_regnode = last;
5284 newframe->stopparen = stopparen;
5285 newframe->prev_recursed_depth = recursed_depth;
5286 newframe->this_prev_frame= frame;
5287 newframe->in_gosub = (
5288 (frame && frame->in_gosub) || OP(scan) == GOSUB
5291 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5292 DEBUG_PEEP("fnew", scan, depth, flags);
5299 recursed_depth= my_recursed_depth;
5304 else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) {
5305 SSize_t bytelen = STR_LEN(scan), charlen;
5309 const U8 * const s = (U8*)STRING(scan);
5310 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5311 charlen = utf8_length(s, s + bytelen);
5313 uc = *((U8*)STRING(scan));
5317 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5318 /* The code below prefers earlier match for fixed
5319 offset, later match for variable offset. */
5320 if (data->last_end == -1) { /* Update the start info. */
5321 data->last_start_min = data->pos_min;
5322 data->last_start_max =
5323 is_inf ? OPTIMIZE_INFTY
5324 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5325 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5327 sv_catpvn(data->last_found, STRING(scan), bytelen);
5329 SvUTF8_on(data->last_found);
5331 SV * const sv = data->last_found;
5332 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5333 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5334 if (mg && mg->mg_len >= 0)
5335 mg->mg_len += charlen;
5337 data->last_end = data->pos_min + charlen;
5338 data->pos_min += charlen; /* As in the first entry. */
5339 data->flags &= ~SF_BEFORE_EOL;
5342 /* ANDing the code point leaves at most it, and not in locale, and
5343 * can't match null string */
5344 if (flags & SCF_DO_STCLASS_AND) {
5345 ssc_cp_and(data->start_class, uc);
5346 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5347 ssc_clear_locale(data->start_class);
5349 else if (flags & SCF_DO_STCLASS_OR) {
5350 ssc_add_cp(data->start_class, uc);
5351 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5353 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5354 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5356 flags &= ~SCF_DO_STCLASS;
5358 else if (PL_regkind[OP(scan)] == EXACT) {
5359 /* But OP != EXACT!, so is EXACTFish */
5360 SSize_t bytelen = STR_LEN(scan), charlen;
5361 const U8 * s = (U8*)STRING(scan);
5363 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5364 * with the mask set to the complement of the bit that differs
5365 * between upper and lower case, and the lowest code point of the
5366 * pair (which the '&' forces) */
5369 && ( OP(scan) == EXACTFAA
5370 || ( OP(scan) == EXACTFU
5371 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5374 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5377 ARG_SET(scan, *s & mask);
5379 /* we're not EXACTFish any more, so restudy */
5383 /* Search for fixed substrings supports EXACT only. */
5384 if (flags & SCF_DO_SUBSTR) {
5386 scan_commit(pRExC_state, data, minlenp, is_inf);
5388 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5389 if (unfolded_multi_char) {
5390 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5392 min += charlen - min_subtract;
5394 delta += min_subtract;
5395 if (flags & SCF_DO_SUBSTR) {
5396 data->pos_min += charlen - min_subtract;
5397 if (data->pos_min < 0) {
5400 data->pos_delta += min_subtract;
5402 data->cur_is_floating = 1; /* float */
5406 if (flags & SCF_DO_STCLASS) {
5407 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5409 assert(EXACTF_invlist);
5410 if (flags & SCF_DO_STCLASS_AND) {
5411 if (OP(scan) != EXACTFL)
5412 ssc_clear_locale(data->start_class);
5413 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5414 ANYOF_POSIXL_ZERO(data->start_class);
5415 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5417 else { /* SCF_DO_STCLASS_OR */
5418 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5419 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5421 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5422 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5424 flags &= ~SCF_DO_STCLASS;
5425 SvREFCNT_dec(EXACTF_invlist);
5428 else if (REGNODE_VARIES(OP(scan))) {
5429 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5430 I32 fl = 0, f = flags;
5431 regnode * const oscan = scan;
5432 regnode_ssc this_class;
5433 regnode_ssc *oclass = NULL;
5434 I32 next_is_eval = 0;
5436 switch (PL_regkind[OP(scan)]) {
5437 case WHILEM: /* End of (?:...)* . */
5438 scan = NEXTOPER(scan);
5441 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5442 next = NEXTOPER(scan);
5443 if ( ( PL_regkind[OP(next)] == EXACT
5444 && ! isEXACTFish(OP(next)))
5445 || (flags & SCF_DO_STCLASS))
5448 maxcount = REG_INFTY;
5449 next = regnext(scan);
5450 scan = NEXTOPER(scan);
5454 if (flags & SCF_DO_SUBSTR)
5456 /* This will bypass the formal 'min += minnext * mincount'
5457 * calculation in the do_curly path, so assumes min width
5458 * of the PLUS payload is exactly one. */
5462 next = NEXTOPER(scan);
5464 /* This temporary node can now be turned into EXACTFU, and
5465 * must, as regexec.c doesn't handle it */
5466 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5470 if ( STR_LEN(next) == 1
5471 && isALPHA_A(* STRING(next))
5472 && ( OP(next) == EXACTFAA
5473 || ( OP(next) == EXACTFU
5474 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5477 /* These differ in just one bit */
5478 U8 mask = ~ ('A' ^ 'a');
5480 assert(isALPHA_A(* STRING(next)));
5482 /* Then replace it by an ANYOFM node, with
5483 * the mask set to the complement of the
5484 * bit that differs between upper and lower
5485 * case, and the lowest code point of the
5486 * pair (which the '&' forces) */
5488 ARG_SET(next, *STRING(next) & mask);
5492 if (flags & SCF_DO_STCLASS) {
5494 maxcount = REG_INFTY;
5495 next = regnext(scan);
5496 scan = NEXTOPER(scan);
5499 if (flags & SCF_DO_SUBSTR) {
5500 scan_commit(pRExC_state, data, minlenp, is_inf);
5501 /* Cannot extend fixed substrings */
5502 data->cur_is_floating = 1; /* float */
5504 is_inf = is_inf_internal = 1;
5505 scan = regnext(scan);
5506 goto optimize_curly_tail;
5508 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5509 && (scan->flags == stopparen))
5514 mincount = ARG1(scan);
5515 maxcount = ARG2(scan);
5517 next = regnext(scan);
5518 if (OP(scan) == CURLYX) {
5519 I32 lp = (data ? *(data->last_closep) : 0);
5520 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5522 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5523 next_is_eval = (OP(scan) == EVAL);
5525 if (flags & SCF_DO_SUBSTR) {
5527 scan_commit(pRExC_state, data, minlenp, is_inf);
5528 /* Cannot extend fixed substrings */
5529 pos_before = data->pos_min;
5533 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5535 data->flags |= SF_IS_INF;
5537 if (flags & SCF_DO_STCLASS) {
5538 ssc_init(pRExC_state, &this_class);
5539 oclass = data->start_class;
5540 data->start_class = &this_class;
5541 f |= SCF_DO_STCLASS_AND;
5542 f &= ~SCF_DO_STCLASS_OR;
5544 /* Exclude from super-linear cache processing any {n,m}
5545 regops for which the combination of input pos and regex
5546 pos is not enough information to determine if a match
5549 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5550 regex pos at the \s*, the prospects for a match depend not
5551 only on the input position but also on how many (bar\s*)
5552 repeats into the {4,8} we are. */
5553 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5554 f &= ~SCF_WHILEM_VISITED_POS;
5556 /* This will finish on WHILEM, setting scan, or on NULL: */
5557 /* recurse study_chunk() on loop bodies */
5558 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5559 last, data, stopparen, recursed_depth, NULL,
5561 ? (f & ~SCF_DO_SUBSTR)
5563 , depth+1, mutate_ok);
5565 if (flags & SCF_DO_STCLASS)
5566 data->start_class = oclass;
5567 if (mincount == 0 || minnext == 0) {
5568 if (flags & SCF_DO_STCLASS_OR) {
5569 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5571 else if (flags & SCF_DO_STCLASS_AND) {
5572 /* Switch to OR mode: cache the old value of
5573 * data->start_class */
5575 StructCopy(data->start_class, and_withp, regnode_ssc);
5576 flags &= ~SCF_DO_STCLASS_AND;
5577 StructCopy(&this_class, data->start_class, regnode_ssc);
5578 flags |= SCF_DO_STCLASS_OR;
5579 ANYOF_FLAGS(data->start_class)
5580 |= SSC_MATCHES_EMPTY_STRING;
5582 } else { /* Non-zero len */
5583 if (flags & SCF_DO_STCLASS_OR) {
5584 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5585 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5587 else if (flags & SCF_DO_STCLASS_AND)
5588 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5589 flags &= ~SCF_DO_STCLASS;
5591 if (!scan) /* It was not CURLYX, but CURLY. */
5593 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5594 /* ? quantifier ok, except for (?{ ... }) */
5595 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5596 && (minnext == 0) && (deltanext == 0)
5597 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5598 && maxcount <= REG_INFTY/3) /* Complement check for big
5601 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5602 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5603 "Quantifier unexpected on zero-length expression "
5604 "in regex m/%" UTF8f "/",
5605 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5609 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5610 || min >= SSize_t_MAX - minnext * mincount )
5612 FAIL("Regexp out of space");
5615 min += minnext * mincount;
5616 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5617 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5618 is_inf |= is_inf_internal;
5620 delta = OPTIMIZE_INFTY;
5622 delta += (minnext + deltanext) * maxcount
5623 - minnext * mincount;
5625 /* Try powerful optimization CURLYX => CURLYN. */
5626 if ( OP(oscan) == CURLYX && data
5627 && data->flags & SF_IN_PAR
5628 && !(data->flags & SF_HAS_EVAL)
5629 && !deltanext && minnext == 1
5632 /* Try to optimize to CURLYN. */
5633 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5634 regnode * const nxt1 = nxt;
5641 if (!REGNODE_SIMPLE(OP(nxt))
5642 && !(PL_regkind[OP(nxt)] == EXACT
5643 && STR_LEN(nxt) == 1))
5649 if (OP(nxt) != CLOSE)
5651 if (RExC_open_parens) {
5654 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5657 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5659 /* Now we know that nxt2 is the only contents: */
5660 oscan->flags = (U8)ARG(nxt);
5662 OP(nxt1) = NOTHING; /* was OPEN. */
5665 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5666 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5667 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5668 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5669 OP(nxt + 1) = OPTIMIZED; /* was count. */
5670 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5675 /* Try optimization CURLYX => CURLYM. */
5676 if ( OP(oscan) == CURLYX && data
5677 && !(data->flags & SF_HAS_PAR)
5678 && !(data->flags & SF_HAS_EVAL)
5679 && !deltanext /* atom is fixed width */
5680 && minnext != 0 /* CURLYM can't handle zero width */
5681 /* Nor characters whose fold at run-time may be
5682 * multi-character */
5683 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5686 /* XXXX How to optimize if data == 0? */
5687 /* Optimize to a simpler form. */
5688 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5692 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5693 && (OP(nxt2) != WHILEM))
5695 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5696 /* Need to optimize away parenths. */
5697 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5698 /* Set the parenth number. */
5699 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5701 oscan->flags = (U8)ARG(nxt);
5702 if (RExC_open_parens) {
5704 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5707 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5710 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5711 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5714 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5715 OP(nxt + 1) = OPTIMIZED; /* was count. */
5716 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5717 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5720 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5721 regnode *nnxt = regnext(nxt1);
5723 if (reg_off_by_arg[OP(nxt1)])
5724 ARG_SET(nxt1, nxt2 - nxt1);
5725 else if (nxt2 - nxt1 < U16_MAX)
5726 NEXT_OFF(nxt1) = nxt2 - nxt1;
5728 OP(nxt) = NOTHING; /* Cannot beautify */
5733 /* Optimize again: */
5734 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5735 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5736 NULL, stopparen, recursed_depth, NULL, 0,
5737 depth+1, mutate_ok);
5742 else if ((OP(oscan) == CURLYX)
5743 && (flags & SCF_WHILEM_VISITED_POS)
5744 /* See the comment on a similar expression above.
5745 However, this time it's not a subexpression
5746 we care about, but the expression itself. */
5747 && (maxcount == REG_INFTY)
5749 /* This stays as CURLYX, we can put the count/of pair. */
5750 /* Find WHILEM (as in regexec.c) */
5751 regnode *nxt = oscan + NEXT_OFF(oscan);
5753 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5755 nxt = PREVOPER(nxt);
5756 if (nxt->flags & 0xf) {
5757 /* we've already set whilem count on this node */
5758 } else if (++data->whilem_c < 16) {
5759 assert(data->whilem_c <= RExC_whilem_seen);
5760 nxt->flags = (U8)(data->whilem_c
5761 | (RExC_whilem_seen << 4)); /* On WHILEM */
5764 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5766 if (flags & SCF_DO_SUBSTR) {
5767 SV *last_str = NULL;
5768 STRLEN last_chrs = 0;
5769 int counted = mincount != 0;
5771 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5773 SSize_t b = pos_before >= data->last_start_min
5774 ? pos_before : data->last_start_min;
5776 const char * const s = SvPV_const(data->last_found, l);
5777 SSize_t old = b - data->last_start_min;
5781 old = utf8_hop_forward((U8*)s, old,
5782 (U8 *) SvEND(data->last_found))
5785 /* Get the added string: */
5786 last_str = newSVpvn_utf8(s + old, l, UTF);
5787 last_chrs = UTF ? utf8_length((U8*)(s + old),
5788 (U8*)(s + old + l)) : l;
5789 if (deltanext == 0 && pos_before == b) {
5790 /* What was added is a constant string */
5793 SvGROW(last_str, (mincount * l) + 1);
5794 repeatcpy(SvPVX(last_str) + l,
5795 SvPVX_const(last_str), l,
5797 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5798 /* Add additional parts. */
5799 SvCUR_set(data->last_found,
5800 SvCUR(data->last_found) - l);
5801 sv_catsv(data->last_found, last_str);
5803 SV * sv = data->last_found;
5805 SvUTF8(sv) && SvMAGICAL(sv) ?
5806 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5807 if (mg && mg->mg_len >= 0)
5808 mg->mg_len += last_chrs * (mincount-1);
5810 last_chrs *= mincount;
5811 data->last_end += l * (mincount - 1);
5814 /* start offset must point into the last copy */
5815 data->last_start_min += minnext * (mincount - 1);
5816 data->last_start_max =
5819 : data->last_start_max +
5820 (maxcount - 1) * (minnext + data->pos_delta);
5823 /* It is counted once already... */
5824 data->pos_min += minnext * (mincount - counted);
5826 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5827 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5828 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5829 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5831 if (deltanext != OPTIMIZE_INFTY)
5832 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5833 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5834 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5836 if (deltanext == OPTIMIZE_INFTY
5837 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5838 data->pos_delta = OPTIMIZE_INFTY;
5840 data->pos_delta += - counted * deltanext +
5841 (minnext + deltanext) * maxcount - minnext * mincount;
5842 if (mincount != maxcount) {
5843 /* Cannot extend fixed substrings found inside
5845 scan_commit(pRExC_state, data, minlenp, is_inf);
5846 if (mincount && last_str) {
5847 SV * const sv = data->last_found;
5848 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5849 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5853 sv_setsv(sv, last_str);
5854 data->last_end = data->pos_min;
5855 data->last_start_min = data->pos_min - last_chrs;
5856 data->last_start_max = is_inf
5858 : data->pos_min + data->pos_delta - last_chrs;
5860 data->cur_is_floating = 1; /* float */
5862 SvREFCNT_dec(last_str);
5864 if (data && (fl & SF_HAS_EVAL))
5865 data->flags |= SF_HAS_EVAL;
5866 optimize_curly_tail:
5867 rck_elide_nothing(oscan);
5871 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5875 if (flags & SCF_DO_SUBSTR) {
5876 /* Cannot expect anything... */
5877 scan_commit(pRExC_state, data, minlenp, is_inf);
5878 data->cur_is_floating = 1; /* float */
5880 is_inf = is_inf_internal = 1;
5881 if (flags & SCF_DO_STCLASS_OR) {
5882 if (OP(scan) == CLUMP) {
5883 /* Actually is any start char, but very few code points
5884 * aren't start characters */
5885 ssc_match_all_cp(data->start_class);
5888 ssc_anything(data->start_class);
5891 flags &= ~SCF_DO_STCLASS;
5895 else if (OP(scan) == LNBREAK) {
5896 if (flags & SCF_DO_STCLASS) {
5897 if (flags & SCF_DO_STCLASS_AND) {
5898 ssc_intersection(data->start_class,
5899 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5900 ssc_clear_locale(data->start_class);
5901 ANYOF_FLAGS(data->start_class)
5902 &= ~SSC_MATCHES_EMPTY_STRING;
5904 else if (flags & SCF_DO_STCLASS_OR) {
5905 ssc_union(data->start_class,
5906 PL_XPosix_ptrs[_CC_VERTSPACE],
5908 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5910 /* See commit msg for
5911 * 749e076fceedeb708a624933726e7989f2302f6a */
5912 ANYOF_FLAGS(data->start_class)
5913 &= ~SSC_MATCHES_EMPTY_STRING;
5915 flags &= ~SCF_DO_STCLASS;
5918 if (delta != OPTIMIZE_INFTY)
5919 delta++; /* Because of the 2 char string cr-lf */
5920 if (flags & SCF_DO_SUBSTR) {
5921 /* Cannot expect anything... */
5922 scan_commit(pRExC_state, data, minlenp, is_inf);
5924 if (data->pos_delta != OPTIMIZE_INFTY) {
5925 data->pos_delta += 1;
5927 data->cur_is_floating = 1; /* float */
5930 else if (REGNODE_SIMPLE(OP(scan))) {
5932 if (flags & SCF_DO_SUBSTR) {
5933 scan_commit(pRExC_state, data, minlenp, is_inf);
5937 if (flags & SCF_DO_STCLASS) {
5939 SV* my_invlist = NULL;
5942 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5943 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5945 /* Some of the logic below assumes that switching
5946 locale on will only add false positives. */
5951 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5955 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5956 ssc_match_all_cp(data->start_class);
5961 SV* REG_ANY_invlist = _new_invlist(2);
5962 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5964 if (flags & SCF_DO_STCLASS_OR) {
5965 ssc_union(data->start_class,
5967 TRUE /* TRUE => invert, hence all but \n
5971 else if (flags & SCF_DO_STCLASS_AND) {
5972 ssc_intersection(data->start_class,
5974 TRUE /* TRUE => invert */
5976 ssc_clear_locale(data->start_class);
5978 SvREFCNT_dec_NN(REG_ANY_invlist);
5990 if (flags & SCF_DO_STCLASS_AND)
5991 ssc_and(pRExC_state, data->start_class,
5992 (regnode_charclass *) scan);
5994 ssc_or(pRExC_state, data->start_class,
5995 (regnode_charclass *) scan);
5998 case NANYOFM: /* NANYOFM already contains the inversion of the
5999 input ANYOF data, so, unlike things like
6000 NPOSIXA, don't change 'invert' to TRUE */
6004 SV* cp_list = get_ANYOFM_contents(scan);
6006 if (flags & SCF_DO_STCLASS_OR) {
6007 ssc_union(data->start_class, cp_list, invert);
6009 else if (flags & SCF_DO_STCLASS_AND) {
6010 ssc_intersection(data->start_class, cp_list, invert);
6013 SvREFCNT_dec_NN(cp_list);
6022 cp_list = _add_range_to_invlist(cp_list,
6024 ANYOFRbase(scan) + ANYOFRdelta(scan));
6026 if (flags & SCF_DO_STCLASS_OR) {
6027 ssc_union(data->start_class, cp_list, invert);
6029 else if (flags & SCF_DO_STCLASS_AND) {
6030 ssc_intersection(data->start_class, cp_list, invert);
6033 SvREFCNT_dec_NN(cp_list);
6042 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6043 if (flags & SCF_DO_STCLASS_AND) {
6044 bool was_there = cBOOL(
6045 ANYOF_POSIXL_TEST(data->start_class,
6047 ANYOF_POSIXL_ZERO(data->start_class);
6048 if (was_there) { /* Do an AND */
6049 ANYOF_POSIXL_SET(data->start_class, namedclass);
6051 /* No individual code points can now match */
6052 data->start_class->invlist
6053 = sv_2mortal(_new_invlist(0));
6056 int complement = namedclass + ((invert) ? -1 : 1);
6058 assert(flags & SCF_DO_STCLASS_OR);
6060 /* If the complement of this class was already there,
6061 * the result is that they match all code points,
6062 * (\d + \D == everything). Remove the classes from
6063 * future consideration. Locale is not relevant in
6065 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6066 ssc_match_all_cp(data->start_class);
6067 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6068 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6070 else { /* The usual case; just add this class to the
6072 ANYOF_POSIXL_SET(data->start_class, namedclass);
6077 case NPOSIXA: /* For these, we always know the exact set of
6082 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6083 goto join_posix_and_ascii;
6091 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6093 /* NPOSIXD matches all upper Latin1 code points unless the
6094 * target string being matched is UTF-8, which is
6095 * unknowable until match time. Since we are going to
6096 * invert, we want to get rid of all of them so that the
6097 * inversion will match all */
6098 if (OP(scan) == NPOSIXD) {
6099 _invlist_subtract(my_invlist, PL_UpperLatin1,
6103 join_posix_and_ascii:
6105 if (flags & SCF_DO_STCLASS_AND) {
6106 ssc_intersection(data->start_class, my_invlist, invert);
6107 ssc_clear_locale(data->start_class);
6110 assert(flags & SCF_DO_STCLASS_OR);
6111 ssc_union(data->start_class, my_invlist, invert);
6113 SvREFCNT_dec(my_invlist);
6115 if (flags & SCF_DO_STCLASS_OR)
6116 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6117 flags &= ~SCF_DO_STCLASS;
6120 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6121 data->flags |= (OP(scan) == MEOL
6124 scan_commit(pRExC_state, data, minlenp, is_inf);
6127 else if ( PL_regkind[OP(scan)] == BRANCHJ
6128 /* Lookbehind, or need to calculate parens/evals/stclass: */
6129 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6130 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6132 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6133 || OP(scan) == UNLESSM )
6135 /* Negative Lookahead/lookbehind
6136 In this case we can't do fixed string optimisation.
6139 SSize_t deltanext, minnext, fake = 0;
6144 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6146 data_fake.whilem_c = data->whilem_c;
6147 data_fake.last_closep = data->last_closep;
6150 data_fake.last_closep = &fake;
6151 data_fake.pos_delta = delta;
6152 if ( flags & SCF_DO_STCLASS && !scan->flags
6153 && OP(scan) == IFMATCH ) { /* Lookahead */
6154 ssc_init(pRExC_state, &intrnl);
6155 data_fake.start_class = &intrnl;
6156 f |= SCF_DO_STCLASS_AND;
6158 if (flags & SCF_WHILEM_VISITED_POS)
6159 f |= SCF_WHILEM_VISITED_POS;
6160 next = regnext(scan);
6161 nscan = NEXTOPER(NEXTOPER(scan));
6163 /* recurse study_chunk() for lookahead body */
6164 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6165 last, &data_fake, stopparen,
6166 recursed_depth, NULL, f, depth+1,
6170 || deltanext > (I32) U8_MAX
6171 || minnext > (I32)U8_MAX
6172 || minnext + deltanext > (I32)U8_MAX)
6174 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6178 /* The 'next_off' field has been repurposed to count the
6179 * additional starting positions to try beyond the initial
6180 * one. (This leaves it at 0 for non-variable length
6181 * matches to avoid breakage for those not using this
6184 scan->next_off = deltanext;
6185 ckWARNexperimental(RExC_parse,
6186 WARN_EXPERIMENTAL__VLB,
6187 "Variable length lookbehind is experimental");
6189 scan->flags = (U8)minnext + deltanext;
6192 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6194 if (data_fake.flags & SF_HAS_EVAL)
6195 data->flags |= SF_HAS_EVAL;
6196 data->whilem_c = data_fake.whilem_c;
6198 if (f & SCF_DO_STCLASS_AND) {
6199 if (flags & SCF_DO_STCLASS_OR) {
6200 /* OR before, AND after: ideally we would recurse with
6201 * data_fake to get the AND applied by study of the
6202 * remainder of the pattern, and then derecurse;
6203 * *** HACK *** for now just treat as "no information".
6204 * See [perl #56690].
6206 ssc_init(pRExC_state, data->start_class);
6208 /* AND before and after: combine and continue. These
6209 * assertions are zero-length, so can match an EMPTY
6211 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6212 ANYOF_FLAGS(data->start_class)
6213 |= SSC_MATCHES_EMPTY_STRING;
6217 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6219 /* Positive Lookahead/lookbehind
6220 In this case we can do fixed string optimisation,
6221 but we must be careful about it. Note in the case of
6222 lookbehind the positions will be offset by the minimum
6223 length of the pattern, something we won't know about
6224 until after the recurse.
6226 SSize_t deltanext, fake = 0;
6230 /* We use SAVEFREEPV so that when the full compile
6231 is finished perl will clean up the allocated
6232 minlens when it's all done. This way we don't
6233 have to worry about freeing them when we know
6234 they wont be used, which would be a pain.
6237 Newx( minnextp, 1, SSize_t );
6238 SAVEFREEPV(minnextp);
6241 StructCopy(data, &data_fake, scan_data_t);
6242 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6245 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6246 data_fake.last_found=newSVsv(data->last_found);
6250 data_fake.last_closep = &fake;
6251 data_fake.flags = 0;
6252 data_fake.substrs[0].flags = 0;
6253 data_fake.substrs[1].flags = 0;
6254 data_fake.pos_delta = delta;
6256 data_fake.flags |= SF_IS_INF;
6257 if ( flags & SCF_DO_STCLASS && !scan->flags
6258 && OP(scan) == IFMATCH ) { /* Lookahead */
6259 ssc_init(pRExC_state, &intrnl);
6260 data_fake.start_class = &intrnl;
6261 f |= SCF_DO_STCLASS_AND;
6263 if (flags & SCF_WHILEM_VISITED_POS)
6264 f |= SCF_WHILEM_VISITED_POS;
6265 next = regnext(scan);
6266 nscan = NEXTOPER(NEXTOPER(scan));
6268 /* positive lookahead study_chunk() recursion */
6269 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6270 &deltanext, last, &data_fake,
6271 stopparen, recursed_depth, NULL,
6272 f, depth+1, mutate_ok);
6274 assert(0); /* This code has never been tested since this
6275 is normally not compiled */
6277 || deltanext > (I32) U8_MAX
6278 || *minnextp > (I32)U8_MAX
6279 || *minnextp + deltanext > (I32)U8_MAX)
6281 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6286 scan->next_off = deltanext;
6288 scan->flags = (U8)*minnextp + deltanext;
6293 if (f & SCF_DO_STCLASS_AND) {
6294 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6295 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6298 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6300 if (data_fake.flags & SF_HAS_EVAL)
6301 data->flags |= SF_HAS_EVAL;
6302 data->whilem_c = data_fake.whilem_c;
6303 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6305 if (RExC_rx->minlen<*minnextp)
6306 RExC_rx->minlen=*minnextp;
6307 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6308 SvREFCNT_dec_NN(data_fake.last_found);
6310 for (i = 0; i < 2; i++) {
6311 if (data_fake.substrs[i].minlenp != minlenp) {
6312 data->substrs[i].min_offset =
6313 data_fake.substrs[i].min_offset;
6314 data->substrs[i].max_offset =
6315 data_fake.substrs[i].max_offset;
6316 data->substrs[i].minlenp =
6317 data_fake.substrs[i].minlenp;
6318 data->substrs[i].lookbehind += scan->flags;
6326 else if (OP(scan) == OPEN) {
6327 if (stopparen != (I32)ARG(scan))
6330 else if (OP(scan) == CLOSE) {
6331 if (stopparen == (I32)ARG(scan)) {
6334 if ((I32)ARG(scan) == is_par) {
6335 next = regnext(scan);
6337 if ( next && (OP(next) != WHILEM) && next < last)
6338 is_par = 0; /* Disable optimization */
6341 *(data->last_closep) = ARG(scan);
6343 else if (OP(scan) == EVAL) {
6345 data->flags |= SF_HAS_EVAL;
6347 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6348 if (flags & SCF_DO_SUBSTR) {
6349 scan_commit(pRExC_state, data, minlenp, is_inf);
6350 flags &= ~SCF_DO_SUBSTR;
6352 if (data && OP(scan)==ACCEPT) {
6353 data->flags |= SCF_SEEN_ACCEPT;
6358 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6360 if (flags & SCF_DO_SUBSTR) {
6361 scan_commit(pRExC_state, data, minlenp, is_inf);
6362 data->cur_is_floating = 1; /* float */
6364 is_inf = is_inf_internal = 1;
6365 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6366 ssc_anything(data->start_class);
6367 flags &= ~SCF_DO_STCLASS;
6369 else if (OP(scan) == GPOS) {
6370 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6371 !(delta || is_inf || (data && data->pos_delta)))
6373 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6374 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6375 if (RExC_rx->gofs < (STRLEN)min)
6376 RExC_rx->gofs = min;
6378 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6382 #ifdef TRIE_STUDY_OPT
6383 #ifdef FULL_TRIE_STUDY
6384 else if (PL_regkind[OP(scan)] == TRIE) {
6385 /* NOTE - There is similar code to this block above for handling
6386 BRANCH nodes on the initial study. If you change stuff here
6388 regnode *trie_node= scan;
6389 regnode *tail= regnext(scan);
6390 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6391 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6394 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6395 /* Cannot merge strings after this. */
6396 scan_commit(pRExC_state, data, minlenp, is_inf);
6398 if (flags & SCF_DO_STCLASS)
6399 ssc_init_zero(pRExC_state, &accum);
6405 const regnode *nextbranch= NULL;
6408 for ( word=1 ; word <= trie->wordcount ; word++)
6410 SSize_t deltanext=0, minnext=0, f = 0, fake;
6411 regnode_ssc this_class;
6413 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6415 data_fake.whilem_c = data->whilem_c;
6416 data_fake.last_closep = data->last_closep;
6419 data_fake.last_closep = &fake;
6420 data_fake.pos_delta = delta;
6421 if (flags & SCF_DO_STCLASS) {
6422 ssc_init(pRExC_state, &this_class);
6423 data_fake.start_class = &this_class;
6424 f = SCF_DO_STCLASS_AND;
6426 if (flags & SCF_WHILEM_VISITED_POS)
6427 f |= SCF_WHILEM_VISITED_POS;
6429 if (trie->jump[word]) {
6431 nextbranch = trie_node + trie->jump[0];
6432 scan= trie_node + trie->jump[word];
6433 /* We go from the jump point to the branch that follows
6434 it. Note this means we need the vestigal unused
6435 branches even though they arent otherwise used. */
6436 /* optimise study_chunk() for TRIE */
6437 minnext = study_chunk(pRExC_state, &scan, minlenp,
6438 &deltanext, (regnode *)nextbranch, &data_fake,
6439 stopparen, recursed_depth, NULL, f, depth+1,
6442 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6443 nextbranch= regnext((regnode*)nextbranch);
6445 if (min1 > (SSize_t)(minnext + trie->minlen))
6446 min1 = minnext + trie->minlen;
6447 if (deltanext == OPTIMIZE_INFTY) {
6448 is_inf = is_inf_internal = 1;
6449 max1 = OPTIMIZE_INFTY;
6450 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6451 max1 = minnext + deltanext + trie->maxlen;
6453 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6455 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6456 if ( stopmin > min + min1)
6457 stopmin = min + min1;
6458 flags &= ~SCF_DO_SUBSTR;
6460 data->flags |= SCF_SEEN_ACCEPT;
6463 if (data_fake.flags & SF_HAS_EVAL)
6464 data->flags |= SF_HAS_EVAL;
6465 data->whilem_c = data_fake.whilem_c;
6467 if (flags & SCF_DO_STCLASS)
6468 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6471 if (flags & SCF_DO_SUBSTR) {
6472 data->pos_min += min1;
6473 data->pos_delta += max1 - min1;
6474 if (max1 != min1 || is_inf)
6475 data->cur_is_floating = 1; /* float */
6478 if (delta != OPTIMIZE_INFTY) {
6479 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6480 delta += max1 - min1;
6482 delta = OPTIMIZE_INFTY;
6484 if (flags & SCF_DO_STCLASS_OR) {
6485 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6487 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6488 flags &= ~SCF_DO_STCLASS;
6491 else if (flags & SCF_DO_STCLASS_AND) {
6493 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6494 flags &= ~SCF_DO_STCLASS;
6497 /* Switch to OR mode: cache the old value of
6498 * data->start_class */
6500 StructCopy(data->start_class, and_withp, regnode_ssc);
6501 flags &= ~SCF_DO_STCLASS_AND;
6502 StructCopy(&accum, data->start_class, regnode_ssc);
6503 flags |= SCF_DO_STCLASS_OR;
6510 else if (PL_regkind[OP(scan)] == TRIE) {
6511 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6514 min += trie->minlen;
6515 delta += (trie->maxlen - trie->minlen);
6516 flags &= ~SCF_DO_STCLASS; /* xxx */
6517 if (flags & SCF_DO_SUBSTR) {
6518 /* Cannot expect anything... */
6519 scan_commit(pRExC_state, data, minlenp, is_inf);
6520 data->pos_min += trie->minlen;
6521 data->pos_delta += (trie->maxlen - trie->minlen);
6522 if (trie->maxlen != trie->minlen)
6523 data->cur_is_floating = 1; /* float */
6525 if (trie->jump) /* no more substrings -- for now /grr*/
6526 flags &= ~SCF_DO_SUBSTR;
6528 else if (OP(scan) == REGEX_SET) {
6529 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6530 " before optimization", reg_name[REGEX_SET]);
6533 #endif /* old or new */
6534 #endif /* TRIE_STUDY_OPT */
6536 /* Else: zero-length, ignore. */
6537 scan = regnext(scan);
6542 /* we need to unwind recursion. */
6545 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6546 DEBUG_PEEP("fend", scan, depth, flags);
6548 /* restore previous context */
6549 last = frame->last_regnode;
6550 scan = frame->next_regnode;
6551 stopparen = frame->stopparen;
6552 recursed_depth = frame->prev_recursed_depth;
6554 RExC_frame_last = frame->prev_frame;
6555 frame = frame->this_prev_frame;
6556 goto fake_study_recurse;
6560 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6563 *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6565 if (flags & SCF_DO_SUBSTR && is_inf)
6566 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6567 if (is_par > (I32)U8_MAX)
6569 if (is_par && pars==1 && data) {
6570 data->flags |= SF_IN_PAR;
6571 data->flags &= ~SF_HAS_PAR;
6573 else if (pars && data) {
6574 data->flags |= SF_HAS_PAR;
6575 data->flags &= ~SF_IN_PAR;
6577 if (flags & SCF_DO_STCLASS_OR)
6578 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6579 if (flags & SCF_TRIE_RESTUDY)
6580 data->flags |= SCF_TRIE_RESTUDY;
6582 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6584 final_minlen = min < stopmin
6587 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6588 if (final_minlen > OPTIMIZE_INFTY - delta)
6589 RExC_maxlen = OPTIMIZE_INFTY;
6590 else if (RExC_maxlen < final_minlen + delta)
6591 RExC_maxlen = final_minlen + delta;
6593 return final_minlen;
6597 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6599 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6601 PERL_ARGS_ASSERT_ADD_DATA;
6603 Renewc(RExC_rxi->data,
6604 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6605 char, struct reg_data);
6607 Renew(RExC_rxi->data->what, count + n, U8);
6609 Newx(RExC_rxi->data->what, n, U8);
6610 RExC_rxi->data->count = count + n;
6611 Copy(s, RExC_rxi->data->what + count, n, U8);
6615 /*XXX: todo make this not included in a non debugging perl, but appears to be
6616 * used anyway there, in 'use re' */
6617 #ifndef PERL_IN_XSUB_RE
6619 Perl_reginitcolors(pTHX)
6621 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6623 char *t = savepv(s);
6627 t = strchr(t, '\t');
6633 PL_colors[i] = t = (char *)"";
6638 PL_colors[i++] = (char *)"";
6645 #ifdef TRIE_STUDY_OPT
6646 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6649 (data.flags & SCF_TRIE_RESTUDY) \
6657 #define CHECK_RESTUDY_GOTO_butfirst
6661 * pregcomp - compile a regular expression into internal code
6663 * Decides which engine's compiler to call based on the hint currently in
6667 #ifndef PERL_IN_XSUB_RE
6669 /* return the currently in-scope regex engine (or the default if none) */
6671 regexp_engine const *
6672 Perl_current_re_engine(pTHX)
6674 if (IN_PERL_COMPILETIME) {
6675 HV * const table = GvHV(PL_hintgv);
6678 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6679 return &PL_core_reg_engine;
6680 ptr = hv_fetchs(table, "regcomp", FALSE);
6681 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6682 return &PL_core_reg_engine;
6683 return INT2PTR(regexp_engine*, SvIV(*ptr));
6687 if (!PL_curcop->cop_hints_hash)
6688 return &PL_core_reg_engine;
6689 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6690 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6691 return &PL_core_reg_engine;
6692 return INT2PTR(regexp_engine*, SvIV(ptr));
6698 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6700 regexp_engine const *eng = current_re_engine();
6701 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6703 PERL_ARGS_ASSERT_PREGCOMP;
6705 /* Dispatch a request to compile a regexp to correct regexp engine. */
6707 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6710 return CALLREGCOMP_ENG(eng, pattern, flags);
6714 /* public(ish) entry point for the perl core's own regex compiling code.
6715 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6716 * pattern rather than a list of OPs, and uses the internal engine rather
6717 * than the current one */
6720 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6722 SV *pat = pattern; /* defeat constness! */
6724 PERL_ARGS_ASSERT_RE_COMPILE;
6726 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6727 #ifdef PERL_IN_XSUB_RE
6730 &PL_core_reg_engine,
6732 NULL, NULL, rx_flags, 0);
6736 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6740 if (--cbs->refcnt > 0)
6742 for (n = 0; n < cbs->count; n++) {
6743 REGEXP *rx = cbs->cb[n].src_regex;
6745 cbs->cb[n].src_regex = NULL;
6746 SvREFCNT_dec_NN(rx);
6754 static struct reg_code_blocks *
6755 S_alloc_code_blocks(pTHX_ int ncode)
6757 struct reg_code_blocks *cbs;
6758 Newx(cbs, 1, struct reg_code_blocks);
6761 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6763 Newx(cbs->cb, ncode, struct reg_code_block);
6770 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6771 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6772 * point to the realloced string and length.
6774 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6778 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6779 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6781 U8 *const src = (U8*)*pat_p;
6786 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6788 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6789 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6791 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6792 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6795 while (s < *plen_p) {
6796 append_utf8_from_native_byte(src[s], &d);
6798 if (n < num_code_blocks) {
6799 assert(pRExC_state->code_blocks);
6800 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6801 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6802 assert(*(d - 1) == '(');
6805 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6806 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6807 assert(*(d - 1) == ')');
6816 *pat_p = (char*) dst;
6818 RExC_orig_utf8 = RExC_utf8 = 1;
6823 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6824 * while recording any code block indices, and handling overloading,
6825 * nested qr// objects etc. If pat is null, it will allocate a new
6826 * string, or just return the first arg, if there's only one.
6828 * Returns the malloced/updated pat.
6829 * patternp and pat_count is the array of SVs to be concatted;
6830 * oplist is the optional list of ops that generated the SVs;
6831 * recompile_p is a pointer to a boolean that will be set if
6832 * the regex will need to be recompiled.
6833 * delim, if non-null is an SV that will be inserted between each element
6837 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6838 SV *pat, SV ** const patternp, int pat_count,
6839 OP *oplist, bool *recompile_p, SV *delim)
6843 bool use_delim = FALSE;
6844 bool alloced = FALSE;
6846 /* if we know we have at least two args, create an empty string,
6847 * then concatenate args to that. For no args, return an empty string */
6848 if (!pat && pat_count != 1) {
6854 for (svp = patternp; svp < patternp + pat_count; svp++) {
6857 STRLEN orig_patlen = 0;
6859 SV *msv = use_delim ? delim : *svp;
6860 if (!msv) msv = &PL_sv_undef;
6862 /* if we've got a delimiter, we go round the loop twice for each
6863 * svp slot (except the last), using the delimiter the second
6872 if (SvTYPE(msv) == SVt_PVAV) {
6873 /* we've encountered an interpolated array within
6874 * the pattern, e.g. /...@a..../. Expand the list of elements,
6875 * then recursively append elements.
6876 * The code in this block is based on S_pushav() */
6878 AV *const av = (AV*)msv;
6879 const SSize_t maxarg = AvFILL(av) + 1;
6883 assert(oplist->op_type == OP_PADAV
6884 || oplist->op_type == OP_RV2AV);
6885 oplist = OpSIBLING(oplist);
6888 if (SvRMAGICAL(av)) {
6891 Newx(array, maxarg, SV*);
6893 for (i=0; i < maxarg; i++) {
6894 SV ** const svp = av_fetch(av, i, FALSE);
6895 array[i] = svp ? *svp : &PL_sv_undef;
6899 array = AvARRAY(av);
6901 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6902 array, maxarg, NULL, recompile_p,
6904 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6910 /* we make the assumption here that each op in the list of
6911 * op_siblings maps to one SV pushed onto the stack,
6912 * except for code blocks, with have both an OP_NULL and
6914 * This allows us to match up the list of SVs against the
6915 * list of OPs to find the next code block.
6917 * Note that PUSHMARK PADSV PADSV ..
6919 * PADRANGE PADSV PADSV ..
6920 * so the alignment still works. */
6923 if (oplist->op_type == OP_NULL
6924 && (oplist->op_flags & OPf_SPECIAL))
6926 assert(n < pRExC_state->code_blocks->count);
6927 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6928 pRExC_state->code_blocks->cb[n].block = oplist;
6929 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6932 oplist = OpSIBLING(oplist); /* skip CONST */
6935 oplist = OpSIBLING(oplist);;
6938 /* apply magic and QR overloading to arg */
6941 if (SvROK(msv) && SvAMAGIC(msv)) {
6942 SV *sv = AMG_CALLunary(msv, regexp_amg);
6946 if (SvTYPE(sv) != SVt_REGEXP)
6947 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6952 /* try concatenation overload ... */
6953 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6954 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6957 /* overloading involved: all bets are off over literal
6958 * code. Pretend we haven't seen it */
6960 pRExC_state->code_blocks->count -= n;
6964 /* ... or failing that, try "" overload */
6965 while (SvAMAGIC(msv)
6966 && (sv = AMG_CALLunary(msv, string_amg))
6970 && SvRV(msv) == SvRV(sv))
6975 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6979 /* this is a partially unrolled
6980 * sv_catsv_nomg(pat, msv);
6981 * that allows us to adjust code block indices if
6984 char *dst = SvPV_force_nomg(pat, dlen);
6986 if (SvUTF8(msv) && !SvUTF8(pat)) {
6987 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6988 sv_setpvn(pat, dst, dlen);
6991 sv_catsv_nomg(pat, msv);
6995 /* We have only one SV to process, but we need to verify
6996 * it is properly null terminated or we will fail asserts
6997 * later. In theory we probably shouldn't get such SV's,
6998 * but if we do we should handle it gracefully. */
6999 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7000 /* not a string, or a string with a trailing null */
7003 /* a string with no trailing null, we need to copy it
7004 * so it has a trailing null */
7005 pat = sv_2mortal(newSVsv(msv));
7010 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7013 /* extract any code blocks within any embedded qr//'s */
7014 if (rx && SvTYPE(rx) == SVt_REGEXP
7015 && RX_ENGINE((REGEXP*)rx)->op_comp)
7018 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7019 if (ri->code_blocks && ri->code_blocks->count) {
7021 /* the presence of an embedded qr// with code means
7022 * we should always recompile: the text of the
7023 * qr// may not have changed, but it may be a
7024 * different closure than last time */
7026 if (pRExC_state->code_blocks) {
7027 int new_count = pRExC_state->code_blocks->count
7028 + ri->code_blocks->count;
7029 Renew(pRExC_state->code_blocks->cb,
7030 new_count, struct reg_code_block);
7031 pRExC_state->code_blocks->count = new_count;
7034 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7035 ri->code_blocks->count);
7037 for (i=0; i < ri->code_blocks->count; i++) {
7038 struct reg_code_block *src, *dst;
7039 STRLEN offset = orig_patlen
7040 + ReANY((REGEXP *)rx)->pre_prefix;
7041 assert(n < pRExC_state->code_blocks->count);
7042 src = &ri->code_blocks->cb[i];
7043 dst = &pRExC_state->code_blocks->cb[n];
7044 dst->start = src->start + offset;
7045 dst->end = src->end + offset;
7046 dst->block = src->block;
7047 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7056 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7065 /* see if there are any run-time code blocks in the pattern.
7066 * False positives are allowed */
7069 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7070 char *pat, STRLEN plen)
7075 PERL_UNUSED_CONTEXT;
7077 for (s = 0; s < plen; s++) {
7078 if ( pRExC_state->code_blocks
7079 && n < pRExC_state->code_blocks->count
7080 && s == pRExC_state->code_blocks->cb[n].start)
7082 s = pRExC_state->code_blocks->cb[n].end;
7086 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7088 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7090 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7097 /* Handle run-time code blocks. We will already have compiled any direct
7098 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7099 * copy of it, but with any literal code blocks blanked out and
7100 * appropriate chars escaped; then feed it into
7102 * eval "qr'modified_pattern'"
7106 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7110 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7112 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7113 * and merge them with any code blocks of the original regexp.
7115 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7116 * instead, just save the qr and return FALSE; this tells our caller that
7117 * the original pattern needs upgrading to utf8.
7121 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7122 char *pat, STRLEN plen)
7126 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7128 if (pRExC_state->runtime_code_qr) {
7129 /* this is the second time we've been called; this should
7130 * only happen if the main pattern got upgraded to utf8
7131 * during compilation; re-use the qr we compiled first time
7132 * round (which should be utf8 too)
7134 qr = pRExC_state->runtime_code_qr;
7135 pRExC_state->runtime_code_qr = NULL;
7136 assert(RExC_utf8 && SvUTF8(qr));
7142 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7146 /* determine how many extra chars we need for ' and \ escaping */
7147 for (s = 0; s < plen; s++) {
7148 if (pat[s] == '\'' || pat[s] == '\\')
7152 Newx(newpat, newlen, char);
7154 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7156 for (s = 0; s < plen; s++) {
7157 if ( pRExC_state->code_blocks
7158 && n < pRExC_state->code_blocks->count
7159 && s == pRExC_state->code_blocks->cb[n].start)
7161 /* blank out literal code block so that they aren't
7162 * recompiled: eg change from/to:
7172 assert(pat[s] == '(');
7173 assert(pat[s+1] == '?');
7177 while (s < pRExC_state->code_blocks->cb[n].end) {
7185 if (pat[s] == '\'' || pat[s] == '\\')
7190 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7192 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7198 Perl_re_printf( aTHX_
7199 "%sre-parsing pattern for runtime code:%s %s\n",
7200 PL_colors[4], PL_colors[5], newpat);
7203 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7209 PUSHSTACKi(PERLSI_REQUIRE);
7210 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7211 * parsing qr''; normally only q'' does this. It also alters
7213 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7214 SvREFCNT_dec_NN(sv);
7219 SV * const errsv = ERRSV;
7220 if (SvTRUE_NN(errsv))
7221 /* use croak_sv ? */
7222 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7224 assert(SvROK(qr_ref));
7226 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7227 /* the leaving below frees the tmp qr_ref.
7228 * Give qr a life of its own */
7236 if (!RExC_utf8 && SvUTF8(qr)) {
7237 /* first time through; the pattern got upgraded; save the
7238 * qr for the next time through */
7239 assert(!pRExC_state->runtime_code_qr);
7240 pRExC_state->runtime_code_qr = qr;
7245 /* extract any code blocks within the returned qr// */
7248 /* merge the main (r1) and run-time (r2) code blocks into one */
7250 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7251 struct reg_code_block *new_block, *dst;
7252 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7256 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7258 SvREFCNT_dec_NN(qr);
7262 if (!r1->code_blocks)
7263 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7265 r1c = r1->code_blocks->count;
7266 r2c = r2->code_blocks->count;
7268 Newx(new_block, r1c + r2c, struct reg_code_block);
7272 while (i1 < r1c || i2 < r2c) {
7273 struct reg_code_block *src;
7277 src = &r2->code_blocks->cb[i2++];
7281 src = &r1->code_blocks->cb[i1++];
7282 else if ( r1->code_blocks->cb[i1].start
7283 < r2->code_blocks->cb[i2].start)
7285 src = &r1->code_blocks->cb[i1++];
7286 assert(src->end < r2->code_blocks->cb[i2].start);
7289 assert( r1->code_blocks->cb[i1].start
7290 > r2->code_blocks->cb[i2].start);
7291 src = &r2->code_blocks->cb[i2++];
7293 assert(src->end < r1->code_blocks->cb[i1].start);
7296 assert(pat[src->start] == '(');
7297 assert(pat[src->end] == ')');
7298 dst->start = src->start;
7299 dst->end = src->end;
7300 dst->block = src->block;
7301 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7305 r1->code_blocks->count += r2c;
7306 Safefree(r1->code_blocks->cb);
7307 r1->code_blocks->cb = new_block;
7310 SvREFCNT_dec_NN(qr);
7316 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7317 struct reg_substr_datum *rsd,
7318 struct scan_data_substrs *sub,
7319 STRLEN longest_length)
7321 /* This is the common code for setting up the floating and fixed length
7322 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7323 * as to whether succeeded or not */
7327 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7328 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7330 if (! (longest_length
7331 || (eol /* Can't have SEOL and MULTI */
7332 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7334 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7335 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7340 /* copy the information about the longest from the reg_scan_data
7341 over to the program. */
7342 if (SvUTF8(sub->str)) {
7344 rsd->utf8_substr = sub->str;
7346 rsd->substr = sub->str;
7347 rsd->utf8_substr = NULL;
7349 /* end_shift is how many chars that must be matched that
7350 follow this item. We calculate it ahead of time as once the
7351 lookbehind offset is added in we lose the ability to correctly
7353 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7354 rsd->end_shift = ml - sub->min_offset
7356 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7358 + (SvTAIL(sub->str) != 0)
7362 t = (eol/* Can't have SEOL and MULTI */
7363 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7364 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7370 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7372 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7373 * properly wrapped with the right modifiers */
7375 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7376 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7377 != REGEX_DEPENDS_CHARSET);
7379 /* The caret is output if there are any defaults: if not all the STD
7380 * flags are set, or if no character set specifier is needed */
7382 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7384 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7385 == REG_RUN_ON_COMMENT_SEEN);
7386 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7387 >> RXf_PMf_STD_PMMOD_SHIFT);
7388 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7390 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7392 /* We output all the necessary flags; we never output a minus, as all
7393 * those are defaults, so are
7394 * covered by the caret */
7395 const STRLEN wraplen = pat_len + has_p + has_runon
7396 + has_default /* If needs a caret */
7397 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7399 /* If needs a character set specifier */
7400 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7401 + (sizeof("(?:)") - 1);
7403 PERL_ARGS_ASSERT_SET_REGEX_PV;
7405 /* make sure PL_bitcount bounds not exceeded */
7406 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7408 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7411 SvFLAGS(Rx) |= SVf_UTF8;
7414 /* If a default, cover it using the caret */
7416 *p++= DEFAULT_PAT_MOD;
7422 name = get_regex_charset_name(RExC_rx->extflags, &len);
7423 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7425 name = UNICODE_PAT_MODS;
7426 len = sizeof(UNICODE_PAT_MODS) - 1;
7428 Copy(name, p, len, char);
7432 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7435 while((ch = *fptr++)) {
7443 Copy(RExC_precomp, p, pat_len, char);
7444 assert ((RX_WRAPPED(Rx) - p) < 16);
7445 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7448 /* Adding a trailing \n causes this to compile properly:
7449 my $R = qr / A B C # D E/x; /($R)/
7450 Otherwise the parens are considered part of the comment */
7455 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7459 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7460 * regular expression into internal code.
7461 * The pattern may be passed either as:
7462 * a list of SVs (patternp plus pat_count)
7463 * a list of OPs (expr)
7464 * If both are passed, the SV list is used, but the OP list indicates
7465 * which SVs are actually pre-compiled code blocks
7467 * The SVs in the list have magic and qr overloading applied to them (and
7468 * the list may be modified in-place with replacement SVs in the latter
7471 * If the pattern hasn't changed from old_re, then old_re will be
7474 * eng is the current engine. If that engine has an op_comp method, then
7475 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7476 * do the initial concatenation of arguments and pass on to the external
7479 * If is_bare_re is not null, set it to a boolean indicating whether the
7480 * arg list reduced (after overloading) to a single bare regex which has
7481 * been returned (i.e. /$qr/).
7483 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7485 * pm_flags contains the PMf_* flags, typically based on those from the
7486 * pm_flags field of the related PMOP. Currently we're only interested in
7487 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7489 * For many years this code had an initial sizing pass that calculated
7490 * (sometimes incorrectly, leading to security holes) the size needed for the
7491 * compiled pattern. That was changed by commit
7492 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7493 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7494 * references to this sizing pass.
7496 * Now, an initial crude guess as to the size needed is made, based on the
7497 * length of the pattern. Patches welcome to improve that guess. That amount
7498 * of space is malloc'd and then immediately freed, and then clawed back node
7499 * by node. This design is to minimze, to the extent possible, memory churn
7500 * when doing the reallocs.
7502 * A separate parentheses counting pass may be needed in some cases.
7503 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7506 * The existence of a sizing pass necessitated design decisions that are no
7507 * longer needed. There are potential areas of simplification.
7509 * Beware that the optimization-preparation code in here knows about some
7510 * of the structure of the compiled regexp. [I'll say.]
7514 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7515 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7516 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7518 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7526 SV** new_patternp = patternp;
7528 /* these are all flags - maybe they should be turned
7529 * into a single int with different bit masks */
7530 I32 sawlookahead = 0;
7535 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7537 bool runtime_code = 0;
7539 RExC_state_t RExC_state;
7540 RExC_state_t * const pRExC_state = &RExC_state;
7541 #ifdef TRIE_STUDY_OPT
7543 RExC_state_t copyRExC_state;
7545 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7547 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7549 DEBUG_r(if (!PL_colorset) reginitcolors());
7552 pRExC_state->warn_text = NULL;
7553 pRExC_state->unlexed_names = NULL;
7554 pRExC_state->code_blocks = NULL;
7557 *is_bare_re = FALSE;
7559 if (expr && (expr->op_type == OP_LIST ||
7560 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7561 /* allocate code_blocks if needed */
7565 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7566 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7567 ncode++; /* count of DO blocks */
7570 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7574 /* compile-time pattern with just OP_CONSTs and DO blocks */
7579 /* find how many CONSTs there are */
7582 if (expr->op_type == OP_CONST)
7585 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7586 if (o->op_type == OP_CONST)
7590 /* fake up an SV array */
7592 assert(!new_patternp);
7593 Newx(new_patternp, n, SV*);
7594 SAVEFREEPV(new_patternp);
7598 if (expr->op_type == OP_CONST)
7599 new_patternp[n] = cSVOPx_sv(expr);
7601 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7602 if (o->op_type == OP_CONST)
7603 new_patternp[n++] = cSVOPo_sv;
7608 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7609 "Assembling pattern from %d elements%s\n", pat_count,
7610 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7612 /* set expr to the first arg op */
7614 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7615 && expr->op_type != OP_CONST)
7617 expr = cLISTOPx(expr)->op_first;
7618 assert( expr->op_type == OP_PUSHMARK
7619 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7620 || expr->op_type == OP_PADRANGE);
7621 expr = OpSIBLING(expr);
7624 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7625 expr, &recompile, NULL);
7627 /* handle bare (possibly after overloading) regex: foo =~ $re */
7632 if (SvTYPE(re) == SVt_REGEXP) {
7636 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7637 "Precompiled pattern%s\n",
7638 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7644 exp = SvPV_nomg(pat, plen);
7646 if (!eng->op_comp) {
7647 if ((SvUTF8(pat) && IN_BYTES)
7648 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7650 /* make a temporary copy; either to convert to bytes,
7651 * or to avoid repeating get-magic / overloaded stringify */
7652 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7653 (IN_BYTES ? 0 : SvUTF8(pat)));
7655 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7658 /* ignore the utf8ness if the pattern is 0 length */
7659 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7660 RExC_uni_semantics = 0;
7661 RExC_contains_locale = 0;
7662 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7663 RExC_in_script_run = 0;
7664 RExC_study_started = 0;
7665 pRExC_state->runtime_code_qr = NULL;
7666 RExC_frame_head= NULL;
7667 RExC_frame_last= NULL;
7668 RExC_frame_count= 0;
7669 RExC_latest_warn_offset = 0;
7670 RExC_use_BRANCHJ = 0;
7671 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7672 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7673 RExC_total_parens = 0;
7674 RExC_open_parens = NULL;
7675 RExC_close_parens = NULL;
7676 RExC_paren_names = NULL;
7678 RExC_seen_d_op = FALSE;
7680 RExC_paren_name_list = NULL;
7684 RExC_mysv1= sv_newmortal();
7685 RExC_mysv2= sv_newmortal();
7689 SV *dsv= sv_newmortal();
7690 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7691 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7692 PL_colors[4], PL_colors[5], s);
7695 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7698 if ((pm_flags & PMf_USE_RE_EVAL)
7699 /* this second condition covers the non-regex literal case,
7700 * i.e. $foo =~ '(?{})'. */
7701 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7703 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7706 /* return old regex if pattern hasn't changed */
7707 /* XXX: note in the below we have to check the flags as well as the
7710 * Things get a touch tricky as we have to compare the utf8 flag
7711 * independently from the compile flags. */
7715 && !!RX_UTF8(old_re) == !!RExC_utf8
7716 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7717 && RX_PRECOMP(old_re)
7718 && RX_PRELEN(old_re) == plen
7719 && memEQ(RX_PRECOMP(old_re), exp, plen)
7720 && !runtime_code /* with runtime code, always recompile */ )
7723 SV *dsv= sv_newmortal();
7724 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7725 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7726 PL_colors[4], PL_colors[5], s);
7731 /* Allocate the pattern's SV */
7732 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7733 RExC_rx = ReANY(Rx);
7734 if ( RExC_rx == NULL )
7735 FAIL("Regexp out of space");
7737 rx_flags = orig_rx_flags;
7739 if ( (UTF || RExC_uni_semantics)
7740 && initial_charset == REGEX_DEPENDS_CHARSET)
7743 /* Set to use unicode semantics if the pattern is in utf8 and has the
7744 * 'depends' charset specified, as it means unicode when utf8 */
7745 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7746 RExC_uni_semantics = 1;
7749 RExC_pm_flags = pm_flags;
7752 assert(TAINTING_get || !TAINT_get);
7754 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7756 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7757 /* whoops, we have a non-utf8 pattern, whilst run-time code
7758 * got compiled as utf8. Try again with a utf8 pattern */
7759 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7760 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7764 assert(!pRExC_state->runtime_code_qr);
7770 RExC_in_lookbehind = 0;
7771 RExC_in_lookahead = 0;
7772 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7773 RExC_recode_x_to_native = 0;
7774 RExC_in_multi_char_class = 0;
7776 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7777 RExC_precomp_end = RExC_end = exp + plen;
7779 RExC_whilem_seen = 0;
7781 RExC_recurse = NULL;
7782 RExC_study_chunk_recursed = NULL;
7783 RExC_study_chunk_recursed_bytes= 0;
7784 RExC_recurse_count = 0;
7785 RExC_sets_depth = 0;
7786 pRExC_state->code_index = 0;
7788 /* Initialize the string in the compiled pattern. This is so that there is
7789 * something to output if necessary */
7790 set_regex_pv(pRExC_state, Rx);
7793 Perl_re_printf( aTHX_
7794 "Starting parse and generation\n");
7796 RExC_lastparse=NULL;
7799 /* Allocate space and zero-initialize. Note, the two step process
7800 of zeroing when in debug mode, thus anything assigned has to
7801 happen after that */
7804 /* On the first pass of the parse, we guess how big this will be. Then
7805 * we grow in one operation to that amount and then give it back. As
7806 * we go along, we re-allocate what we need.
7808 * XXX Currently the guess is essentially that the pattern will be an
7809 * EXACT node with one byte input, one byte output. This is crude, and
7810 * better heuristics are welcome.
7812 * On any subsequent passes, we guess what we actually computed in the
7813 * latest earlier pass. Such a pass probably didn't complete so is
7814 * missing stuff. We could improve those guesses by knowing where the
7815 * parse stopped, and use the length so far plus apply the above
7816 * assumption to what's left. */
7817 RExC_size = STR_SZ(RExC_end - RExC_start);
7820 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7821 if ( RExC_rxi == NULL )
7822 FAIL("Regexp out of space");
7824 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7825 RXi_SET( RExC_rx, RExC_rxi );
7827 /* We start from 0 (over from 0 in the case this is a reparse. The first
7828 * node parsed will give back any excess memory we have allocated so far).
7832 /* non-zero initialization begins here */
7833 RExC_rx->engine= eng;
7834 RExC_rx->extflags = rx_flags;
7835 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7837 if (pm_flags & PMf_IS_QR) {
7838 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7839 if (RExC_rxi->code_blocks) {
7840 RExC_rxi->code_blocks->refcnt++;
7844 RExC_rx->intflags = 0;
7846 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7849 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7850 * code makes sure the final byte is an uncounted NUL. But should this
7851 * ever not be the case, lots of things could read beyond the end of the
7852 * buffer: loops like
7853 * while(isFOO(*RExC_parse)) RExC_parse++;
7854 * strchr(RExC_parse, "foo");
7855 * etc. So it is worth noting. */
7856 assert(*RExC_end == '\0');
7860 RExC_parens_buf_size = 0;
7861 RExC_emit_start = RExC_rxi->program;
7862 pRExC_state->code_index = 0;
7864 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7868 if (reg(pRExC_state, 0, &flags, 1)) {
7870 /* Success!, But we may need to redo the parse knowing how many parens
7871 * there actually are */
7872 if (IN_PARENS_PASS) {
7873 flags |= RESTART_PARSE;
7876 /* We have that number in RExC_npar */
7877 RExC_total_parens = RExC_npar;
7879 else if (! MUST_RESTART(flags)) {
7881 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7884 /* Here, we either have success, or we have to redo the parse for some reason */
7885 if (MUST_RESTART(flags)) {
7887 /* It's possible to write a regexp in ascii that represents Unicode
7888 codepoints outside of the byte range, such as via \x{100}. If we
7889 detect such a sequence we have to convert the entire pattern to utf8
7890 and then recompile, as our sizing calculation will have been based
7891 on 1 byte == 1 character, but we will need to use utf8 to encode
7892 at least some part of the pattern, and therefore must convert the whole
7895 if (flags & NEED_UTF8) {
7897 /* We have stored the offset of the final warning output so far.
7898 * That must be adjusted. Any variant characters between the start
7899 * of the pattern and this warning count for 2 bytes in the final,
7900 * so just add them again */
7901 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7902 RExC_latest_warn_offset +=
7903 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7904 + RExC_latest_warn_offset);
7906 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7907 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7908 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7911 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7914 if (ALL_PARENS_COUNTED) {
7915 /* Make enough room for all the known parens, and zero it */
7916 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7917 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7918 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7920 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7921 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7923 else { /* Parse did not complete. Reinitialize the parentheses
7925 RExC_total_parens = 0;
7926 if (RExC_open_parens) {
7927 Safefree(RExC_open_parens);
7928 RExC_open_parens = NULL;
7930 if (RExC_close_parens) {
7931 Safefree(RExC_close_parens);
7932 RExC_close_parens = NULL;
7936 /* Clean up what we did in this parse */
7937 SvREFCNT_dec_NN(RExC_rx_sv);
7942 /* Here, we have successfully parsed and generated the pattern's program
7943 * for the regex engine. We are ready to finish things up and look for
7946 /* Update the string to compile, with correct modifiers, etc */
7947 set_regex_pv(pRExC_state, Rx);
7949 RExC_rx->nparens = RExC_total_parens - 1;
7951 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7952 if (RExC_whilem_seen > 15)
7953 RExC_whilem_seen = 15;
7956 Perl_re_printf( aTHX_
7957 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7959 RExC_lastparse=NULL;
7962 #ifdef RE_TRACK_PATTERN_OFFSETS
7963 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7964 "%s %" UVuf " bytes for offset annotations.\n",
7965 RExC_offsets ? "Got" : "Couldn't get",
7966 (UV)((RExC_offsets[0] * 2 + 1))));
7967 DEBUG_OFFSETS_r(if (RExC_offsets) {
7968 const STRLEN len = RExC_offsets[0];
7970 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7971 Perl_re_printf( aTHX_
7972 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7973 for (i = 1; i <= len; i++) {
7974 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7975 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7976 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7978 Perl_re_printf( aTHX_ "\n");
7982 SetProgLen(RExC_rxi,RExC_size);
7985 DEBUG_DUMP_PRE_OPTIMIZE_r({
7986 SV * const sv = sv_newmortal();
7987 RXi_GET_DECL(RExC_rx, ri);
7989 Perl_re_printf( aTHX_ "Program before optimization:\n");
7991 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7996 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7999 /* XXXX To minimize changes to RE engine we always allocate
8000 3-units-long substrs field. */
8001 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8002 if (RExC_recurse_count) {
8003 Newx(RExC_recurse, RExC_recurse_count, regnode *);
8004 SAVEFREEPV(RExC_recurse);
8007 if (RExC_seen & REG_RECURSE_SEEN) {
8008 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8009 * So its 1 if there are no parens. */
8010 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8011 ((RExC_total_parens & 0x07) != 0);
8012 Newx(RExC_study_chunk_recursed,
8013 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8014 SAVEFREEPV(RExC_study_chunk_recursed);
8018 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8020 RExC_study_chunk_recursed_count= 0;
8022 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8023 if (RExC_study_chunk_recursed) {
8024 Zero(RExC_study_chunk_recursed,
8025 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8029 #ifdef TRIE_STUDY_OPT
8031 StructCopy(&zero_scan_data, &data, scan_data_t);
8032 copyRExC_state = RExC_state;
8035 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8037 RExC_state = copyRExC_state;
8038 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8039 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8041 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8042 StructCopy(&zero_scan_data, &data, scan_data_t);
8045 StructCopy(&zero_scan_data, &data, scan_data_t);
8048 /* Dig out information for optimizations. */
8049 RExC_rx->extflags = RExC_flags; /* was pm_op */
8050 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8053 SvUTF8_on(Rx); /* Unicode in it? */
8054 RExC_rxi->regstclass = NULL;
8055 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8056 RExC_rx->intflags |= PREGf_NAUGHTY;
8057 scan = RExC_rxi->program + 1; /* First BRANCH. */
8059 /* testing for BRANCH here tells us whether there is "must appear"
8060 data in the pattern. If there is then we can use it for optimisations */
8061 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8064 STRLEN longest_length[2];
8065 regnode_ssc ch_class; /* pointed to by data */
8067 SSize_t last_close = 0; /* pointed to by data */
8068 regnode *first= scan;
8069 regnode *first_next= regnext(first);
8073 * Skip introductions and multiplicators >= 1
8074 * so that we can extract the 'meat' of the pattern that must
8075 * match in the large if() sequence following.
8076 * NOTE that EXACT is NOT covered here, as it is normally
8077 * picked up by the optimiser separately.
8079 * This is unfortunate as the optimiser isnt handling lookahead
8080 * properly currently.
8083 while ((OP(first) == OPEN && (sawopen = 1)) ||
8084 /* An OR of *one* alternative - should not happen now. */
8085 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8086 /* for now we can't handle lookbehind IFMATCH*/
8087 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8088 (OP(first) == PLUS) ||
8089 (OP(first) == MINMOD) ||
8090 /* An {n,m} with n>0 */
8091 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8092 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8095 * the only op that could be a regnode is PLUS, all the rest
8096 * will be regnode_1 or regnode_2.
8098 * (yves doesn't think this is true)
8100 if (OP(first) == PLUS)
8103 if (OP(first) == MINMOD)
8105 first += regarglen[OP(first)];
8107 first = NEXTOPER(first);
8108 first_next= regnext(first);
8111 /* Starting-point info. */
8113 DEBUG_PEEP("first:", first, 0, 0);
8114 /* Ignore EXACT as we deal with it later. */
8115 if (PL_regkind[OP(first)] == EXACT) {
8116 if (! isEXACTFish(OP(first))) {
8117 NOOP; /* Empty, get anchored substr later. */
8120 RExC_rxi->regstclass = first;
8123 else if (PL_regkind[OP(first)] == TRIE &&
8124 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8126 /* this can happen only on restudy */
8127 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8130 else if (REGNODE_SIMPLE(OP(first)))
8131 RExC_rxi->regstclass = first;
8132 else if (PL_regkind[OP(first)] == BOUND ||
8133 PL_regkind[OP(first)] == NBOUND)
8134 RExC_rxi->regstclass = first;
8135 else if (PL_regkind[OP(first)] == BOL) {
8136 RExC_rx->intflags |= (OP(first) == MBOL
8139 first = NEXTOPER(first);
8142 else if (OP(first) == GPOS) {
8143 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8144 first = NEXTOPER(first);
8147 else if ((!sawopen || !RExC_sawback) &&
8149 (OP(first) == STAR &&
8150 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8151 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8153 /* turn .* into ^.* with an implied $*=1 */
8155 (OP(NEXTOPER(first)) == REG_ANY)
8158 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8159 first = NEXTOPER(first);
8162 if (sawplus && !sawminmod && !sawlookahead
8163 && (!sawopen || !RExC_sawback)
8164 && !pRExC_state->code_blocks) /* May examine pos and $& */
8165 /* x+ must match at the 1st pos of run of x's */
8166 RExC_rx->intflags |= PREGf_SKIP;
8168 /* Scan is after the zeroth branch, first is atomic matcher. */
8169 #ifdef TRIE_STUDY_OPT
8172 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8173 (IV)(first - scan + 1))
8177 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8178 (IV)(first - scan + 1))
8184 * If there's something expensive in the r.e., find the
8185 * longest literal string that must appear and make it the
8186 * regmust. Resolve ties in favor of later strings, since
8187 * the regstart check works with the beginning of the r.e.
8188 * and avoiding duplication strengthens checking. Not a
8189 * strong reason, but sufficient in the absence of others.
8190 * [Now we resolve ties in favor of the earlier string if
8191 * it happens that c_offset_min has been invalidated, since the
8192 * earlier string may buy us something the later one won't.]
8195 data.substrs[0].str = newSVpvs("");
8196 data.substrs[1].str = newSVpvs("");
8197 data.last_found = newSVpvs("");
8198 data.cur_is_floating = 0; /* initially any found substring is fixed */
8199 ENTER_with_name("study_chunk");
8200 SAVEFREESV(data.substrs[0].str);
8201 SAVEFREESV(data.substrs[1].str);
8202 SAVEFREESV(data.last_found);
8204 if (!RExC_rxi->regstclass) {
8205 ssc_init(pRExC_state, &ch_class);
8206 data.start_class = &ch_class;
8207 stclass_flag = SCF_DO_STCLASS_AND;
8208 } else /* XXXX Check for BOUND? */
8210 data.last_closep = &last_close;
8214 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8215 * (NO top level branches)
8217 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8218 scan + RExC_size, /* Up to end */
8220 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8221 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8225 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8228 if ( RExC_total_parens == 1 && !data.cur_is_floating
8229 && data.last_start_min == 0 && data.last_end > 0
8230 && !RExC_seen_zerolen
8231 && !(RExC_seen & REG_VERBARG_SEEN)
8232 && !(RExC_seen & REG_GPOS_SEEN)
8234 RExC_rx->extflags |= RXf_CHECK_ALL;
8236 scan_commit(pRExC_state, &data,&minlen, 0);
8239 /* XXX this is done in reverse order because that's the way the
8240 * code was before it was parameterised. Don't know whether it
8241 * actually needs doing in reverse order. DAPM */
8242 for (i = 1; i >= 0; i--) {
8243 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8246 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8247 && data.substrs[0].min_offset
8248 == data.substrs[1].min_offset
8249 && SvCUR(data.substrs[0].str)
8250 == SvCUR(data.substrs[1].str)
8252 && S_setup_longest (aTHX_ pRExC_state,
8253 &(RExC_rx->substrs->data[i]),
8257 RExC_rx->substrs->data[i].min_offset =
8258 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8260 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8261 /* Don't offset infinity */
8262 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8263 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8264 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8267 RExC_rx->substrs->data[i].substr = NULL;
8268 RExC_rx->substrs->data[i].utf8_substr = NULL;
8269 longest_length[i] = 0;
8273 LEAVE_with_name("study_chunk");
8275 if (RExC_rxi->regstclass
8276 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8277 RExC_rxi->regstclass = NULL;
8279 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8280 || RExC_rx->substrs->data[0].min_offset)
8282 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8283 && is_ssc_worth_it(pRExC_state, data.start_class))
8285 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8287 ssc_finalize(pRExC_state, data.start_class);
8289 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8290 StructCopy(data.start_class,
8291 (regnode_ssc*)RExC_rxi->data->data[n],
8293 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8294 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8295 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8296 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8297 Perl_re_printf( aTHX_
8298 "synthetic stclass \"%s\".\n",
8299 SvPVX_const(sv));});
8300 data.start_class = NULL;
8303 /* A temporary algorithm prefers floated substr to fixed one of
8304 * same length to dig more info. */
8305 i = (longest_length[0] <= longest_length[1]);
8306 RExC_rx->substrs->check_ix = i;
8307 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8308 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8309 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8310 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8311 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8312 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8313 RExC_rx->intflags |= PREGf_NOSCAN;
8315 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8316 RExC_rx->extflags |= RXf_USE_INTUIT;
8317 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8318 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8321 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8322 if ( (STRLEN)minlen < longest_length[1] )
8323 minlen= longest_length[1];
8324 if ( (STRLEN)minlen < longest_length[0] )
8325 minlen= longest_length[0];
8329 /* Several toplevels. Best we can is to set minlen. */
8331 regnode_ssc ch_class;
8332 SSize_t last_close = 0;
8334 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8336 scan = RExC_rxi->program + 1;
8337 ssc_init(pRExC_state, &ch_class);
8338 data.start_class = &ch_class;
8339 data.last_closep = &last_close;
8343 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8344 * (patterns WITH top level branches)
8346 minlen = study_chunk(pRExC_state,
8347 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8348 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8349 ? SCF_TRIE_DOING_RESTUDY
8353 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8355 RExC_rx->check_substr = NULL;
8356 RExC_rx->check_utf8 = NULL;
8357 RExC_rx->substrs->data[0].substr = NULL;
8358 RExC_rx->substrs->data[0].utf8_substr = NULL;
8359 RExC_rx->substrs->data[1].substr = NULL;
8360 RExC_rx->substrs->data[1].utf8_substr = NULL;
8362 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8363 && is_ssc_worth_it(pRExC_state, data.start_class))
8365 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8367 ssc_finalize(pRExC_state, data.start_class);
8369 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8370 StructCopy(data.start_class,
8371 (regnode_ssc*)RExC_rxi->data->data[n],
8373 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8374 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8375 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8376 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8377 Perl_re_printf( aTHX_
8378 "synthetic stclass \"%s\".\n",
8379 SvPVX_const(sv));});
8380 data.start_class = NULL;
8384 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8385 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8386 RExC_rx->maxlen = REG_INFTY;
8389 RExC_rx->maxlen = RExC_maxlen;
8392 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8393 the "real" pattern. */
8395 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8396 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8398 RExC_rx->minlenret = minlen;
8399 if (RExC_rx->minlen < minlen)
8400 RExC_rx->minlen = minlen;
8402 if (RExC_seen & REG_RECURSE_SEEN ) {
8403 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8404 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8406 if (RExC_seen & REG_GPOS_SEEN)
8407 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8408 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8409 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8411 if (pRExC_state->code_blocks)
8412 RExC_rx->extflags |= RXf_EVAL_SEEN;
8413 if (RExC_seen & REG_VERBARG_SEEN)
8415 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8416 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8418 if (RExC_seen & REG_CUTGROUP_SEEN)
8419 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8420 if (pm_flags & PMf_USE_RE_EVAL)
8421 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8422 if (RExC_paren_names)
8423 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8425 RXp_PAREN_NAMES(RExC_rx) = NULL;
8427 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8428 * so it can be used in pp.c */
8429 if (RExC_rx->intflags & PREGf_ANCH)
8430 RExC_rx->extflags |= RXf_IS_ANCHORED;
8434 /* this is used to identify "special" patterns that might result
8435 * in Perl NOT calling the regex engine and instead doing the match "itself",
8436 * particularly special cases in split//. By having the regex compiler
8437 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8438 * we avoid weird issues with equivalent patterns resulting in different behavior,
8439 * AND we allow non Perl engines to get the same optimizations by the setting the
8440 * flags appropriately - Yves */
8441 regnode *first = RExC_rxi->program + 1;
8443 regnode *next = regnext(first);
8446 if (PL_regkind[fop] == NOTHING && nop == END)
8447 RExC_rx->extflags |= RXf_NULL;
8448 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8449 /* when fop is SBOL first->flags will be true only when it was
8450 * produced by parsing /\A/, and not when parsing /^/. This is
8451 * very important for the split code as there we want to
8452 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8453 * See rt #122761 for more details. -- Yves */
8454 RExC_rx->extflags |= RXf_START_ONLY;
8455 else if (fop == PLUS
8456 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8458 RExC_rx->extflags |= RXf_WHITE;
8459 else if ( RExC_rx->extflags & RXf_SPLIT
8460 && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop))
8461 && STR_LEN(first) == 1
8462 && *(STRING(first)) == ' '
8464 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8468 if (RExC_contains_locale) {
8469 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8473 if (RExC_paren_names) {
8474 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8475 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8476 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8479 RExC_rxi->name_list_idx = 0;
8481 while ( RExC_recurse_count > 0 ) {
8482 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8484 * This data structure is set up in study_chunk() and is used
8485 * to calculate the distance between a GOSUB regopcode and
8486 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8489 * If for some reason someone writes code that optimises
8490 * away a GOSUB opcode then the assert should be changed to
8491 * an if(scan) to guard the ARG2L_SET() - Yves
8494 assert(scan && OP(scan) == GOSUB);
8495 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8498 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8499 /* assume we don't need to swap parens around before we match */
8501 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8502 (unsigned long)RExC_study_chunk_recursed_count);
8506 Perl_re_printf( aTHX_ "Final program:\n");
8510 if (RExC_open_parens) {
8511 Safefree(RExC_open_parens);
8512 RExC_open_parens = NULL;
8514 if (RExC_close_parens) {
8515 Safefree(RExC_close_parens);
8516 RExC_close_parens = NULL;
8520 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8521 * by setting the regexp SV to readonly-only instead. If the
8522 * pattern's been recompiled, the USEDness should remain. */
8523 if (old_re && SvREADONLY(old_re))
8531 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8534 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8536 PERL_UNUSED_ARG(value);
8538 if (flags & RXapif_FETCH) {
8539 return reg_named_buff_fetch(rx, key, flags);
8540 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8541 Perl_croak_no_modify();
8543 } else if (flags & RXapif_EXISTS) {
8544 return reg_named_buff_exists(rx, key, flags)
8547 } else if (flags & RXapif_REGNAMES) {
8548 return reg_named_buff_all(rx, flags);
8549 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8550 return reg_named_buff_scalar(rx, flags);
8552 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8558 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8561 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8562 PERL_UNUSED_ARG(lastkey);
8564 if (flags & RXapif_FIRSTKEY)
8565 return reg_named_buff_firstkey(rx, flags);
8566 else if (flags & RXapif_NEXTKEY)
8567 return reg_named_buff_nextkey(rx, flags);
8569 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8576 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8580 struct regexp *const rx = ReANY(r);
8582 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8584 if (rx && RXp_PAREN_NAMES(rx)) {
8585 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8588 SV* sv_dat=HeVAL(he_str);
8589 I32 *nums=(I32*)SvPVX(sv_dat);
8590 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8591 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8592 if ((I32)(rx->nparens) >= nums[i]
8593 && rx->offs[nums[i]].start != -1
8594 && rx->offs[nums[i]].end != -1)
8597 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8602 ret = newSVsv(&PL_sv_undef);
8605 av_push(retarray, ret);
8608 return newRV_noinc(MUTABLE_SV(retarray));
8615 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8618 struct regexp *const rx = ReANY(r);
8620 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8622 if (rx && RXp_PAREN_NAMES(rx)) {
8623 if (flags & RXapif_ALL) {
8624 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8626 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8628 SvREFCNT_dec_NN(sv);
8640 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8642 struct regexp *const rx = ReANY(r);
8644 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8646 if ( rx && RXp_PAREN_NAMES(rx) ) {
8647 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8649 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8656 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8658 struct regexp *const rx = ReANY(r);
8659 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8661 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8663 if (rx && RXp_PAREN_NAMES(rx)) {
8664 HV *hv = RXp_PAREN_NAMES(rx);
8666 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8669 SV* sv_dat = HeVAL(temphe);
8670 I32 *nums = (I32*)SvPVX(sv_dat);
8671 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8672 if ((I32)(rx->lastparen) >= nums[i] &&
8673 rx->offs[nums[i]].start != -1 &&
8674 rx->offs[nums[i]].end != -1)
8680 if (parno || flags & RXapif_ALL) {
8681 return newSVhek(HeKEY_hek(temphe));
8689 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8694 struct regexp *const rx = ReANY(r);
8696 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8698 if (rx && RXp_PAREN_NAMES(rx)) {
8699 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8700 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8701 } else if (flags & RXapif_ONE) {
8702 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8703 av = MUTABLE_AV(SvRV(ret));
8704 length = av_count(av);
8705 SvREFCNT_dec_NN(ret);
8706 return newSViv(length);
8708 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8713 return &PL_sv_undef;
8717 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8719 struct regexp *const rx = ReANY(r);
8722 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8724 if (rx && RXp_PAREN_NAMES(rx)) {
8725 HV *hv= RXp_PAREN_NAMES(rx);
8727 (void)hv_iterinit(hv);
8728 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8731 SV* sv_dat = HeVAL(temphe);
8732 I32 *nums = (I32*)SvPVX(sv_dat);
8733 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8734 if ((I32)(rx->lastparen) >= nums[i] &&
8735 rx->offs[nums[i]].start != -1 &&
8736 rx->offs[nums[i]].end != -1)
8742 if (parno || flags & RXapif_ALL) {
8743 av_push(av, newSVhek(HeKEY_hek(temphe)));
8748 return newRV_noinc(MUTABLE_SV(av));
8752 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8755 struct regexp *const rx = ReANY(r);
8761 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8763 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8764 || n == RX_BUFF_IDX_CARET_FULLMATCH
8765 || n == RX_BUFF_IDX_CARET_POSTMATCH
8768 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8770 /* on something like
8773 * the KEEPCOPY is set on the PMOP rather than the regex */
8774 if (PL_curpm && r == PM_GETRE(PL_curpm))
8775 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8784 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8785 /* no need to distinguish between them any more */
8786 n = RX_BUFF_IDX_FULLMATCH;
8788 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8789 && rx->offs[0].start != -1)
8791 /* $`, ${^PREMATCH} */
8792 i = rx->offs[0].start;
8796 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8797 && rx->offs[0].end != -1)
8799 /* $', ${^POSTMATCH} */
8800 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8801 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8804 if (inRANGE(n, 0, (I32)rx->nparens) &&
8805 (s1 = rx->offs[n].start) != -1 &&
8806 (t1 = rx->offs[n].end) != -1)
8808 /* $&, ${^MATCH}, $1 ... */
8810 s = rx->subbeg + s1 - rx->suboffset;
8815 assert(s >= rx->subbeg);
8816 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8818 #ifdef NO_TAINT_SUPPORT
8819 sv_setpvn(sv, s, i);
8821 const int oldtainted = TAINT_get;
8823 sv_setpvn(sv, s, i);
8824 TAINT_set(oldtainted);
8826 if (RXp_MATCH_UTF8(rx))
8831 if (RXp_MATCH_TAINTED(rx)) {
8832 if (SvTYPE(sv) >= SVt_PVMG) {
8833 MAGIC* const mg = SvMAGIC(sv);
8836 SvMAGIC_set(sv, mg->mg_moremagic);
8838 if ((mgt = SvMAGIC(sv))) {
8839 mg->mg_moremagic = mgt;
8840 SvMAGIC_set(sv, mg);
8857 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8858 SV const * const value)
8860 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8862 PERL_UNUSED_ARG(rx);
8863 PERL_UNUSED_ARG(paren);
8864 PERL_UNUSED_ARG(value);
8867 Perl_croak_no_modify();
8871 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8874 struct regexp *const rx = ReANY(r);
8878 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8880 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8881 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8882 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8885 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8887 /* on something like
8890 * the KEEPCOPY is set on the PMOP rather than the regex */
8891 if (PL_curpm && r == PM_GETRE(PL_curpm))
8892 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8898 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8900 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8901 case RX_BUFF_IDX_PREMATCH: /* $` */
8902 if (rx->offs[0].start != -1) {
8903 i = rx->offs[0].start;
8912 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8913 case RX_BUFF_IDX_POSTMATCH: /* $' */
8914 if (rx->offs[0].end != -1) {
8915 i = rx->sublen - rx->offs[0].end;
8917 s1 = rx->offs[0].end;
8924 default: /* $& / ${^MATCH}, $1, $2, ... */
8925 if (paren <= (I32)rx->nparens &&
8926 (s1 = rx->offs[paren].start) != -1 &&
8927 (t1 = rx->offs[paren].end) != -1)
8933 if (ckWARN(WARN_UNINITIALIZED))
8934 report_uninit((const SV *)sv);
8939 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8940 const char * const s = rx->subbeg - rx->suboffset + s1;
8945 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8952 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8954 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8955 PERL_UNUSED_ARG(rx);
8959 return newSVpvs("Regexp");
8962 /* Scans the name of a named buffer from the pattern.
8963 * If flags is REG_RSN_RETURN_NULL returns null.
8964 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8965 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8966 * to the parsed name as looked up in the RExC_paren_names hash.
8967 * If there is an error throws a vFAIL().. type exception.
8970 #define REG_RSN_RETURN_NULL 0
8971 #define REG_RSN_RETURN_NAME 1
8972 #define REG_RSN_RETURN_DATA 2
8975 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8977 char *name_start = RExC_parse;
8980 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8982 assert (RExC_parse <= RExC_end);
8983 if (RExC_parse == RExC_end) NOOP;
8984 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8985 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8986 * using do...while */
8989 RExC_parse += UTF8SKIP(RExC_parse);
8990 } while ( RExC_parse < RExC_end
8991 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8995 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8997 RExC_parse++; /* so the <- from the vFAIL is after the offending
8999 vFAIL("Group name must start with a non-digit word character");
9001 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9002 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9003 if ( flags == REG_RSN_RETURN_NAME)
9005 else if (flags==REG_RSN_RETURN_DATA) {
9008 if ( ! sv_name ) /* should not happen*/
9009 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9010 if (RExC_paren_names)
9011 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9013 sv_dat = HeVAL(he_str);
9014 if ( ! sv_dat ) { /* Didn't find group */
9016 /* It might be a forward reference; we can't fail until we
9017 * know, by completing the parse to get all the groups, and
9019 if (ALL_PARENS_COUNTED) {
9020 vFAIL("Reference to nonexistent named group");
9023 REQUIRE_PARENS_PASS;
9029 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9030 (unsigned long) flags);
9033 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9034 if (RExC_lastparse!=RExC_parse) { \
9035 Perl_re_printf( aTHX_ "%s", \
9036 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9037 RExC_end - RExC_parse, 16, \
9039 PERL_PV_ESCAPE_UNI_DETECT | \
9040 PERL_PV_PRETTY_ELLIPSES | \
9041 PERL_PV_PRETTY_LTGT | \
9042 PERL_PV_ESCAPE_RE | \
9043 PERL_PV_PRETTY_EXACTSIZE \
9047 Perl_re_printf( aTHX_ "%16s",""); \
9049 if (RExC_lastnum!=RExC_emit) \
9050 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9052 Perl_re_printf( aTHX_ "|%4s",""); \
9053 Perl_re_printf( aTHX_ "|%*s%-4s", \
9054 (int)((depth*2)), "", \
9057 RExC_lastnum=RExC_emit; \
9058 RExC_lastparse=RExC_parse; \
9063 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9064 DEBUG_PARSE_MSG((funcname)); \
9065 Perl_re_printf( aTHX_ "%4s","\n"); \
9067 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9068 DEBUG_PARSE_MSG((funcname)); \
9069 Perl_re_printf( aTHX_ fmt "\n",args); \
9072 /* This section of code defines the inversion list object and its methods. The
9073 * interfaces are highly subject to change, so as much as possible is static to
9074 * this file. An inversion list is here implemented as a malloc'd C UV array
9075 * as an SVt_INVLIST scalar.
9077 * An inversion list for Unicode is an array of code points, sorted by ordinal
9078 * number. Each element gives the code point that begins a range that extends
9079 * up-to but not including the code point given by the next element. The final
9080 * element gives the first code point of a range that extends to the platform's
9081 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9082 * ...) give ranges whose code points are all in the inversion list. We say
9083 * that those ranges are in the set. The odd-numbered elements give ranges
9084 * whose code points are not in the inversion list, and hence not in the set.
9085 * Thus, element [0] is the first code point in the list. Element [1]
9086 * is the first code point beyond that not in the list; and element [2] is the
9087 * first code point beyond that that is in the list. In other words, the first
9088 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9089 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9090 * all code points in that range are not in the inversion list. The third
9091 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9092 * list, and so forth. Thus every element whose index is divisible by two
9093 * gives the beginning of a range that is in the list, and every element whose
9094 * index is not divisible by two gives the beginning of a range not in the
9095 * list. If the final element's index is divisible by two, the inversion list
9096 * extends to the platform's infinity; otherwise the highest code point in the
9097 * inversion list is the contents of that element minus 1.
9099 * A range that contains just a single code point N will look like
9101 * invlist[i+1] == N+1
9103 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9104 * impossible to represent, so element [i+1] is omitted. The single element
9106 * invlist[0] == UV_MAX
9107 * contains just UV_MAX, but is interpreted as matching to infinity.
9109 * Taking the complement (inverting) an inversion list is quite simple, if the
9110 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9111 * This implementation reserves an element at the beginning of each inversion
9112 * list to always contain 0; there is an additional flag in the header which
9113 * indicates if the list begins at the 0, or is offset to begin at the next
9114 * element. This means that the inversion list can be inverted without any
9115 * copying; just flip the flag.
9117 * More about inversion lists can be found in "Unicode Demystified"
9118 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9120 * The inversion list data structure is currently implemented as an SV pointing
9121 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9122 * array of UV whose memory management is automatically handled by the existing
9123 * facilities for SV's.
9125 * Some of the methods should always be private to the implementation, and some
9126 * should eventually be made public */
9128 /* The header definitions are in F<invlist_inline.h> */
9130 #ifndef PERL_IN_XSUB_RE
9132 PERL_STATIC_INLINE UV*
9133 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9135 /* Returns a pointer to the first element in the inversion list's array.
9136 * This is called upon initialization of an inversion list. Where the
9137 * array begins depends on whether the list has the code point U+0000 in it
9138 * or not. The other parameter tells it whether the code that follows this
9139 * call is about to put a 0 in the inversion list or not. The first
9140 * element is either the element reserved for 0, if TRUE, or the element
9141 * after it, if FALSE */
9143 bool* offset = get_invlist_offset_addr(invlist);
9144 UV* zero_addr = (UV *) SvPVX(invlist);
9146 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9149 assert(! _invlist_len(invlist));
9153 /* 1^1 = 0; 1^0 = 1 */
9154 *offset = 1 ^ will_have_0;
9155 return zero_addr + *offset;
9159 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9161 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9162 * steals the list from 'src', so 'src' is made to have a NULL list. This
9163 * is similar to what SvSetMagicSV() would do, if it were implemented on
9164 * inversion lists, though this routine avoids a copy */
9166 const UV src_len = _invlist_len(src);
9167 const bool src_offset = *get_invlist_offset_addr(src);
9168 const STRLEN src_byte_len = SvLEN(src);
9169 char * array = SvPVX(src);
9171 const int oldtainted = TAINT_get;
9173 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9175 assert(is_invlist(src));
9176 assert(is_invlist(dest));
9177 assert(! invlist_is_iterating(src));
9178 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9180 /* Make sure it ends in the right place with a NUL, as our inversion list
9181 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9183 array[src_byte_len - 1] = '\0';
9185 TAINT_NOT; /* Otherwise it breaks */
9186 sv_usepvn_flags(dest,
9190 /* This flag is documented to cause a copy to be avoided */
9191 SV_HAS_TRAILING_NUL);
9192 TAINT_set(oldtainted);
9197 /* Finish up copying over the other fields in an inversion list */
9198 *get_invlist_offset_addr(dest) = src_offset;
9199 invlist_set_len(dest, src_len, src_offset);
9200 *get_invlist_previous_index_addr(dest) = 0;
9201 invlist_iterfinish(dest);
9204 PERL_STATIC_INLINE IV*
9205 S_get_invlist_previous_index_addr(SV* invlist)
9207 /* Return the address of the IV that is reserved to hold the cached index
9209 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9211 assert(is_invlist(invlist));
9213 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9216 PERL_STATIC_INLINE IV
9217 S_invlist_previous_index(SV* const invlist)
9219 /* Returns cached index of previous search */
9221 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9223 return *get_invlist_previous_index_addr(invlist);
9226 PERL_STATIC_INLINE void
9227 S_invlist_set_previous_index(SV* const invlist, const IV index)
9229 /* Caches <index> for later retrieval */
9231 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9233 assert(index == 0 || index < (int) _invlist_len(invlist));
9235 *get_invlist_previous_index_addr(invlist) = index;
9238 PERL_STATIC_INLINE void
9239 S_invlist_trim(SV* invlist)
9241 /* Free the not currently-being-used space in an inversion list */
9243 /* But don't free up the space needed for the 0 UV that is always at the
9244 * beginning of the list, nor the trailing NUL */
9245 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9247 PERL_ARGS_ASSERT_INVLIST_TRIM;
9249 assert(is_invlist(invlist));
9251 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9254 PERL_STATIC_INLINE void
9255 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9257 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9259 assert(is_invlist(invlist));
9261 invlist_set_len(invlist, 0, 0);
9262 invlist_trim(invlist);
9265 #endif /* ifndef PERL_IN_XSUB_RE */
9267 PERL_STATIC_INLINE bool
9268 S_invlist_is_iterating(SV* const invlist)
9270 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9272 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9275 #ifndef PERL_IN_XSUB_RE
9277 PERL_STATIC_INLINE UV
9278 S_invlist_max(SV* const invlist)
9280 /* Returns the maximum number of elements storable in the inversion list's
9281 * array, without having to realloc() */
9283 PERL_ARGS_ASSERT_INVLIST_MAX;
9285 assert(is_invlist(invlist));
9287 /* Assumes worst case, in which the 0 element is not counted in the
9288 * inversion list, so subtracts 1 for that */
9289 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9290 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9291 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9295 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9297 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9299 /* First 1 is in case the zero element isn't in the list; second 1 is for
9301 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9302 invlist_set_len(invlist, 0, 0);
9304 /* Force iterinit() to be used to get iteration to work */
9305 invlist_iterfinish(invlist);
9307 *get_invlist_previous_index_addr(invlist) = 0;
9308 SvPOK_on(invlist); /* This allows B to extract the PV */
9312 Perl__new_invlist(pTHX_ IV initial_size)
9315 /* Return a pointer to a newly constructed inversion list, with enough
9316 * space to store 'initial_size' elements. If that number is negative, a
9317 * system default is used instead */
9321 if (initial_size < 0) {
9325 new_list = newSV_type(SVt_INVLIST);
9326 initialize_invlist_guts(new_list, initial_size);
9332 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9334 /* Return a pointer to a newly constructed inversion list, initialized to
9335 * point to <list>, which has to be in the exact correct inversion list
9336 * form, including internal fields. Thus this is a dangerous routine that
9337 * should not be used in the wrong hands. The passed in 'list' contains
9338 * several header fields at the beginning that are not part of the
9339 * inversion list body proper */
9341 const STRLEN length = (STRLEN) list[0];
9342 const UV version_id = list[1];
9343 const bool offset = cBOOL(list[2]);
9344 #define HEADER_LENGTH 3
9345 /* If any of the above changes in any way, you must change HEADER_LENGTH
9346 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9347 * perl -E 'say int(rand 2**31-1)'
9349 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9350 data structure type, so that one being
9351 passed in can be validated to be an
9352 inversion list of the correct vintage.
9355 SV* invlist = newSV_type(SVt_INVLIST);
9357 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9359 if (version_id != INVLIST_VERSION_ID) {
9360 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9363 /* The generated array passed in includes header elements that aren't part
9364 * of the list proper, so start it just after them */
9365 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9367 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9368 shouldn't touch it */
9370 *(get_invlist_offset_addr(invlist)) = offset;
9372 /* The 'length' passed to us is the physical number of elements in the
9373 * inversion list. But if there is an offset the logical number is one
9375 invlist_set_len(invlist, length - offset, offset);
9377 invlist_set_previous_index(invlist, 0);
9379 /* Initialize the iteration pointer. */
9380 invlist_iterfinish(invlist);
9382 SvREADONLY_on(invlist);
9389 S__append_range_to_invlist(pTHX_ SV* const invlist,
9390 const UV start, const UV end)
9392 /* Subject to change or removal. Append the range from 'start' to 'end' at
9393 * the end of the inversion list. The range must be above any existing
9397 UV max = invlist_max(invlist);
9398 UV len = _invlist_len(invlist);
9401 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9403 if (len == 0) { /* Empty lists must be initialized */
9404 offset = start != 0;
9405 array = _invlist_array_init(invlist, ! offset);
9408 /* Here, the existing list is non-empty. The current max entry in the
9409 * list is generally the first value not in the set, except when the
9410 * set extends to the end of permissible values, in which case it is
9411 * the first entry in that final set, and so this call is an attempt to
9412 * append out-of-order */
9414 UV final_element = len - 1;
9415 array = invlist_array(invlist);
9416 if ( array[final_element] > start
9417 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9419 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9420 array[final_element], start,
9421 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9424 /* Here, it is a legal append. If the new range begins 1 above the end
9425 * of the range below it, it is extending the range below it, so the
9426 * new first value not in the set is one greater than the newly
9427 * extended range. */
9428 offset = *get_invlist_offset_addr(invlist);
9429 if (array[final_element] == start) {
9430 if (end != UV_MAX) {
9431 array[final_element] = end + 1;
9434 /* But if the end is the maximum representable on the machine,
9435 * assume that infinity was actually what was meant. Just let
9436 * the range that this would extend to have no end */
9437 invlist_set_len(invlist, len - 1, offset);
9443 /* Here the new range doesn't extend any existing set. Add it */
9445 len += 2; /* Includes an element each for the start and end of range */
9447 /* If wll overflow the existing space, extend, which may cause the array to
9450 invlist_extend(invlist, len);
9452 /* Have to set len here to avoid assert failure in invlist_array() */
9453 invlist_set_len(invlist, len, offset);
9455 array = invlist_array(invlist);
9458 invlist_set_len(invlist, len, offset);
9461 /* The next item on the list starts the range, the one after that is
9462 * one past the new range. */
9463 array[len - 2] = start;
9464 if (end != UV_MAX) {
9465 array[len - 1] = end + 1;
9468 /* But if the end is the maximum representable on the machine, just let
9469 * the range have no end */
9470 invlist_set_len(invlist, len - 1, offset);
9475 Perl__invlist_search(SV* const invlist, const UV cp)
9477 /* Searches the inversion list for the entry that contains the input code
9478 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9479 * return value is the index into the list's array of the range that
9480 * contains <cp>, that is, 'i' such that
9481 * array[i] <= cp < array[i+1]
9486 IV high = _invlist_len(invlist);
9487 const IV highest_element = high - 1;
9490 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9492 /* If list is empty, return failure. */
9497 /* (We can't get the array unless we know the list is non-empty) */
9498 array = invlist_array(invlist);
9500 mid = invlist_previous_index(invlist);
9502 if (mid > highest_element) {
9503 mid = highest_element;
9506 /* <mid> contains the cache of the result of the previous call to this
9507 * function (0 the first time). See if this call is for the same result,
9508 * or if it is for mid-1. This is under the theory that calls to this
9509 * function will often be for related code points that are near each other.
9510 * And benchmarks show that caching gives better results. We also test
9511 * here if the code point is within the bounds of the list. These tests
9512 * replace others that would have had to be made anyway to make sure that
9513 * the array bounds were not exceeded, and these give us extra information
9514 * at the same time */
9515 if (cp >= array[mid]) {
9516 if (cp >= array[highest_element]) {
9517 return highest_element;
9520 /* Here, array[mid] <= cp < array[highest_element]. This means that
9521 * the final element is not the answer, so can exclude it; it also
9522 * means that <mid> is not the final element, so can refer to 'mid + 1'
9524 if (cp < array[mid + 1]) {
9530 else { /* cp < aray[mid] */
9531 if (cp < array[0]) { /* Fail if outside the array */
9535 if (cp >= array[mid - 1]) {
9540 /* Binary search. What we are looking for is <i> such that
9541 * array[i] <= cp < array[i+1]
9542 * The loop below converges on the i+1. Note that there may not be an
9543 * (i+1)th element in the array, and things work nonetheless */
9544 while (low < high) {
9545 mid = (low + high) / 2;
9546 assert(mid <= highest_element);
9547 if (array[mid] <= cp) { /* cp >= array[mid] */
9550 /* We could do this extra test to exit the loop early.
9551 if (cp < array[low]) {
9556 else { /* cp < array[mid] */
9563 invlist_set_previous_index(invlist, high);
9568 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9569 const bool complement_b, SV** output)
9571 /* Take the union of two inversion lists and point '*output' to it. On
9572 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9573 * even 'a' or 'b'). If to an inversion list, the contents of the original
9574 * list will be replaced by the union. The first list, 'a', may be
9575 * NULL, in which case a copy of the second list is placed in '*output'.
9576 * If 'complement_b' is TRUE, the union is taken of the complement
9577 * (inversion) of 'b' instead of b itself.
9579 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9580 * Richard Gillam, published by Addison-Wesley, and explained at some
9581 * length there. The preface says to incorporate its examples into your
9582 * code at your own risk.
9584 * The algorithm is like a merge sort. */
9586 const UV* array_a; /* a's array */
9588 UV len_a; /* length of a's array */
9591 SV* u; /* the resulting union */
9595 UV i_a = 0; /* current index into a's array */
9599 /* running count, as explained in the algorithm source book; items are
9600 * stopped accumulating and are output when the count changes to/from 0.
9601 * The count is incremented when we start a range that's in an input's set,
9602 * and decremented when we start a range that's not in a set. So this
9603 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9604 * and hence nothing goes into the union; 1, just one of the inputs is in
9605 * its set (and its current range gets added to the union); and 2 when both
9606 * inputs are in their sets. */
9609 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9611 assert(*output == NULL || is_invlist(*output));
9613 len_b = _invlist_len(b);
9616 /* Here, 'b' is empty, hence it's complement is all possible code
9617 * points. So if the union includes the complement of 'b', it includes
9618 * everything, and we need not even look at 'a'. It's easiest to
9619 * create a new inversion list that matches everything. */
9621 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9623 if (*output == NULL) { /* If the output didn't exist, just point it
9625 *output = everything;
9627 else { /* Otherwise, replace its contents with the new list */
9628 invlist_replace_list_destroys_src(*output, everything);
9629 SvREFCNT_dec_NN(everything);
9635 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9636 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9637 * output will be empty */
9639 if (a == NULL || _invlist_len(a) == 0) {
9640 if (*output == NULL) {
9641 *output = _new_invlist(0);
9644 invlist_clear(*output);
9649 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9650 * union. We can just return a copy of 'a' if '*output' doesn't point
9651 * to an existing list */
9652 if (*output == NULL) {
9653 *output = invlist_clone(a, NULL);
9657 /* If the output is to overwrite 'a', we have a no-op, as it's
9663 /* Here, '*output' is to be overwritten by 'a' */
9664 u = invlist_clone(a, NULL);
9665 invlist_replace_list_destroys_src(*output, u);
9671 /* Here 'b' is not empty. See about 'a' */
9673 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9675 /* Here, 'a' is empty (and b is not). That means the union will come
9676 * entirely from 'b'. If '*output' is NULL, we can directly return a
9677 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9680 SV ** dest = (*output == NULL) ? output : &u;
9681 *dest = invlist_clone(b, NULL);
9683 _invlist_invert(*dest);
9687 invlist_replace_list_destroys_src(*output, u);
9694 /* Here both lists exist and are non-empty */
9695 array_a = invlist_array(a);
9696 array_b = invlist_array(b);
9698 /* If are to take the union of 'a' with the complement of b, set it
9699 * up so are looking at b's complement. */
9702 /* To complement, we invert: if the first element is 0, remove it. To
9703 * do this, we just pretend the array starts one later */
9704 if (array_b[0] == 0) {
9710 /* But if the first element is not zero, we pretend the list starts
9711 * at the 0 that is always stored immediately before the array. */
9717 /* Size the union for the worst case: that the sets are completely
9719 u = _new_invlist(len_a + len_b);
9721 /* Will contain U+0000 if either component does */
9722 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9723 || (len_b > 0 && array_b[0] == 0));
9725 /* Go through each input list item by item, stopping when have exhausted
9727 while (i_a < len_a && i_b < len_b) {
9728 UV cp; /* The element to potentially add to the union's array */
9729 bool cp_in_set; /* is it in the input list's set or not */
9731 /* We need to take one or the other of the two inputs for the union.
9732 * Since we are merging two sorted lists, we take the smaller of the
9733 * next items. In case of a tie, we take first the one that is in its
9734 * set. If we first took the one not in its set, it would decrement
9735 * the count, possibly to 0 which would cause it to be output as ending
9736 * the range, and the next time through we would take the same number,
9737 * and output it again as beginning the next range. By doing it the
9738 * opposite way, there is no possibility that the count will be
9739 * momentarily decremented to 0, and thus the two adjoining ranges will
9740 * be seamlessly merged. (In a tie and both are in the set or both not
9741 * in the set, it doesn't matter which we take first.) */
9742 if ( array_a[i_a] < array_b[i_b]
9743 || ( array_a[i_a] == array_b[i_b]
9744 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9746 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9747 cp = array_a[i_a++];
9750 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9751 cp = array_b[i_b++];
9754 /* Here, have chosen which of the two inputs to look at. Only output
9755 * if the running count changes to/from 0, which marks the
9756 * beginning/end of a range that's in the set */
9759 array_u[i_u++] = cp;
9766 array_u[i_u++] = cp;
9772 /* The loop above increments the index into exactly one of the input lists
9773 * each iteration, and ends when either index gets to its list end. That
9774 * means the other index is lower than its end, and so something is
9775 * remaining in that one. We decrement 'count', as explained below, if
9776 * that list is in its set. (i_a and i_b each currently index the element
9777 * beyond the one we care about.) */
9778 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9779 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9784 /* Above we decremented 'count' if the list that had unexamined elements in
9785 * it was in its set. This has made it so that 'count' being non-zero
9786 * means there isn't anything left to output; and 'count' equal to 0 means
9787 * that what is left to output is precisely that which is left in the
9788 * non-exhausted input list.
9790 * To see why, note first that the exhausted input obviously has nothing
9791 * left to add to the union. If it was in its set at its end, that means
9792 * the set extends from here to the platform's infinity, and hence so does
9793 * the union and the non-exhausted set is irrelevant. The exhausted set
9794 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9795 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9796 * 'count' remains at 1. This is consistent with the decremented 'count'
9797 * != 0 meaning there's nothing left to add to the union.
9799 * But if the exhausted input wasn't in its set, it contributed 0 to
9800 * 'count', and the rest of the union will be whatever the other input is.
9801 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9802 * otherwise it gets decremented to 0. This is consistent with 'count'
9803 * == 0 meaning the remainder of the union is whatever is left in the
9804 * non-exhausted list. */
9809 IV copy_count = len_a - i_a;
9810 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9811 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9813 else { /* The non-exhausted input is b */
9814 copy_count = len_b - i_b;
9815 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9817 len_u = i_u + copy_count;
9820 /* Set the result to the final length, which can change the pointer to
9821 * array_u, so re-find it. (Note that it is unlikely that this will
9822 * change, as we are shrinking the space, not enlarging it) */
9823 if (len_u != _invlist_len(u)) {
9824 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9826 array_u = invlist_array(u);
9829 if (*output == NULL) { /* Simply return the new inversion list */
9833 /* Otherwise, overwrite the inversion list that was in '*output'. We
9834 * could instead free '*output', and then set it to 'u', but experience
9835 * has shown [perl #127392] that if the input is a mortal, we can get a
9836 * huge build-up of these during regex compilation before they get
9838 invlist_replace_list_destroys_src(*output, u);
9846 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9847 const bool complement_b, SV** i)
9849 /* Take the intersection of two inversion lists and point '*i' to it. On
9850 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9851 * even 'a' or 'b'). If to an inversion list, the contents of the original
9852 * list will be replaced by the intersection. The first list, 'a', may be
9853 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9854 * TRUE, the result will be the intersection of 'a' and the complement (or
9855 * inversion) of 'b' instead of 'b' directly.
9857 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9858 * Richard Gillam, published by Addison-Wesley, and explained at some
9859 * length there. The preface says to incorporate its examples into your
9860 * code at your own risk. In fact, it had bugs
9862 * The algorithm is like a merge sort, and is essentially the same as the
9866 const UV* array_a; /* a's array */
9868 UV len_a; /* length of a's array */
9871 SV* r; /* the resulting intersection */
9875 UV i_a = 0; /* current index into a's array */
9879 /* running count of how many of the two inputs are postitioned at ranges
9880 * that are in their sets. As explained in the algorithm source book,
9881 * items are stopped accumulating and are output when the count changes
9882 * to/from 2. The count is incremented when we start a range that's in an
9883 * input's set, and decremented when we start a range that's not in a set.
9884 * Only when it is 2 are we in the intersection. */
9887 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9889 assert(*i == NULL || is_invlist(*i));
9891 /* Special case if either one is empty */
9892 len_a = (a == NULL) ? 0 : _invlist_len(a);
9893 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9894 if (len_a != 0 && complement_b) {
9896 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9897 * must be empty. Here, also we are using 'b's complement, which
9898 * hence must be every possible code point. Thus the intersection
9901 if (*i == a) { /* No-op */
9906 *i = invlist_clone(a, NULL);
9910 r = invlist_clone(a, NULL);
9911 invlist_replace_list_destroys_src(*i, r);
9916 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9917 * intersection must be empty */
9919 *i = _new_invlist(0);
9927 /* Here both lists exist and are non-empty */
9928 array_a = invlist_array(a);
9929 array_b = invlist_array(b);
9931 /* If are to take the intersection of 'a' with the complement of b, set it
9932 * up so are looking at b's complement. */
9935 /* To complement, we invert: if the first element is 0, remove it. To
9936 * do this, we just pretend the array starts one later */
9937 if (array_b[0] == 0) {
9943 /* But if the first element is not zero, we pretend the list starts
9944 * at the 0 that is always stored immediately before the array. */
9950 /* Size the intersection for the worst case: that the intersection ends up
9951 * fragmenting everything to be completely disjoint */
9952 r= _new_invlist(len_a + len_b);
9954 /* Will contain U+0000 iff both components do */
9955 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9956 && len_b > 0 && array_b[0] == 0);
9958 /* Go through each list item by item, stopping when have exhausted one of
9960 while (i_a < len_a && i_b < len_b) {
9961 UV cp; /* The element to potentially add to the intersection's
9963 bool cp_in_set; /* Is it in the input list's set or not */
9965 /* We need to take one or the other of the two inputs for the
9966 * intersection. Since we are merging two sorted lists, we take the
9967 * smaller of the next items. In case of a tie, we take first the one
9968 * that is not in its set (a difference from the union algorithm). If
9969 * we first took the one in its set, it would increment the count,
9970 * possibly to 2 which would cause it to be output as starting a range
9971 * in the intersection, and the next time through we would take that
9972 * same number, and output it again as ending the set. By doing the
9973 * opposite of this, there is no possibility that the count will be
9974 * momentarily incremented to 2. (In a tie and both are in the set or
9975 * both not in the set, it doesn't matter which we take first.) */
9976 if ( array_a[i_a] < array_b[i_b]
9977 || ( array_a[i_a] == array_b[i_b]
9978 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9980 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9981 cp = array_a[i_a++];
9984 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9988 /* Here, have chosen which of the two inputs to look at. Only output
9989 * if the running count changes to/from 2, which marks the
9990 * beginning/end of a range that's in the intersection */
9994 array_r[i_r++] = cp;
9999 array_r[i_r++] = cp;
10006 /* The loop above increments the index into exactly one of the input lists
10007 * each iteration, and ends when either index gets to its list end. That
10008 * means the other index is lower than its end, and so something is
10009 * remaining in that one. We increment 'count', as explained below, if the
10010 * exhausted list was in its set. (i_a and i_b each currently index the
10011 * element beyond the one we care about.) */
10012 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10013 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10018 /* Above we incremented 'count' if the exhausted list was in its set. This
10019 * has made it so that 'count' being below 2 means there is nothing left to
10020 * output; otheriwse what's left to add to the intersection is precisely
10021 * that which is left in the non-exhausted input list.
10023 * To see why, note first that the exhausted input obviously has nothing
10024 * left to affect the intersection. If it was in its set at its end, that
10025 * means the set extends from here to the platform's infinity, and hence
10026 * anything in the non-exhausted's list will be in the intersection, and
10027 * anything not in it won't be. Hence, the rest of the intersection is
10028 * precisely what's in the non-exhausted list The exhausted set also
10029 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10030 * it means 'count' is now at least 2. This is consistent with the
10031 * incremented 'count' being >= 2 means to add the non-exhausted list to
10032 * the intersection.
10034 * But if the exhausted input wasn't in its set, it contributed 0 to
10035 * 'count', and the intersection can't include anything further; the
10036 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10037 * incremented. This is consistent with 'count' being < 2 meaning nothing
10038 * further to add to the intersection. */
10039 if (count < 2) { /* Nothing left to put in the intersection. */
10042 else { /* copy the non-exhausted list, unchanged. */
10043 IV copy_count = len_a - i_a;
10044 if (copy_count > 0) { /* a is the one with stuff left */
10045 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10047 else { /* b is the one with stuff left */
10048 copy_count = len_b - i_b;
10049 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10051 len_r = i_r + copy_count;
10054 /* Set the result to the final length, which can change the pointer to
10055 * array_r, so re-find it. (Note that it is unlikely that this will
10056 * change, as we are shrinking the space, not enlarging it) */
10057 if (len_r != _invlist_len(r)) {
10058 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10060 array_r = invlist_array(r);
10063 if (*i == NULL) { /* Simply return the calculated intersection */
10066 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10067 instead free '*i', and then set it to 'r', but experience has
10068 shown [perl #127392] that if the input is a mortal, we can get a
10069 huge build-up of these during regex compilation before they get
10072 invlist_replace_list_destroys_src(*i, r);
10077 SvREFCNT_dec_NN(r);
10084 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10086 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10087 * set. A pointer to the inversion list is returned. This may actually be
10088 * a new list, in which case the passed in one has been destroyed. The
10089 * passed-in inversion list can be NULL, in which case a new one is created
10090 * with just the one range in it. The new list is not necessarily
10091 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10092 * result of this function. The gain would not be large, and in many
10093 * cases, this is called multiple times on a single inversion list, so
10094 * anything freed may almost immediately be needed again.
10096 * This used to mostly call the 'union' routine, but that is much more
10097 * heavyweight than really needed for a single range addition */
10099 UV* array; /* The array implementing the inversion list */
10100 UV len; /* How many elements in 'array' */
10101 SSize_t i_s; /* index into the invlist array where 'start'
10103 SSize_t i_e = 0; /* And the index where 'end' should go */
10104 UV cur_highest; /* The highest code point in the inversion list
10105 upon entry to this function */
10107 /* This range becomes the whole inversion list if none already existed */
10108 if (invlist == NULL) {
10109 invlist = _new_invlist(2);
10110 _append_range_to_invlist(invlist, start, end);
10114 /* Likewise, if the inversion list is currently empty */
10115 len = _invlist_len(invlist);
10117 _append_range_to_invlist(invlist, start, end);
10121 /* Starting here, we have to know the internals of the list */
10122 array = invlist_array(invlist);
10124 /* If the new range ends higher than the current highest ... */
10125 cur_highest = invlist_highest(invlist);
10126 if (end > cur_highest) {
10128 /* If the whole range is higher, we can just append it */
10129 if (start > cur_highest) {
10130 _append_range_to_invlist(invlist, start, end);
10134 /* Otherwise, add the portion that is higher ... */
10135 _append_range_to_invlist(invlist, cur_highest + 1, end);
10137 /* ... and continue on below to handle the rest. As a result of the
10138 * above append, we know that the index of the end of the range is the
10139 * final even numbered one of the array. Recall that the final element
10140 * always starts a range that extends to infinity. If that range is in
10141 * the set (meaning the set goes from here to infinity), it will be an
10142 * even index, but if it isn't in the set, it's odd, and the final
10143 * range in the set is one less, which is even. */
10144 if (end == UV_MAX) {
10152 /* We have dealt with appending, now see about prepending. If the new
10153 * range starts lower than the current lowest ... */
10154 if (start < array[0]) {
10156 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10157 * Let the union code handle it, rather than having to know the
10158 * trickiness in two code places. */
10159 if (UNLIKELY(start == 0)) {
10162 range_invlist = _new_invlist(2);
10163 _append_range_to_invlist(range_invlist, start, end);
10165 _invlist_union(invlist, range_invlist, &invlist);
10167 SvREFCNT_dec_NN(range_invlist);
10172 /* If the whole new range comes before the first entry, and doesn't
10173 * extend it, we have to insert it as an additional range */
10174 if (end < array[0] - 1) {
10176 goto splice_in_new_range;
10179 /* Here the new range adjoins the existing first range, extending it
10183 /* And continue on below to handle the rest. We know that the index of
10184 * the beginning of the range is the first one of the array */
10187 else { /* Not prepending any part of the new range to the existing list.
10188 * Find where in the list it should go. This finds i_s, such that:
10189 * invlist[i_s] <= start < array[i_s+1]
10191 i_s = _invlist_search(invlist, start);
10194 /* At this point, any extending before the beginning of the inversion list
10195 * and/or after the end has been done. This has made it so that, in the
10196 * code below, each endpoint of the new range is either in a range that is
10197 * in the set, or is in a gap between two ranges that are. This means we
10198 * don't have to worry about exceeding the array bounds.
10200 * Find where in the list the new range ends (but we can skip this if we
10201 * have already determined what it is, or if it will be the same as i_s,
10202 * which we already have computed) */
10204 i_e = (start == end)
10206 : _invlist_search(invlist, end);
10209 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10210 * is a range that goes to infinity there is no element at invlist[i_e+1],
10211 * so only the first relation holds. */
10213 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10215 /* Here, the ranges on either side of the beginning of the new range
10216 * are in the set, and this range starts in the gap between them.
10218 * The new range extends the range above it downwards if the new range
10219 * ends at or above that range's start */
10220 const bool extends_the_range_above = ( end == UV_MAX
10221 || end + 1 >= array[i_s+1]);
10223 /* The new range extends the range below it upwards if it begins just
10224 * after where that range ends */
10225 if (start == array[i_s]) {
10227 /* If the new range fills the entire gap between the other ranges,
10228 * they will get merged together. Other ranges may also get
10229 * merged, depending on how many of them the new range spans. In
10230 * the general case, we do the merge later, just once, after we
10231 * figure out how many to merge. But in the case where the new
10232 * range exactly spans just this one gap (possibly extending into
10233 * the one above), we do the merge here, and an early exit. This
10234 * is done here to avoid having to special case later. */
10235 if (i_e - i_s <= 1) {
10237 /* If i_e - i_s == 1, it means that the new range terminates
10238 * within the range above, and hence 'extends_the_range_above'
10239 * must be true. (If the range above it extends to infinity,
10240 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10241 * will be 0, so no harm done.) */
10242 if (extends_the_range_above) {
10243 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10244 invlist_set_len(invlist,
10246 *(get_invlist_offset_addr(invlist)));
10250 /* Here, i_e must == i_s. We keep them in sync, as they apply
10251 * to the same range, and below we are about to decrement i_s
10256 /* Here, the new range is adjacent to the one below. (It may also
10257 * span beyond the range above, but that will get resolved later.)
10258 * Extend the range below to include this one. */
10259 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10261 start = array[i_s];
10263 else if (extends_the_range_above) {
10265 /* Here the new range only extends the range above it, but not the
10266 * one below. It merges with the one above. Again, we keep i_e
10267 * and i_s in sync if they point to the same range */
10272 array[i_s] = start;
10276 /* Here, we've dealt with the new range start extending any adjoining
10279 * If the new range extends to infinity, it is now the final one,
10280 * regardless of what was there before */
10281 if (UNLIKELY(end == UV_MAX)) {
10282 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10286 /* If i_e started as == i_s, it has also been dealt with,
10287 * and been updated to the new i_s, which will fail the following if */
10288 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10290 /* Here, the ranges on either side of the end of the new range are in
10291 * the set, and this range ends in the gap between them.
10293 * If this range is adjacent to (hence extends) the range above it, it
10294 * becomes part of that range; likewise if it extends the range below,
10295 * it becomes part of that range */
10296 if (end + 1 == array[i_e+1]) {
10298 array[i_e] = start;
10300 else if (start <= array[i_e]) {
10301 array[i_e] = end + 1;
10308 /* If the range fits entirely in an existing range (as possibly already
10309 * extended above), it doesn't add anything new */
10310 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10314 /* Here, no part of the range is in the list. Must add it. It will
10315 * occupy 2 more slots */
10316 splice_in_new_range:
10318 invlist_extend(invlist, len + 2);
10319 array = invlist_array(invlist);
10320 /* Move the rest of the array down two slots. Don't include any
10322 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10324 /* Do the actual splice */
10325 array[i_e+1] = start;
10326 array[i_e+2] = end + 1;
10327 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10331 /* Here the new range crossed the boundaries of a pre-existing range. The
10332 * code above has adjusted things so that both ends are in ranges that are
10333 * in the set. This means everything in between must also be in the set.
10334 * Just squash things together */
10335 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10336 invlist_set_len(invlist,
10338 *(get_invlist_offset_addr(invlist)));
10344 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10345 UV** other_elements_ptr)
10347 /* Create and return an inversion list whose contents are to be populated
10348 * by the caller. The caller gives the number of elements (in 'size') and
10349 * the very first element ('element0'). This function will set
10350 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10351 * are to be placed.
10353 * Obviously there is some trust involved that the caller will properly
10354 * fill in the other elements of the array.
10356 * (The first element needs to be passed in, as the underlying code does
10357 * things differently depending on whether it is zero or non-zero) */
10359 SV* invlist = _new_invlist(size);
10362 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10364 invlist = add_cp_to_invlist(invlist, element0);
10365 offset = *get_invlist_offset_addr(invlist);
10367 invlist_set_len(invlist, size, offset);
10368 *other_elements_ptr = invlist_array(invlist) + 1;
10374 #ifndef PERL_IN_XSUB_RE
10376 Perl__invlist_invert(pTHX_ SV* const invlist)
10378 /* Complement the input inversion list. This adds a 0 if the list didn't
10379 * have a zero; removes it otherwise. As described above, the data
10380 * structure is set up so that this is very efficient */
10382 PERL_ARGS_ASSERT__INVLIST_INVERT;
10384 assert(! invlist_is_iterating(invlist));
10386 /* The inverse of matching nothing is matching everything */
10387 if (_invlist_len(invlist) == 0) {
10388 _append_range_to_invlist(invlist, 0, UV_MAX);
10392 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10396 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10398 /* Return a new inversion list that is a copy of the input one, which is
10399 * unchanged. The new list will not be mortal even if the old one was. */
10401 const STRLEN nominal_length = _invlist_len(invlist);
10402 const STRLEN physical_length = SvCUR(invlist);
10403 const bool offset = *(get_invlist_offset_addr(invlist));
10405 PERL_ARGS_ASSERT_INVLIST_CLONE;
10407 if (new_invlist == NULL) {
10408 new_invlist = _new_invlist(nominal_length);
10411 sv_upgrade(new_invlist, SVt_INVLIST);
10412 initialize_invlist_guts(new_invlist, nominal_length);
10415 *(get_invlist_offset_addr(new_invlist)) = offset;
10416 invlist_set_len(new_invlist, nominal_length, offset);
10417 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10419 return new_invlist;
10424 PERL_STATIC_INLINE UV
10425 S_invlist_lowest(SV* const invlist)
10427 /* Returns the lowest code point that matches an inversion list. This API
10428 * has an ambiguity, as it returns 0 under either the lowest is actually
10429 * 0, or if the list is empty. If this distinction matters to you, check
10430 * for emptiness before calling this function */
10432 UV len = _invlist_len(invlist);
10435 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10441 array = invlist_array(invlist);
10447 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10449 /* Get the contents of an inversion list into a string SV so that they can
10450 * be printed out. If 'traditional_style' is TRUE, it uses the format
10451 * traditionally done for debug tracing; otherwise it uses a format
10452 * suitable for just copying to the output, with blanks between ranges and
10453 * a dash between range components */
10457 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10458 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10460 if (traditional_style) {
10461 output = newSVpvs("\n");
10464 output = newSVpvs("");
10467 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10469 assert(! invlist_is_iterating(invlist));
10471 invlist_iterinit(invlist);
10472 while (invlist_iternext(invlist, &start, &end)) {
10473 if (end == UV_MAX) {
10474 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10475 start, intra_range_delimiter,
10476 inter_range_delimiter);
10478 else if (end != start) {
10479 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10481 intra_range_delimiter,
10482 end, inter_range_delimiter);
10485 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10486 start, inter_range_delimiter);
10490 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10491 SvCUR_set(output, SvCUR(output) - 1);
10497 #ifndef PERL_IN_XSUB_RE
10499 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10500 const char * const indent, SV* const invlist)
10502 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10503 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10504 * the string 'indent'. The output looks like this:
10505 [0] 0x000A .. 0x000D
10507 [4] 0x2028 .. 0x2029
10508 [6] 0x3104 .. INFTY
10509 * This means that the first range of code points matched by the list are
10510 * 0xA through 0xD; the second range contains only the single code point
10511 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10512 * are used to define each range (except if the final range extends to
10513 * infinity, only a single element is needed). The array index of the
10514 * first element for the corresponding range is given in brackets. */
10519 PERL_ARGS_ASSERT__INVLIST_DUMP;
10521 if (invlist_is_iterating(invlist)) {
10522 Perl_dump_indent(aTHX_ level, file,
10523 "%sCan't dump inversion list because is in middle of iterating\n",
10528 invlist_iterinit(invlist);
10529 while (invlist_iternext(invlist, &start, &end)) {
10530 if (end == UV_MAX) {
10531 Perl_dump_indent(aTHX_ level, file,
10532 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10533 indent, (UV)count, start);
10535 else if (end != start) {
10536 Perl_dump_indent(aTHX_ level, file,
10537 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10538 indent, (UV)count, start, end);
10541 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10542 indent, (UV)count, start);
10550 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10552 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10554 /* Return a boolean as to if the two passed in inversion lists are
10555 * identical. The final argument, if TRUE, says to take the complement of
10556 * the second inversion list before doing the comparison */
10558 const UV len_a = _invlist_len(a);
10559 UV len_b = _invlist_len(b);
10561 const UV* array_a = NULL;
10562 const UV* array_b = NULL;
10564 PERL_ARGS_ASSERT__INVLISTEQ;
10566 /* This code avoids accessing the arrays unless it knows the length is
10571 return ! complement_b;
10575 array_a = invlist_array(a);
10579 array_b = invlist_array(b);
10582 /* If are to compare 'a' with the complement of b, set it
10583 * up so are looking at b's complement. */
10584 if (complement_b) {
10586 /* The complement of nothing is everything, so <a> would have to have
10587 * just one element, starting at zero (ending at infinity) */
10589 return (len_a == 1 && array_a[0] == 0);
10591 if (array_b[0] == 0) {
10593 /* Otherwise, to complement, we invert. Here, the first element is
10594 * 0, just remove it. To do this, we just pretend the array starts
10602 /* But if the first element is not zero, we pretend the list starts
10603 * at the 0 that is always stored immediately before the array. */
10609 return len_a == len_b
10610 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10616 * As best we can, determine the characters that can match the start of
10617 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10618 * can be false positive matches
10620 * Returns the invlist as a new SV*; it is the caller's responsibility to
10621 * call SvREFCNT_dec() when done with it.
10624 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10626 const U8 * s = (U8*)STRING(node);
10627 SSize_t bytelen = STR_LEN(node);
10629 /* Start out big enough for 2 separate code points */
10630 SV* invlist = _new_invlist(4);
10632 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10637 /* We punt and assume can match anything if the node begins
10638 * with a multi-character fold. Things are complicated. For
10639 * example, /ffi/i could match any of:
10640 * "\N{LATIN SMALL LIGATURE FFI}"
10641 * "\N{LATIN SMALL LIGATURE FF}I"
10642 * "F\N{LATIN SMALL LIGATURE FI}"
10643 * plus several other things; and making sure we have all the
10644 * possibilities is hard. */
10645 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10646 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10649 /* Any Latin1 range character can potentially match any
10650 * other depending on the locale, and in Turkic locales, U+130 and
10652 if (OP(node) == EXACTFL) {
10653 _invlist_union(invlist, PL_Latin1, &invlist);
10654 invlist = add_cp_to_invlist(invlist,
10655 LATIN_SMALL_LETTER_DOTLESS_I);
10656 invlist = add_cp_to_invlist(invlist,
10657 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10660 /* But otherwise, it matches at least itself. We can
10661 * quickly tell if it has a distinct fold, and if so,
10662 * it matches that as well */
10663 invlist = add_cp_to_invlist(invlist, uc);
10664 if (IS_IN_SOME_FOLD_L1(uc))
10665 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10668 /* Some characters match above-Latin1 ones under /i. This
10669 * is true of EXACTFL ones when the locale is UTF-8 */
10670 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10671 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
10672 EXACTFAA_NO_TRIE)))
10674 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10678 else { /* Pattern is UTF-8 */
10679 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10680 const U8* e = s + bytelen;
10683 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10685 /* The only code points that aren't folded in a UTF EXACTFish
10686 * node are the problematic ones in EXACTFL nodes */
10687 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10688 /* We need to check for the possibility that this EXACTFL
10689 * node begins with a multi-char fold. Therefore we fold
10690 * the first few characters of it so that we can make that
10696 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10698 *(d++) = (U8) toFOLD(*s);
10699 if (fc < 0) { /* Save the first fold */
10706 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10707 if (fc < 0) { /* Save the first fold */
10715 /* And set up so the code below that looks in this folded
10716 * buffer instead of the node's string */
10721 /* When we reach here 's' points to the fold of the first
10722 * character(s) of the node; and 'e' points to far enough along
10723 * the folded string to be just past any possible multi-char
10726 * Like the non-UTF case above, we punt if the node begins with a
10727 * multi-char fold */
10729 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10730 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10732 else { /* Single char fold */
10735 const U32 * remaining_folds;
10736 Size_t folds_count;
10738 /* It matches itself */
10739 invlist = add_cp_to_invlist(invlist, fc);
10741 /* ... plus all the things that fold to it, which are found in
10742 * PL_utf8_foldclosures */
10743 folds_count = _inverse_folds(fc, &first_fold,
10745 for (k = 0; k < folds_count; k++) {
10746 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10748 /* /aa doesn't allow folds between ASCII and non- */
10749 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
10750 && isASCII(c) != isASCII(fc))
10755 invlist = add_cp_to_invlist(invlist, c);
10758 if (OP(node) == EXACTFL) {
10760 /* If either [iI] are present in an EXACTFL node the above code
10761 * should have added its normal case pair, but under a Turkish
10762 * locale they could match instead the case pairs from it. Add
10763 * those as potential matches as well */
10764 if (isALPHA_FOLD_EQ(fc, 'I')) {
10765 invlist = add_cp_to_invlist(invlist,
10766 LATIN_SMALL_LETTER_DOTLESS_I);
10767 invlist = add_cp_to_invlist(invlist,
10768 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10770 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10771 invlist = add_cp_to_invlist(invlist, 'I');
10773 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10774 invlist = add_cp_to_invlist(invlist, 'i');
10783 #undef HEADER_LENGTH
10784 #undef TO_INTERNAL_SIZE
10785 #undef FROM_INTERNAL_SIZE
10786 #undef INVLIST_VERSION_ID
10788 /* End of inversion list object */
10791 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10793 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10794 * constructs, and updates RExC_flags with them. On input, RExC_parse
10795 * should point to the first flag; it is updated on output to point to the
10796 * final ')' or ':'. There needs to be at least one flag, or this will
10799 /* for (?g), (?gc), and (?o) warnings; warning
10800 about (?c) will warn about (?g) -- japhy */
10802 #define WASTED_O 0x01
10803 #define WASTED_G 0x02
10804 #define WASTED_C 0x04
10805 #define WASTED_GC (WASTED_G|WASTED_C)
10806 I32 wastedflags = 0x00;
10807 U32 posflags = 0, negflags = 0;
10808 U32 *flagsp = &posflags;
10809 char has_charset_modifier = '\0';
10811 bool has_use_defaults = FALSE;
10812 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10813 int x_mod_count = 0;
10815 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10817 /* '^' as an initial flag sets certain defaults */
10818 if (UCHARAT(RExC_parse) == '^') {
10820 has_use_defaults = TRUE;
10821 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10822 cs = (RExC_uni_semantics)
10823 ? REGEX_UNICODE_CHARSET
10824 : REGEX_DEPENDS_CHARSET;
10825 set_regex_charset(&RExC_flags, cs);
10828 cs = get_regex_charset(RExC_flags);
10829 if ( cs == REGEX_DEPENDS_CHARSET
10830 && RExC_uni_semantics)
10832 cs = REGEX_UNICODE_CHARSET;
10836 while (RExC_parse < RExC_end) {
10837 /* && memCHRs("iogcmsx", *RExC_parse) */
10838 /* (?g), (?gc) and (?o) are useless here
10839 and must be globally applied -- japhy */
10840 if ((RExC_pm_flags & PMf_WILDCARD)) {
10841 if (flagsp == & negflags) {
10842 if (*RExC_parse == 'm') {
10844 /* diag_listed_as: Use of %s is not allowed in Unicode
10845 property wildcard subpatterns in regex; marked by <--
10847 vFAIL("Use of modifier '-m' is not allowed in Unicode"
10848 " property wildcard subpatterns");
10852 if (*RExC_parse == 's') {
10853 goto modifier_illegal_in_wildcard;
10858 switch (*RExC_parse) {
10860 /* Code for the imsxn flags */
10861 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10863 case LOCALE_PAT_MOD:
10864 if (has_charset_modifier) {
10865 goto excess_modifier;
10867 else if (flagsp == &negflags) {
10870 cs = REGEX_LOCALE_CHARSET;
10871 has_charset_modifier = LOCALE_PAT_MOD;
10873 case UNICODE_PAT_MOD:
10874 if (has_charset_modifier) {
10875 goto excess_modifier;
10877 else if (flagsp == &negflags) {
10880 cs = REGEX_UNICODE_CHARSET;
10881 has_charset_modifier = UNICODE_PAT_MOD;
10883 case ASCII_RESTRICT_PAT_MOD:
10884 if (flagsp == &negflags) {
10887 if (has_charset_modifier) {
10888 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10889 goto excess_modifier;
10891 /* Doubled modifier implies more restricted */
10892 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10895 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10897 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10899 case DEPENDS_PAT_MOD:
10900 if (has_use_defaults) {
10901 goto fail_modifiers;
10903 else if (flagsp == &negflags) {
10906 else if (has_charset_modifier) {
10907 goto excess_modifier;
10910 /* The dual charset means unicode semantics if the
10911 * pattern (or target, not known until runtime) are
10912 * utf8, or something in the pattern indicates unicode
10914 cs = (RExC_uni_semantics)
10915 ? REGEX_UNICODE_CHARSET
10916 : REGEX_DEPENDS_CHARSET;
10917 has_charset_modifier = DEPENDS_PAT_MOD;
10921 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10922 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10924 else if (has_charset_modifier == *(RExC_parse - 1)) {
10925 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10926 *(RExC_parse - 1));
10929 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10931 NOT_REACHED; /*NOTREACHED*/
10934 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10935 *(RExC_parse - 1));
10936 NOT_REACHED; /*NOTREACHED*/
10937 case GLOBAL_PAT_MOD: /* 'g' */
10938 if (RExC_pm_flags & PMf_WILDCARD) {
10939 goto modifier_illegal_in_wildcard;
10942 case ONCE_PAT_MOD: /* 'o' */
10943 if (ckWARN(WARN_REGEXP)) {
10944 const I32 wflagbit = *RExC_parse == 'o'
10947 if (! (wastedflags & wflagbit) ) {
10948 wastedflags |= wflagbit;
10949 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10952 "Useless (%s%c) - %suse /%c modifier",
10953 flagsp == &negflags ? "?-" : "?",
10955 flagsp == &negflags ? "don't " : "",
10962 case CONTINUE_PAT_MOD: /* 'c' */
10963 if (RExC_pm_flags & PMf_WILDCARD) {
10964 goto modifier_illegal_in_wildcard;
10966 if (ckWARN(WARN_REGEXP)) {
10967 if (! (wastedflags & WASTED_C) ) {
10968 wastedflags |= WASTED_GC;
10969 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10972 "Useless (%sc) - %suse /gc modifier",
10973 flagsp == &negflags ? "?-" : "?",
10974 flagsp == &negflags ? "don't " : ""
10979 case KEEPCOPY_PAT_MOD: /* 'p' */
10980 if (RExC_pm_flags & PMf_WILDCARD) {
10981 goto modifier_illegal_in_wildcard;
10983 if (flagsp == &negflags) {
10984 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10986 *flagsp |= RXf_PMf_KEEPCOPY;
10990 /* A flag is a default iff it is following a minus, so
10991 * if there is a minus, it means will be trying to
10992 * re-specify a default which is an error */
10993 if (has_use_defaults || flagsp == &negflags) {
10994 goto fail_modifiers;
10996 flagsp = &negflags;
10997 wastedflags = 0; /* reset so (?g-c) warns twice */
11003 if ( (RExC_pm_flags & PMf_WILDCARD)
11004 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11007 /* diag_listed_as: Use of %s is not allowed in Unicode
11008 property wildcard subpatterns in regex; marked by <--
11010 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11011 " property wildcard subpatterns",
11012 has_charset_modifier);
11015 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11016 negflags |= RXf_PMf_EXTENDED_MORE;
11018 RExC_flags |= posflags;
11020 if (negflags & RXf_PMf_EXTENDED) {
11021 negflags |= RXf_PMf_EXTENDED_MORE;
11023 RExC_flags &= ~negflags;
11024 set_regex_charset(&RExC_flags, cs);
11029 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11030 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11031 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11032 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11033 NOT_REACHED; /*NOTREACHED*/
11036 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11039 vFAIL("Sequence (?... not terminated");
11041 modifier_illegal_in_wildcard:
11043 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11044 subpatterns in regex; marked by <-- HERE in m/%s/ */
11045 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11046 " subpatterns", *(RExC_parse - 1));
11050 - reg - regular expression, i.e. main body or parenthesized thing
11052 * Caller must absorb opening parenthesis.
11054 * Combining parenthesis handling with the base level of regular expression
11055 * is a trifle forced, but the need to tie the tails of the branches to what
11056 * follows makes it hard to avoid.
11058 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11060 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11062 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11065 STATIC regnode_offset
11066 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11068 char * parse_start,
11072 regnode_offset ret;
11073 char* name_start = RExC_parse;
11075 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11076 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11078 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11080 if (RExC_parse == name_start || *RExC_parse != ch) {
11081 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11082 vFAIL2("Sequence %.3s... not terminated", parse_start);
11086 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11087 RExC_rxi->data->data[num]=(void*)sv_dat;
11088 SvREFCNT_inc_simple_void_NN(sv_dat);
11091 ret = reganode(pRExC_state,
11094 : (ASCII_FOLD_RESTRICTED)
11096 : (AT_LEAST_UNI_SEMANTICS)
11102 *flagp |= HASWIDTH;
11104 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11105 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11107 nextchar(pRExC_state);
11111 /* On success, returns the offset at which any next node should be placed into
11112 * the regex engine program being compiled.
11114 * Returns 0 otherwise, with *flagp set to indicate why:
11115 * TRYAGAIN at the end of (?) that only sets flags.
11116 * RESTART_PARSE if the parse needs to be restarted, or'd with
11117 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11118 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11120 STATIC regnode_offset
11121 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11122 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11123 * 2 is like 1, but indicates that nextchar() has been called to advance
11124 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11125 * this flag alerts us to the need to check for that */
11127 regnode_offset ret = 0; /* Will be the head of the group. */
11129 regnode_offset lastbr;
11130 regnode_offset ender = 0;
11133 U32 oregflags = RExC_flags;
11134 bool have_branch = 0;
11136 I32 freeze_paren = 0;
11137 I32 after_freeze = 0;
11138 I32 num; /* numeric backreferences */
11139 SV * max_open; /* Max number of unclosed parens */
11141 char * parse_start = RExC_parse; /* MJD */
11142 char * const oregcomp_parse = RExC_parse;
11144 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11146 PERL_ARGS_ASSERT_REG;
11147 DEBUG_PARSE("reg ");
11149 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11151 if (!SvIOK(max_open)) {
11152 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11154 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11156 vFAIL("Too many nested open parens");
11159 *flagp = 0; /* Initialize. */
11161 if (RExC_in_lookbehind) {
11162 RExC_in_lookbehind++;
11164 if (RExC_in_lookahead) {
11165 RExC_in_lookahead++;
11168 /* Having this true makes it feasible to have a lot fewer tests for the
11169 * parse pointer being in scope. For example, we can write
11170 * while(isFOO(*RExC_parse)) RExC_parse++;
11172 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11174 assert(*RExC_end == '\0');
11176 /* Make an OPEN node, if parenthesized. */
11179 /* Under /x, space and comments can be gobbled up between the '(' and
11180 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11181 * intervening space, as the sequence is a token, and a token should be
11183 bool has_intervening_patws = (paren == 2)
11184 && *(RExC_parse - 1) != '(';
11186 if (RExC_parse >= RExC_end) {
11187 vFAIL("Unmatched (");
11190 if (paren == 'r') { /* Atomic script run */
11194 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11195 char *start_verb = RExC_parse + 1;
11197 char *start_arg = NULL;
11198 unsigned char op = 0;
11199 int arg_required = 0;
11200 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11201 bool has_upper = FALSE;
11203 if (has_intervening_patws) {
11204 RExC_parse++; /* past the '*' */
11206 /* For strict backwards compatibility, don't change the message
11207 * now that we also have lowercase operands */
11208 if (isUPPER(*RExC_parse)) {
11209 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11212 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11215 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11216 if ( *RExC_parse == ':' ) {
11217 start_arg = RExC_parse + 1;
11221 if (isUPPER(*RExC_parse)) {
11227 RExC_parse += UTF8SKIP(RExC_parse);
11230 verb_len = RExC_parse - start_verb;
11232 if (RExC_parse >= RExC_end) {
11233 goto unterminated_verb_pattern;
11236 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11237 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11238 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11240 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11241 unterminated_verb_pattern:
11243 vFAIL("Unterminated verb pattern argument");
11246 vFAIL("Unterminated '(*...' argument");
11250 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11252 vFAIL("Unterminated verb pattern");
11255 vFAIL("Unterminated '(*...' construct");
11260 /* Here, we know that RExC_parse < RExC_end */
11262 switch ( *start_verb ) {
11263 case 'A': /* (*ACCEPT) */
11264 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11266 internal_argval = RExC_nestroot;
11269 case 'C': /* (*COMMIT) */
11270 if ( memEQs(start_verb, verb_len,"COMMIT") )
11273 case 'F': /* (*FAIL) */
11274 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11278 case ':': /* (*:NAME) */
11279 case 'M': /* (*MARK:NAME) */
11280 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11285 case 'P': /* (*PRUNE) */
11286 if ( memEQs(start_verb, verb_len,"PRUNE") )
11289 case 'S': /* (*SKIP) */
11290 if ( memEQs(start_verb, verb_len,"SKIP") )
11293 case 'T': /* (*THEN) */
11294 /* [19:06] <TimToady> :: is then */
11295 if ( memEQs(start_verb, verb_len,"THEN") ) {
11297 RExC_seen |= REG_CUTGROUP_SEEN;
11301 if ( memEQs(start_verb, verb_len, "asr")
11302 || memEQs(start_verb, verb_len, "atomic_script_run"))
11304 paren = 'r'; /* Mnemonic: recursed run */
11307 else if (memEQs(start_verb, verb_len, "atomic")) {
11308 paren = 't'; /* AtOMIC */
11309 goto alpha_assertions;
11313 if ( memEQs(start_verb, verb_len, "plb")
11314 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11317 goto lookbehind_alpha_assertions;
11319 else if ( memEQs(start_verb, verb_len, "pla")
11320 || memEQs(start_verb, verb_len, "positive_lookahead"))
11323 goto alpha_assertions;
11327 if ( memEQs(start_verb, verb_len, "nlb")
11328 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11331 goto lookbehind_alpha_assertions;
11333 else if ( memEQs(start_verb, verb_len, "nla")
11334 || memEQs(start_verb, verb_len, "negative_lookahead"))
11337 goto alpha_assertions;
11341 if ( memEQs(start_verb, verb_len, "sr")
11342 || memEQs(start_verb, verb_len, "script_run"))
11344 regnode_offset atomic;
11350 /* This indicates Unicode rules. */
11351 REQUIRE_UNI_RULES(flagp, 0);
11357 RExC_parse = start_arg;
11359 if (RExC_in_script_run) {
11361 /* Nested script runs are treated as no-ops, because
11362 * if the nested one fails, the outer one must as
11363 * well. It could fail sooner, and avoid (??{} with
11364 * side effects, but that is explicitly documented as
11365 * undefined behavior. */
11369 if (paren == 's') {
11374 /* But, the atomic part of a nested atomic script run
11375 * isn't a no-op, but can be treated just like a '(?>'
11381 if (paren == 's') {
11382 /* Here, we're starting a new regular script run */
11383 ret = reg_node(pRExC_state, SROPEN);
11384 RExC_in_script_run = 1;
11389 /* Here, we are starting an atomic script run. This is
11390 * handled by recursing to deal with the atomic portion
11391 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11393 ret = reg_node(pRExC_state, SROPEN);
11395 RExC_in_script_run = 1;
11397 atomic = reg(pRExC_state, 'r', &flags, depth);
11398 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11399 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11403 if (! REGTAIL(pRExC_state, ret, atomic)) {
11404 REQUIRE_BRANCHJ(flagp, 0);
11407 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11410 REQUIRE_BRANCHJ(flagp, 0);
11413 RExC_in_script_run = 0;
11419 lookbehind_alpha_assertions:
11420 RExC_seen |= REG_LOOKBEHIND_SEEN;
11421 RExC_in_lookbehind++;
11426 RExC_seen_zerolen++;
11432 /* An empty negative lookahead assertion simply is failure */
11433 if (paren == 'A' && RExC_parse == start_arg) {
11434 ret=reganode(pRExC_state, OPFAIL, 0);
11435 nextchar(pRExC_state);
11439 RExC_parse = start_arg;
11444 "'(*%" UTF8f "' requires a terminating ':'",
11445 UTF8fARG(UTF, verb_len, start_verb));
11446 NOT_REACHED; /*NOTREACHED*/
11448 } /* End of switch */
11451 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11453 if (has_upper || verb_len == 0) {
11455 "Unknown verb pattern '%" UTF8f "'",
11456 UTF8fARG(UTF, verb_len, start_verb));
11460 "Unknown '(*...)' construct '%" UTF8f "'",
11461 UTF8fARG(UTF, verb_len, start_verb));
11464 if ( RExC_parse == start_arg ) {
11467 if ( arg_required && !start_arg ) {
11468 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11469 (int) verb_len, start_verb);
11471 if (internal_argval == -1) {
11472 ret = reganode(pRExC_state, op, 0);
11474 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11476 RExC_seen |= REG_VERBARG_SEEN;
11478 SV *sv = newSVpvn( start_arg,
11479 RExC_parse - start_arg);
11480 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11481 STR_WITH_LEN("S"));
11482 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11483 FLAGS(REGNODE_p(ret)) = 1;
11485 FLAGS(REGNODE_p(ret)) = 0;
11487 if ( internal_argval != -1 )
11488 ARG2L_SET(REGNODE_p(ret), internal_argval);
11489 nextchar(pRExC_state);
11492 else if (*RExC_parse == '?') { /* (?...) */
11493 bool is_logical = 0;
11494 const char * const seqstart = RExC_parse;
11495 const char * endptr;
11496 const char non_existent_group_msg[]
11497 = "Reference to nonexistent group";
11498 const char impossible_group[] = "Invalid reference to group";
11500 if (has_intervening_patws) {
11502 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11505 RExC_parse++; /* past the '?' */
11506 paren = *RExC_parse; /* might be a trailing NUL, if not
11508 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11509 if (RExC_parse > RExC_end) {
11512 ret = 0; /* For look-ahead/behind. */
11515 case 'P': /* (?P...) variants for those used to PCRE/Python */
11516 paren = *RExC_parse;
11517 if ( paren == '<') { /* (?P<...>) named capture */
11519 if (RExC_parse >= RExC_end) {
11520 vFAIL("Sequence (?P<... not terminated");
11522 goto named_capture;
11524 else if (paren == '>') { /* (?P>name) named recursion */
11526 if (RExC_parse >= RExC_end) {
11527 vFAIL("Sequence (?P>... not terminated");
11529 goto named_recursion;
11531 else if (paren == '=') { /* (?P=...) named backref */
11533 return handle_named_backref(pRExC_state, flagp,
11536 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11537 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11538 vFAIL3("Sequence (%.*s...) not recognized",
11539 (int) (RExC_parse - seqstart), seqstart);
11540 NOT_REACHED; /*NOTREACHED*/
11541 case '<': /* (?<...) */
11542 /* If you want to support (?<*...), first reconcile with GH #17363 */
11543 if (*RExC_parse == '!')
11545 else if (*RExC_parse != '=')
11552 case '\'': /* (?'...') */
11553 name_start = RExC_parse;
11554 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11555 if ( RExC_parse == name_start
11556 || RExC_parse >= RExC_end
11557 || *RExC_parse != paren)
11559 vFAIL2("Sequence (?%c... not terminated",
11560 paren=='>' ? '<' : (char) paren);
11565 if (!svname) /* shouldn't happen */
11567 "panic: reg_scan_name returned NULL");
11568 if (!RExC_paren_names) {
11569 RExC_paren_names= newHV();
11570 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11572 RExC_paren_name_list= newAV();
11573 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11576 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11578 sv_dat = HeVAL(he_str);
11580 /* croak baby croak */
11582 "panic: paren_name hash element allocation failed");
11583 } else if ( SvPOK(sv_dat) ) {
11584 /* (?|...) can mean we have dupes so scan to check
11585 its already been stored. Maybe a flag indicating
11586 we are inside such a construct would be useful,
11587 but the arrays are likely to be quite small, so
11588 for now we punt -- dmq */
11589 IV count = SvIV(sv_dat);
11590 I32 *pv = (I32*)SvPVX(sv_dat);
11592 for ( i = 0 ; i < count ; i++ ) {
11593 if ( pv[i] == RExC_npar ) {
11599 pv = (I32*)SvGROW(sv_dat,
11600 SvCUR(sv_dat) + sizeof(I32)+1);
11601 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11602 pv[count] = RExC_npar;
11603 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11606 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11607 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11610 SvIV_set(sv_dat, 1);
11613 /* Yes this does cause a memory leak in debugging Perls
11615 if (!av_store(RExC_paren_name_list,
11616 RExC_npar, SvREFCNT_inc_NN(svname)))
11617 SvREFCNT_dec_NN(svname);
11620 /*sv_dump(sv_dat);*/
11622 nextchar(pRExC_state);
11624 goto capturing_parens;
11627 RExC_seen |= REG_LOOKBEHIND_SEEN;
11628 RExC_in_lookbehind++;
11630 if (RExC_parse >= RExC_end) {
11631 vFAIL("Sequence (?... not terminated");
11633 RExC_seen_zerolen++;
11635 case '=': /* (?=...) */
11636 RExC_seen_zerolen++;
11637 RExC_in_lookahead++;
11639 case '!': /* (?!...) */
11640 RExC_seen_zerolen++;
11641 /* check if we're really just a "FAIL" assertion */
11642 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11643 FALSE /* Don't force to /x */ );
11644 if (*RExC_parse == ')') {
11645 ret=reganode(pRExC_state, OPFAIL, 0);
11646 nextchar(pRExC_state);
11650 case '|': /* (?|...) */
11651 /* branch reset, behave like a (?:...) except that
11652 buffers in alternations share the same numbers */
11654 after_freeze = freeze_paren = RExC_npar;
11656 /* XXX This construct currently requires an extra pass.
11657 * Investigation would be required to see if that could be
11659 REQUIRE_PARENS_PASS;
11661 case ':': /* (?:...) */
11662 case '>': /* (?>...) */
11664 case '$': /* (?$...) */
11665 case '@': /* (?@...) */
11666 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11668 case '0' : /* (?0) */
11669 case 'R' : /* (?R) */
11670 if (RExC_parse == RExC_end || *RExC_parse != ')')
11671 FAIL("Sequence (?R) not terminated");
11673 RExC_seen |= REG_RECURSE_SEEN;
11675 /* XXX These constructs currently require an extra pass.
11676 * It probably could be changed */
11677 REQUIRE_PARENS_PASS;
11679 *flagp |= POSTPONED;
11680 goto gen_recurse_regop;
11682 /* named and numeric backreferences */
11683 case '&': /* (?&NAME) */
11684 parse_start = RExC_parse - 1;
11687 SV *sv_dat = reg_scan_name(pRExC_state,
11688 REG_RSN_RETURN_DATA);
11689 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11691 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11692 vFAIL("Sequence (?&... not terminated");
11693 goto gen_recurse_regop;
11696 if (! inRANGE(RExC_parse[0], '1', '9')) {
11698 vFAIL("Illegal pattern");
11700 goto parse_recursion;
11702 case '-': /* (?-1) */
11703 if (! inRANGE(RExC_parse[0], '1', '9')) {
11704 RExC_parse--; /* rewind to let it be handled later */
11708 case '1': case '2': case '3': case '4': /* (?1) */
11709 case '5': case '6': case '7': case '8': case '9':
11710 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11713 bool is_neg = FALSE;
11715 parse_start = RExC_parse - 1; /* MJD */
11716 if (*RExC_parse == '-') {
11721 if (grok_atoUV(RExC_parse, &unum, &endptr)
11725 RExC_parse = (char*)endptr;
11727 else { /* Overflow, or something like that. Position
11728 beyond all digits for the message */
11729 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
11732 vFAIL(impossible_group);
11735 /* -num is always representable on 1 and 2's complement
11740 if (*RExC_parse!=')')
11741 vFAIL("Expecting close bracket");
11744 if (paren == '-' || paren == '+') {
11746 /* Don't overflow */
11747 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11749 vFAIL(impossible_group);
11753 Diagram of capture buffer numbering.
11754 Top line is the normal capture buffer numbers
11755 Bottom line is the negative indexing as from
11759 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11760 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11761 - 5 4 3 2 1 X Y x x
11763 Resolve to absolute group. Recall that RExC_npar is +1 of
11764 the actual parenthesis group number. For lookahead, we
11765 have to compensate for that. Using the above example, when
11766 we get to Y in the parse, num is 2 and RExC_npar is 6. We
11767 want 7 for +2, and 4 for -2.
11769 if ( paren == '+' ) {
11775 if (paren == '-' && num < 1) {
11777 vFAIL(non_existent_group_msg);
11781 if (num >= RExC_npar) {
11783 /* It might be a forward reference; we can't fail until we
11784 * know, by completing the parse to get all the groups, and
11785 * then reparsing */
11786 if (ALL_PARENS_COUNTED) {
11787 if (num >= RExC_total_parens) {
11789 vFAIL(non_existent_group_msg);
11793 REQUIRE_PARENS_PASS;
11797 /* We keep track how many GOSUB items we have produced.
11798 To start off the ARG2L() of the GOSUB holds its "id",
11799 which is used later in conjunction with RExC_recurse
11800 to calculate the offset we need to jump for the GOSUB,
11801 which it will store in the final representation.
11802 We have to defer the actual calculation until much later
11803 as the regop may move.
11805 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11806 RExC_recurse_count++;
11807 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11808 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11809 22, "| |", (int)(depth * 2 + 1), "",
11810 (UV)ARG(REGNODE_p(ret)),
11811 (IV)ARG2L(REGNODE_p(ret))));
11812 RExC_seen |= REG_RECURSE_SEEN;
11814 Set_Node_Length(REGNODE_p(ret),
11815 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11816 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11818 *flagp |= POSTPONED;
11819 assert(*RExC_parse == ')');
11820 nextchar(pRExC_state);
11825 case '?': /* (??...) */
11827 if (*RExC_parse != '{') {
11828 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11829 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11831 "Sequence (%" UTF8f "...) not recognized",
11832 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11833 NOT_REACHED; /*NOTREACHED*/
11835 *flagp |= POSTPONED;
11839 case '{': /* (?{...}) */
11842 struct reg_code_block *cb;
11845 RExC_seen_zerolen++;
11847 if ( !pRExC_state->code_blocks
11848 || pRExC_state->code_index
11849 >= pRExC_state->code_blocks->count
11850 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11851 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11854 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11855 FAIL("panic: Sequence (?{...}): no code block found\n");
11856 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11858 /* this is a pre-compiled code block (?{...}) */
11859 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11860 RExC_parse = RExC_start + cb->end;
11862 if (cb->src_regex) {
11863 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11864 RExC_rxi->data->data[n] =
11865 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11866 RExC_rxi->data->data[n+1] = (void*)o;
11869 n = add_data(pRExC_state,
11870 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11871 RExC_rxi->data->data[n] = (void*)o;
11873 pRExC_state->code_index++;
11874 nextchar(pRExC_state);
11877 regnode_offset eval;
11878 ret = reg_node(pRExC_state, LOGICAL);
11880 eval = reg2Lanode(pRExC_state, EVAL,
11883 /* for later propagation into (??{})
11885 RExC_flags & RXf_PMf_COMPILETIME
11887 FLAGS(REGNODE_p(ret)) = 2;
11888 if (! REGTAIL(pRExC_state, ret, eval)) {
11889 REQUIRE_BRANCHJ(flagp, 0);
11891 /* deal with the length of this later - MJD */
11894 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11895 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11896 Set_Node_Offset(REGNODE_p(ret), parse_start);
11899 case '(': /* (?(?{...})...) and (?(?=...)...) */
11902 const int DEFINE_len = sizeof("DEFINE") - 1;
11903 if ( RExC_parse < RExC_end - 1
11904 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11905 && ( RExC_parse[1] == '='
11906 || RExC_parse[1] == '!'
11907 || RExC_parse[1] == '<'
11908 || RExC_parse[1] == '{'))
11909 || ( RExC_parse[0] == '*' /* (?(*...)) */
11910 && ( memBEGINs(RExC_parse + 1,
11911 (Size_t) (RExC_end - (RExC_parse + 1)),
11913 || memBEGINs(RExC_parse + 1,
11914 (Size_t) (RExC_end - (RExC_parse + 1)),
11916 || memBEGINs(RExC_parse + 1,
11917 (Size_t) (RExC_end - (RExC_parse + 1)),
11919 || memBEGINs(RExC_parse + 1,
11920 (Size_t) (RExC_end - (RExC_parse + 1)),
11922 || memBEGINs(RExC_parse + 1,
11923 (Size_t) (RExC_end - (RExC_parse + 1)),
11924 "positive_lookahead:")
11925 || memBEGINs(RExC_parse + 1,
11926 (Size_t) (RExC_end - (RExC_parse + 1)),
11927 "positive_lookbehind:")
11928 || memBEGINs(RExC_parse + 1,
11929 (Size_t) (RExC_end - (RExC_parse + 1)),
11930 "negative_lookahead:")
11931 || memBEGINs(RExC_parse + 1,
11932 (Size_t) (RExC_end - (RExC_parse + 1)),
11933 "negative_lookbehind:"))))
11934 ) { /* Lookahead or eval. */
11936 regnode_offset tail;
11938 ret = reg_node(pRExC_state, LOGICAL);
11939 FLAGS(REGNODE_p(ret)) = 1;
11941 tail = reg(pRExC_state, 1, &flag, depth+1);
11942 RETURN_FAIL_ON_RESTART(flag, flagp);
11943 if (! REGTAIL(pRExC_state, ret, tail)) {
11944 REQUIRE_BRANCHJ(flagp, 0);
11948 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11949 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11951 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11952 char *name_start= RExC_parse++;
11954 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11955 if ( RExC_parse == name_start
11956 || RExC_parse >= RExC_end
11957 || *RExC_parse != ch)
11959 vFAIL2("Sequence (?(%c... not terminated",
11960 (ch == '>' ? '<' : ch));
11964 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11965 RExC_rxi->data->data[num]=(void*)sv_dat;
11966 SvREFCNT_inc_simple_void_NN(sv_dat);
11968 ret = reganode(pRExC_state, GROUPPN, num);
11969 goto insert_if_check_paren;
11971 else if (memBEGINs(RExC_parse,
11972 (STRLEN) (RExC_end - RExC_parse),
11975 ret = reganode(pRExC_state, DEFINEP, 0);
11976 RExC_parse += DEFINE_len;
11978 goto insert_if_check_paren;
11980 else if (RExC_parse[0] == 'R') {
11982 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11983 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11984 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11987 if (RExC_parse[0] == '0') {
11991 else if (inRANGE(RExC_parse[0], '1', '9')) {
11994 if (grok_atoUV(RExC_parse, &uv, &endptr)
11997 parno = (I32)uv + 1;
11998 RExC_parse = (char*)endptr;
12000 /* else "Switch condition not recognized" below */
12001 } else if (RExC_parse[0] == '&') {
12004 sv_dat = reg_scan_name(pRExC_state,
12005 REG_RSN_RETURN_DATA);
12007 parno = 1 + *((I32 *)SvPVX(sv_dat));
12009 ret = reganode(pRExC_state, INSUBP, parno);
12010 goto insert_if_check_paren;
12012 else if (inRANGE(RExC_parse[0], '1', '9')) {
12017 if (grok_atoUV(RExC_parse, &uv, &endptr)
12021 RExC_parse = (char*)endptr;
12024 vFAIL("panic: grok_atoUV returned FALSE");
12026 ret = reganode(pRExC_state, GROUPP, parno);
12028 insert_if_check_paren:
12029 if (UCHARAT(RExC_parse) != ')') {
12031 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12033 vFAIL("Switch condition not recognized");
12035 nextchar(pRExC_state);
12037 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12040 REQUIRE_BRANCHJ(flagp, 0);
12042 br = regbranch(pRExC_state, &flags, 1, depth+1);
12044 RETURN_FAIL_ON_RESTART(flags,flagp);
12045 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12048 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12051 REQUIRE_BRANCHJ(flagp, 0);
12053 c = UCHARAT(RExC_parse);
12054 nextchar(pRExC_state);
12055 if (flags&HASWIDTH)
12056 *flagp |= HASWIDTH;
12059 vFAIL("(?(DEFINE)....) does not allow branches");
12061 /* Fake one for optimizer. */
12062 lastbr = reganode(pRExC_state, IFTHEN, 0);
12064 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12065 RETURN_FAIL_ON_RESTART(flags, flagp);
12066 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12069 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12070 REQUIRE_BRANCHJ(flagp, 0);
12072 if (flags&HASWIDTH)
12073 *flagp |= HASWIDTH;
12074 c = UCHARAT(RExC_parse);
12075 nextchar(pRExC_state);
12080 if (RExC_parse >= RExC_end)
12081 vFAIL("Switch (?(condition)... not terminated");
12083 vFAIL("Switch (?(condition)... contains too many branches");
12085 ender = reg_node(pRExC_state, TAIL);
12086 if (! REGTAIL(pRExC_state, br, ender)) {
12087 REQUIRE_BRANCHJ(flagp, 0);
12090 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12091 REQUIRE_BRANCHJ(flagp, 0);
12093 if (! REGTAIL(pRExC_state,
12096 NEXTOPER(REGNODE_p(lastbr)))),
12099 REQUIRE_BRANCHJ(flagp, 0);
12103 if (! REGTAIL(pRExC_state, ret, ender)) {
12104 REQUIRE_BRANCHJ(flagp, 0);
12106 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12107 RExC_size++; /* XXX WHY do we need this?!!
12108 For large programs it seems to be required
12109 but I can't figure out why. -- dmq*/
12114 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12116 vFAIL("Unknown switch condition (?(...))");
12118 case '[': /* (?[ ... ]) */
12119 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12121 case 0: /* A NUL */
12122 RExC_parse--; /* for vFAIL to print correctly */
12123 vFAIL("Sequence (? incomplete");
12127 if (RExC_strict) { /* [perl #132851] */
12128 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12131 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12133 default: /* e.g., (?i) */
12134 RExC_parse = (char *) seqstart + 1;
12136 parse_lparen_question_flags(pRExC_state);
12137 if (UCHARAT(RExC_parse) != ':') {
12138 if (RExC_parse < RExC_end)
12139 nextchar(pRExC_state);
12144 nextchar(pRExC_state);
12149 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12153 if (! ALL_PARENS_COUNTED) {
12154 /* If we are in our first pass through (and maybe only pass),
12155 * we need to allocate memory for the capturing parentheses
12159 if (!RExC_parens_buf_size) {
12160 /* first guess at number of parens we might encounter */
12161 RExC_parens_buf_size = 10;
12163 /* setup RExC_open_parens, which holds the address of each
12164 * OPEN tag, and to make things simpler for the 0 index the
12165 * start of the program - this is used later for offsets */
12166 Newxz(RExC_open_parens, RExC_parens_buf_size,
12168 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12170 /* setup RExC_close_parens, which holds the address of each
12171 * CLOSE tag, and to make things simpler for the 0 index
12172 * the end of the program - this is used later for offsets
12174 Newxz(RExC_close_parens, RExC_parens_buf_size,
12176 /* we dont know where end op starts yet, so we dont need to
12177 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12180 else if (RExC_npar > RExC_parens_buf_size) {
12181 I32 old_size = RExC_parens_buf_size;
12183 RExC_parens_buf_size *= 2;
12185 Renew(RExC_open_parens, RExC_parens_buf_size,
12187 Zero(RExC_open_parens + old_size,
12188 RExC_parens_buf_size - old_size, regnode_offset);
12190 Renew(RExC_close_parens, RExC_parens_buf_size,
12192 Zero(RExC_close_parens + old_size,
12193 RExC_parens_buf_size - old_size, regnode_offset);
12197 ret = reganode(pRExC_state, OPEN, parno);
12198 if (!RExC_nestroot)
12199 RExC_nestroot = parno;
12200 if (RExC_open_parens && !RExC_open_parens[parno])
12202 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12203 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12204 22, "| |", (int)(depth * 2 + 1), "",
12206 RExC_open_parens[parno]= ret;
12209 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12210 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12213 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12222 /* Pick up the branches, linking them together. */
12223 parse_start = RExC_parse; /* MJD */
12224 br = regbranch(pRExC_state, &flags, 1, depth+1);
12226 /* branch_len = (paren != 0); */
12229 RETURN_FAIL_ON_RESTART(flags, flagp);
12230 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12232 if (*RExC_parse == '|') {
12233 if (RExC_use_BRANCHJ) {
12234 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12237 reginsert(pRExC_state, BRANCH, br, depth+1);
12238 Set_Node_Length(REGNODE_p(br), paren != 0);
12239 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12243 else if (paren == ':') {
12244 *flagp |= flags&SIMPLE;
12246 if (is_open) { /* Starts with OPEN. */
12247 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12248 REQUIRE_BRANCHJ(flagp, 0);
12251 else if (paren != '?') /* Not Conditional */
12253 *flagp |= flags & (HASWIDTH | POSTPONED);
12255 while (*RExC_parse == '|') {
12256 if (RExC_use_BRANCHJ) {
12259 ender = reganode(pRExC_state, LONGJMP, 0);
12261 /* Append to the previous. */
12262 shut_gcc_up = REGTAIL(pRExC_state,
12263 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12265 PERL_UNUSED_VAR(shut_gcc_up);
12267 nextchar(pRExC_state);
12268 if (freeze_paren) {
12269 if (RExC_npar > after_freeze)
12270 after_freeze = RExC_npar;
12271 RExC_npar = freeze_paren;
12273 br = regbranch(pRExC_state, &flags, 0, depth+1);
12276 RETURN_FAIL_ON_RESTART(flags, flagp);
12277 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12279 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12280 REQUIRE_BRANCHJ(flagp, 0);
12283 *flagp |= flags & (HASWIDTH | POSTPONED);
12286 if (have_branch || paren != ':') {
12289 /* Make a closing node, and hook it on the end. */
12292 ender = reg_node(pRExC_state, TAIL);
12295 ender = reganode(pRExC_state, CLOSE, parno);
12296 if ( RExC_close_parens ) {
12297 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12298 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12299 22, "| |", (int)(depth * 2 + 1), "",
12300 (IV)parno, ender));
12301 RExC_close_parens[parno]= ender;
12302 if (RExC_nestroot == parno)
12305 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12306 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12309 ender = reg_node(pRExC_state, SRCLOSE);
12310 RExC_in_script_run = 0;
12320 *flagp &= ~HASWIDTH;
12322 case 't': /* aTomic */
12324 ender = reg_node(pRExC_state, SUCCEED);
12327 ender = reg_node(pRExC_state, END);
12328 assert(!RExC_end_op); /* there can only be one! */
12329 RExC_end_op = REGNODE_p(ender);
12330 if (RExC_close_parens) {
12331 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12332 "%*s%*s Setting close paren #0 (END) to %zu\n",
12333 22, "| |", (int)(depth * 2 + 1), "",
12336 RExC_close_parens[0]= ender;
12341 DEBUG_PARSE_MSG("lsbr");
12342 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12343 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12344 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12345 SvPV_nolen_const(RExC_mysv1),
12347 SvPV_nolen_const(RExC_mysv2),
12349 (IV)(ender - lastbr)
12352 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12353 REQUIRE_BRANCHJ(flagp, 0);
12357 char is_nothing= 1;
12359 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12361 /* Hook the tails of the branches to the closing node. */
12362 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12363 const U8 op = PL_regkind[OP(br)];
12364 if (op == BRANCH) {
12365 if (! REGTAIL_STUDY(pRExC_state,
12366 REGNODE_OFFSET(NEXTOPER(br)),
12369 REQUIRE_BRANCHJ(flagp, 0);
12371 if ( OP(NEXTOPER(br)) != NOTHING
12372 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12375 else if (op == BRANCHJ) {
12376 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12377 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12379 PERL_UNUSED_VAR(shut_gcc_up);
12380 /* for now we always disable this optimisation * /
12381 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12382 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12388 regnode * ret_as_regnode = REGNODE_p(ret);
12389 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12390 ? regnext(ret_as_regnode)
12393 DEBUG_PARSE_MSG("NADA");
12394 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12395 NULL, pRExC_state);
12396 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12397 NULL, pRExC_state);
12398 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12399 SvPV_nolen_const(RExC_mysv1),
12400 (IV)REG_NODE_NUM(ret_as_regnode),
12401 SvPV_nolen_const(RExC_mysv2),
12407 if (OP(REGNODE_p(ender)) == TAIL) {
12409 RExC_emit= REGNODE_OFFSET(br) + 1;
12412 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12413 OP(opt)= OPTIMIZED;
12414 NEXT_OFF(br)= REGNODE_p(ender) - br;
12422 /* Even/odd or x=don't care: 010101x10x */
12423 static const char parens[] = "=!aA<,>Bbt";
12424 /* flag below is set to 0 up through 'A'; 1 for larger */
12426 if (paren && (p = strchr(parens, paren))) {
12427 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12428 int flag = (p - parens) > 3;
12430 if (paren == '>' || paren == 't') {
12431 node = SUSPEND, flag = 0;
12434 reginsert(pRExC_state, node, ret, depth+1);
12435 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12436 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12437 FLAGS(REGNODE_p(ret)) = flag;
12438 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12440 REQUIRE_BRANCHJ(flagp, 0);
12445 /* Check for proper termination. */
12447 /* restore original flags, but keep (?p) and, if we've encountered
12448 * something in the parse that changes /d rules into /u, keep the /u */
12449 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12450 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12451 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12453 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12454 RExC_parse = oregcomp_parse;
12455 vFAIL("Unmatched (");
12457 nextchar(pRExC_state);
12459 else if (!paren && RExC_parse < RExC_end) {
12460 if (*RExC_parse == ')') {
12462 vFAIL("Unmatched )");
12465 FAIL("Junk on end of regexp"); /* "Can't happen". */
12466 NOT_REACHED; /* NOTREACHED */
12469 if (RExC_in_lookbehind) {
12470 RExC_in_lookbehind--;
12472 if (RExC_in_lookahead) {
12473 RExC_in_lookahead--;
12475 if (after_freeze > RExC_npar)
12476 RExC_npar = after_freeze;
12481 - regbranch - one alternative of an | operator
12483 * Implements the concatenation operator.
12485 * On success, returns the offset at which any next node should be placed into
12486 * the regex engine program being compiled.
12488 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12489 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12492 STATIC regnode_offset
12493 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12495 regnode_offset ret;
12496 regnode_offset chain = 0;
12497 regnode_offset latest;
12498 I32 flags = 0, c = 0;
12499 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12501 PERL_ARGS_ASSERT_REGBRANCH;
12503 DEBUG_PARSE("brnc");
12508 if (RExC_use_BRANCHJ)
12509 ret = reganode(pRExC_state, BRANCHJ, 0);
12511 ret = reg_node(pRExC_state, BRANCH);
12512 Set_Node_Length(REGNODE_p(ret), 1);
12516 *flagp = 0; /* Initialize. */
12518 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12519 FALSE /* Don't force to /x */ );
12520 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12521 flags &= ~TRYAGAIN;
12522 latest = regpiece(pRExC_state, &flags, depth+1);
12524 if (flags & TRYAGAIN)
12526 RETURN_FAIL_ON_RESTART(flags, flagp);
12527 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12531 *flagp |= flags&(HASWIDTH|POSTPONED);
12533 /* FIXME adding one for every branch after the first is probably
12534 * excessive now we have TRIE support. (hv) */
12536 if (! REGTAIL(pRExC_state, chain, latest)) {
12537 /* XXX We could just redo this branch, but figuring out what
12538 * bookkeeping needs to be reset is a pain, and it's likely
12539 * that other branches that goto END will also be too large */
12540 REQUIRE_BRANCHJ(flagp, 0);
12546 if (chain == 0) { /* Loop ran zero times. */
12547 chain = reg_node(pRExC_state, NOTHING);
12552 *flagp |= flags&SIMPLE;
12559 - regpiece - something followed by possible quantifier * + ? {n,m}
12561 * Note that the branching code sequences used for ? and the general cases
12562 * of * and + are somewhat optimized: they use the same NOTHING node as
12563 * both the endmarker for their branch list and the body of the last branch.
12564 * It might seem that this node could be dispensed with entirely, but the
12565 * endmarker role is not redundant.
12567 * On success, returns the offset at which any next node should be placed into
12568 * the regex engine program being compiled.
12570 * Returns 0 otherwise, with *flagp set to indicate why:
12571 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12572 * RESTART_PARSE if the parse needs to be restarted, or'd with
12573 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12575 STATIC regnode_offset
12576 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12578 regnode_offset ret;
12582 const char * const origparse = RExC_parse;
12584 I32 max = REG_INFTY;
12585 #ifdef RE_TRACK_PATTERN_OFFSETS
12588 const char *maxpos = NULL;
12591 /* Save the original in case we change the emitted regop to a FAIL. */
12592 const regnode_offset orig_emit = RExC_emit;
12594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12596 PERL_ARGS_ASSERT_REGPIECE;
12598 DEBUG_PARSE("piec");
12600 ret = regatom(pRExC_state, &flags, depth+1);
12602 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12603 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12606 #ifdef RE_TRACK_PATTERN_OFFSETS
12607 parse_start = RExC_parse;
12614 nextchar(pRExC_state);
12619 nextchar(pRExC_state);
12624 nextchar(pRExC_state);
12628 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
12629 to determine which */
12630 if (regcurly(RExC_parse)) {
12631 const char* endptr;
12633 /* Here is a quantifier, parse for min and max values */
12635 next = RExC_parse + 1;
12636 while (isDIGIT(*next) || *next == ',') {
12637 if (*next == ',') {
12646 assert(*next == '}');
12651 if (isDIGIT(*RExC_parse)) {
12653 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12654 vFAIL("Invalid quantifier in {,}");
12655 if (uv >= REG_INFTY)
12656 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12661 if (*maxpos == ',')
12664 maxpos = RExC_parse;
12665 if (isDIGIT(*maxpos)) {
12667 if (!grok_atoUV(maxpos, &uv, &endptr))
12668 vFAIL("Invalid quantifier in {,}");
12669 if (uv >= REG_INFTY)
12670 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12673 max = REG_INFTY; /* meaning "infinity" */
12677 nextchar(pRExC_state);
12678 if (max < min) { /* If can't match, warn and optimize to fail
12680 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12681 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12682 NEXT_OFF(REGNODE_p(orig_emit)) =
12683 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12686 else if (min == max && *RExC_parse == '?')
12688 ckWARN2reg(RExC_parse + 1,
12689 "Useless use of greediness modifier '%c'",
12694 } /* End of is regcurly() */
12696 /* Here was a '{', but what followed it didn't form a quantifier. */
12702 NOT_REACHED; /*NOTREACHED*/
12705 /* Here we have a quantifier, and have calculated 'min' and 'max'.
12707 * Check and possibly adjust a zero width operand */
12708 if (! (flags & (HASWIDTH|POSTPONED))) {
12709 if (max > REG_INFTY/3) {
12710 if (origparse[0] == '\\' && origparse[1] == 'K') {
12712 "%" UTF8f " is forbidden - matches null string"
12714 UTF8fARG(UTF, (RExC_parse >= origparse
12715 ? RExC_parse - origparse
12719 ckWARN2reg(RExC_parse,
12720 "%" UTF8f " matches null string many times",
12721 UTF8fARG(UTF, (RExC_parse >= origparse
12722 ? RExC_parse - origparse
12728 /* There's no point in trying to match something 0 length more than
12729 * once except for extra side effects, which we don't have here since
12739 /* If this is a code block pass it up */
12740 *flagp |= (flags & POSTPONED);
12743 *flagp |= (flags & HASWIDTH);
12744 if (max == REG_INFTY)
12745 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12748 /* 'SIMPLE' operands don't require full generality */
12749 if ((flags&SIMPLE)) {
12750 if (max == REG_INFTY) {
12752 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
12753 goto min0_maxINF_wildcard_forbidden;
12756 reginsert(pRExC_state, STAR, ret, depth+1);
12760 else if (min == 1) {
12761 reginsert(pRExC_state, PLUS, ret, depth+1);
12767 /* Here, SIMPLE, but not the '*' and '+' special cases */
12769 MARK_NAUGHTY_EXP(2, 2);
12770 reginsert(pRExC_state, CURLY, ret, depth+1);
12771 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12772 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12774 else { /* not SIMPLE */
12775 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12777 FLAGS(REGNODE_p(w)) = 0;
12778 if (! REGTAIL(pRExC_state, ret, w)) {
12779 REQUIRE_BRANCHJ(flagp, 0);
12781 if (RExC_use_BRANCHJ) {
12782 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12783 reginsert(pRExC_state, NOTHING, ret, depth+1);
12784 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12786 reginsert(pRExC_state, CURLYX, ret, depth+1);
12788 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12789 Set_Node_Length(REGNODE_p(ret),
12790 op == '{' ? (RExC_parse - parse_start) : 1);
12792 if (RExC_use_BRANCHJ)
12793 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12795 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12798 REQUIRE_BRANCHJ(flagp, 0);
12800 RExC_whilem_seen++;
12801 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12804 /* Finish up the CURLY/CURLYX case */
12805 FLAGS(REGNODE_p(ret)) = 0;
12807 ARG1_SET(REGNODE_p(ret), (U16)min);
12808 ARG2_SET(REGNODE_p(ret), (U16)max);
12812 /* Process any greediness modifiers */
12813 if (*RExC_parse == '?') {
12814 nextchar(pRExC_state);
12815 reginsert(pRExC_state, MINMOD, ret, depth+1);
12816 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12817 REQUIRE_BRANCHJ(flagp, 0);
12820 else if (*RExC_parse == '+') {
12821 regnode_offset ender;
12822 nextchar(pRExC_state);
12823 ender = reg_node(pRExC_state, SUCCEED);
12824 if (! REGTAIL(pRExC_state, ret, ender)) {
12825 REQUIRE_BRANCHJ(flagp, 0);
12827 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12828 ender = reg_node(pRExC_state, TAIL);
12829 if (! REGTAIL(pRExC_state, ret, ender)) {
12830 REQUIRE_BRANCHJ(flagp, 0);
12834 /* Forbid extra quantifiers */
12835 if (ISMULT2(RExC_parse)) {
12837 vFAIL("Nested quantifiers");
12842 min0_maxINF_wildcard_forbidden:
12844 /* Here we are in a wildcard match, and the minimum match length is 0, and
12845 * the max could be infinity. This is currently forbidden. The only
12846 * reason is to make it harder to write patterns that take a long long time
12847 * to halt, and because the use of this construct isn't necessary in
12848 * matching Unicode property values */
12850 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
12851 subpatterns in regex; marked by <-- HERE in m/%s/
12853 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
12856 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
12857 * legal at all in wildcards, so can't get this far */
12859 NOT_REACHED; /*NOTREACHED*/
12863 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12864 regnode_offset * node_p,
12872 /* This routine teases apart the various meanings of \N and returns
12873 * accordingly. The input parameters constrain which meaning(s) is/are valid
12874 * in the current context.
12876 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12878 * If <code_point_p> is not NULL, the context is expecting the result to be a
12879 * single code point. If this \N instance turns out to a single code point,
12880 * the function returns TRUE and sets *code_point_p to that code point.
12882 * If <node_p> is not NULL, the context is expecting the result to be one of
12883 * the things representable by a regnode. If this \N instance turns out to be
12884 * one such, the function generates the regnode, returns TRUE and sets *node_p
12885 * to point to the offset of that regnode into the regex engine program being
12888 * If this instance of \N isn't legal in any context, this function will
12889 * generate a fatal error and not return.
12891 * On input, RExC_parse should point to the first char following the \N at the
12892 * time of the call. On successful return, RExC_parse will have been updated
12893 * to point to just after the sequence identified by this routine. Also
12894 * *flagp has been updated as needed.
12896 * When there is some problem with the current context and this \N instance,
12897 * the function returns FALSE, without advancing RExC_parse, nor setting
12898 * *node_p, nor *code_point_p, nor *flagp.
12900 * If <cp_count> is not NULL, the caller wants to know the length (in code
12901 * points) that this \N sequence matches. This is set, and the input is
12902 * parsed for errors, even if the function returns FALSE, as detailed below.
12904 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12906 * Probably the most common case is for the \N to specify a single code point.
12907 * *cp_count will be set to 1, and *code_point_p will be set to that code
12910 * Another possibility is for the input to be an empty \N{}. This is no
12911 * longer accepted, and will generate a fatal error.
12913 * Another possibility is for a custom charnames handler to be in effect which
12914 * translates the input name to an empty string. *cp_count will be set to 0.
12915 * *node_p will be set to a generated NOTHING node.
12917 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12918 * set to 0. *node_p will be set to a generated REG_ANY node.
12920 * The fifth possibility is that \N resolves to a sequence of more than one
12921 * code points. *cp_count will be set to the number of code points in the
12922 * sequence. *node_p will be set to a generated node returned by this
12923 * function calling S_reg().
12925 * The final possibility is that it is premature to be calling this function;
12926 * the parse needs to be restarted. This can happen when this changes from
12927 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12928 * latter occurs only when the fifth possibility would otherwise be in
12929 * effect, and is because one of those code points requires the pattern to be
12930 * recompiled as UTF-8. The function returns FALSE, and sets the
12931 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12932 * happens, the caller needs to desist from continuing parsing, and return
12933 * this information to its caller. This is not set for when there is only one
12934 * code point, as this can be called as part of an ANYOF node, and they can
12935 * store above-Latin1 code points without the pattern having to be in UTF-8.
12937 * For non-single-quoted regexes, the tokenizer has resolved character and
12938 * sequence names inside \N{...} into their Unicode values, normalizing the
12939 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12940 * hex-represented code points in the sequence. This is done there because
12941 * the names can vary based on what charnames pragma is in scope at the time,
12942 * so we need a way to take a snapshot of what they resolve to at the time of
12943 * the original parse. [perl #56444].
12945 * That parsing is skipped for single-quoted regexes, so here we may get
12946 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12947 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12948 * the native character set for non-ASCII platforms. The other possibilities
12949 * are already native, so no translation is done. */
12951 char * endbrace; /* points to '}' following the name */
12952 char* p = RExC_parse; /* Temporary */
12954 SV * substitute_parse = NULL;
12959 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12961 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12963 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12964 assert(! (node_p && cp_count)); /* At most 1 should be set */
12966 if (cp_count) { /* Initialize return for the most common case */
12970 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12971 * modifier. The other meanings do not, so use a temporary until we find
12972 * out which we are being called with */
12973 skip_to_be_ignored_text(pRExC_state, &p,
12974 FALSE /* Don't force to /x */ );
12976 /* Disambiguate between \N meaning a named character versus \N meaning
12977 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12978 * quantifier, or if there is no '{' at all */
12979 if (*p != '{' || regcurly(p)) {
12989 *node_p = reg_node(pRExC_state, REG_ANY);
12990 *flagp |= HASWIDTH|SIMPLE;
12992 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12996 /* The test above made sure that the next real character is a '{', but
12997 * under the /x modifier, it could be separated by space (or a comment and
12998 * \n) and this is not allowed (for consistency with \x{...} and the
12999 * tokenizer handling of \N{NAME}). */
13000 if (*RExC_parse != '{') {
13001 vFAIL("Missing braces on \\N{}");
13004 RExC_parse++; /* Skip past the '{' */
13006 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13007 if (! endbrace) { /* no trailing brace */
13008 vFAIL2("Missing right brace on \\%c{}", 'N');
13011 /* Here, we have decided it should be a named character or sequence. These
13012 * imply Unicode semantics */
13013 REQUIRE_UNI_RULES(flagp, FALSE);
13015 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13016 * nothing at all (not allowed under strict) */
13017 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13018 RExC_parse = endbrace;
13020 RExC_parse++; /* Position after the "}" */
13021 vFAIL("Zero length \\N{}");
13027 nextchar(pRExC_state);
13032 *node_p = reg_node(pRExC_state, NOTHING);
13036 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13038 /* Here, the name isn't of the form U+.... This can happen if the
13039 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13040 * is the time to find out what the name means */
13042 const STRLEN name_len = endbrace - RExC_parse;
13043 SV * value_sv; /* What does this name evaluate to */
13045 const U8 * value; /* string of name's value */
13046 STRLEN value_len; /* and its length */
13048 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13049 * toke.c, and their values. Make sure is initialized */
13050 if (! RExC_unlexed_names) {
13051 RExC_unlexed_names = newHV();
13054 /* If we have already seen this name in this pattern, use that. This
13055 * allows us to only call the charnames handler once per name per
13056 * pattern. A broken or malicious handler could return something
13057 * different each time, which could cause the results to vary depending
13058 * on if something gets added or subtracted from the pattern that
13059 * causes the number of passes to change, for example */
13060 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13063 value_sv = *value_svp;
13065 else { /* Otherwise we have to go out and get the name */
13066 const char * error_msg = NULL;
13067 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13071 RExC_parse = endbrace;
13075 /* If no error message, should have gotten a valid return */
13078 /* Save the name's meaning for later use */
13079 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13082 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13086 /* Here, we have the value the name evaluates to in 'value_sv' */
13087 value = (U8 *) SvPV(value_sv, value_len);
13089 /* See if the result is one code point vs 0 or multiple */
13090 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13094 /* Here, exactly one code point. If that isn't what is wanted,
13096 if (! code_point_p) {
13101 /* Convert from string to numeric code point */
13102 *code_point_p = (SvUTF8(value_sv))
13103 ? valid_utf8_to_uvchr(value, NULL)
13106 /* Have parsed this entire single code point \N{...}. *cp_count
13107 * has already been set to 1, so don't do it again. */
13108 RExC_parse = endbrace;
13109 nextchar(pRExC_state);
13111 } /* End of is a single code point */
13113 /* Count the code points, if caller desires. The API says to do this
13114 * even if we will later return FALSE */
13118 *cp_count = (SvUTF8(value_sv))
13119 ? utf8_length(value, value + value_len)
13123 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13124 * But don't back the pointer up if the caller wants to know how many
13125 * code points there are (they need to handle it themselves in this
13134 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13135 * reg recursively to parse it. That way, it retains its atomicness,
13136 * while not having to worry about any special handling that some code
13137 * points may have. */
13139 substitute_parse = newSVpvs("?:");
13140 sv_catsv(substitute_parse, value_sv);
13141 sv_catpv(substitute_parse, ")");
13143 /* The value should already be native, so no need to convert on EBCDIC
13145 assert(! RExC_recode_x_to_native);
13148 else { /* \N{U+...} */
13149 Size_t count = 0; /* code point count kept internally */
13151 /* We can get to here when the input is \N{U+...} or when toke.c has
13152 * converted a name to the \N{U+...} form. This include changing a
13153 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13155 RExC_parse += 2; /* Skip past the 'U+' */
13157 /* Code points are separated by dots. The '}' terminates the whole
13160 do { /* Loop until the ending brace */
13161 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13162 | PERL_SCAN_SILENT_ILLDIGIT
13163 | PERL_SCAN_NOTIFY_ILLDIGIT
13164 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13165 | PERL_SCAN_DISALLOW_PREFIX;
13166 STRLEN len = endbrace - RExC_parse;
13168 char * start_digit = RExC_parse;
13169 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13174 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13179 if (cp > MAX_LEGAL_CP) {
13180 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13183 if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13188 /* Here, is a single code point; fail if doesn't want that */
13189 if (! code_point_p) {
13194 /* A single code point is easy to handle; just return it */
13195 *code_point_p = UNI_TO_NATIVE(cp);
13196 RExC_parse = endbrace;
13197 nextchar(pRExC_state);
13201 /* Here, the parse stopped bfore the ending brace. This is legal
13202 * only if that character is a dot separating code points, like a
13203 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13204 * So the next character must be a dot (and the one after that
13205 * can't be the endbrace, or we'd have something like \N{U+100.} )
13207 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13208 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13209 ? UTF8SKIP(RExC_parse)
13211 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13216 /* Here, looks like its really a multiple character sequence. Fail
13217 * if that's not what the caller wants. But continue with counting
13218 * and error checking if they still want a count */
13219 if (! node_p && ! cp_count) {
13223 /* What is done here is to convert this to a sub-pattern of the
13224 * form \x{char1}\x{char2}... and then call reg recursively to
13225 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13226 * atomicness, while not having to worry about special handling
13227 * that some code points may have. We don't create a subpattern,
13228 * but go through the motions of code point counting and error
13229 * checking, if the caller doesn't want a node returned. */
13231 if (node_p && ! substitute_parse) {
13232 substitute_parse = newSVpvs("?:");
13238 /* Convert to notation the rest of the code understands */
13239 sv_catpvs(substitute_parse, "\\x{");
13240 sv_catpvn(substitute_parse, start_digit,
13241 RExC_parse - start_digit);
13242 sv_catpvs(substitute_parse, "}");
13245 /* Move to after the dot (or ending brace the final time through.)
13250 } while (RExC_parse < endbrace);
13252 if (! node_p) { /* Doesn't want the node */
13259 sv_catpvs(substitute_parse, ")");
13261 /* The values are Unicode, and therefore have to be converted to native
13262 * on a non-Unicode (meaning non-ASCII) platform. */
13263 SET_recode_x_to_native(1);
13266 /* Here, we have the string the name evaluates to, ready to be parsed,
13267 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13268 * constructs. This can be called from within a substitute parse already.
13269 * The error reporting mechanism doesn't work for 2 levels of this, but the
13270 * code above has validated this new construct, so there should be no
13271 * errors generated by the below. And this isn' an exact copy, so the
13272 * mechanism to seamlessly deal with this won't work, so turn off warnings
13274 save_start = RExC_start;
13275 orig_end = RExC_end;
13277 RExC_parse = RExC_start = SvPVX(substitute_parse);
13278 RExC_end = RExC_parse + SvCUR(substitute_parse);
13279 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13281 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13283 /* Restore the saved values */
13285 RExC_start = save_start;
13286 RExC_parse = endbrace;
13287 RExC_end = orig_end;
13288 SET_recode_x_to_native(0);
13290 SvREFCNT_dec_NN(substitute_parse);
13293 RETURN_FAIL_ON_RESTART(flags, flagp);
13294 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13297 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13299 nextchar(pRExC_state);
13306 S_compute_EXACTish(RExC_state_t *pRExC_state)
13310 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13318 op = get_regex_charset(RExC_flags);
13319 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13320 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13321 been, so there is no hole */
13324 return op + EXACTF;
13328 S_new_regcurly(const char *s, const char *e)
13330 /* This is a temporary function designed to match the most lenient form of
13331 * a {m,n} quantifier we ever envision, with either number omitted, and
13332 * spaces anywhere between/before/after them.
13334 * If this function fails, then the string it matches is very unlikely to
13335 * ever be considered a valid quantifier, so we can allow the '{' that
13336 * begins it to be considered as a literal */
13338 bool has_min = FALSE;
13339 bool has_max = FALSE;
13341 PERL_ARGS_ASSERT_NEW_REGCURLY;
13343 if (s >= e || *s++ != '{')
13346 while (s < e && isSPACE(*s)) {
13349 while (s < e && isDIGIT(*s)) {
13353 while (s < e && isSPACE(*s)) {
13359 while (s < e && isSPACE(*s)) {
13362 while (s < e && isDIGIT(*s)) {
13366 while (s < e && isSPACE(*s)) {
13371 return s < e && *s == '}' && (has_min || has_max);
13374 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13375 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13378 S_backref_value(char *p, char *e)
13380 const char* endptr = e;
13382 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13389 - regatom - the lowest level
13391 Try to identify anything special at the start of the current parse position.
13392 If there is, then handle it as required. This may involve generating a
13393 single regop, such as for an assertion; or it may involve recursing, such as
13394 to handle a () structure.
13396 If the string doesn't start with something special then we gobble up
13397 as much literal text as we can. If we encounter a quantifier, we have to
13398 back off the final literal character, as that quantifier applies to just it
13399 and not to the whole string of literals.
13401 Once we have been able to handle whatever type of thing started the
13402 sequence, we return the offset into the regex engine program being compiled
13403 at which any next regnode should be placed.
13405 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13406 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13407 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13408 Otherwise does not return 0.
13410 Note: we have to be careful with escapes, as they can be both literal
13411 and special, and in the case of \10 and friends, context determines which.
13413 A summary of the code structure is:
13415 switch (first_byte) {
13416 cases for each special:
13417 handle this special;
13420 switch (2nd byte) {
13421 cases for each unambiguous special:
13422 handle this special;
13424 cases for each ambigous special/literal:
13426 if (special) handle here
13428 default: // unambiguously literal:
13431 default: // is a literal char
13434 create EXACTish node for literal;
13435 while (more input and node isn't full) {
13436 switch (input_byte) {
13437 cases for each special;
13438 make sure parse pointer is set so that the next call to
13439 regatom will see this special first
13440 goto loopdone; // EXACTish node terminated by prev. char
13442 append char to EXACTISH node;
13444 get next input byte;
13448 return the generated node;
13450 Specifically there are two separate switches for handling
13451 escape sequences, with the one for handling literal escapes requiring
13452 a dummy entry for all of the special escapes that are actually handled
13457 STATIC regnode_offset
13458 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13460 regnode_offset ret = 0;
13466 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13468 *flagp = 0; /* Initialize. */
13470 DEBUG_PARSE("atom");
13472 PERL_ARGS_ASSERT_REGATOM;
13475 parse_start = RExC_parse;
13476 assert(RExC_parse < RExC_end);
13477 switch ((U8)*RExC_parse) {
13479 RExC_seen_zerolen++;
13480 nextchar(pRExC_state);
13481 if (RExC_flags & RXf_PMf_MULTILINE)
13482 ret = reg_node(pRExC_state, MBOL);
13484 ret = reg_node(pRExC_state, SBOL);
13485 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13488 nextchar(pRExC_state);
13490 RExC_seen_zerolen++;
13491 if (RExC_flags & RXf_PMf_MULTILINE)
13492 ret = reg_node(pRExC_state, MEOL);
13494 ret = reg_node(pRExC_state, SEOL);
13495 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13498 nextchar(pRExC_state);
13499 if (RExC_flags & RXf_PMf_SINGLELINE)
13500 ret = reg_node(pRExC_state, SANY);
13502 ret = reg_node(pRExC_state, REG_ANY);
13503 *flagp |= HASWIDTH|SIMPLE;
13505 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13509 char * const oregcomp_parse = ++RExC_parse;
13510 ret = regclass(pRExC_state, flagp, depth+1,
13511 FALSE, /* means parse the whole char class */
13512 TRUE, /* allow multi-char folds */
13513 FALSE, /* don't silence non-portable warnings. */
13514 (bool) RExC_strict,
13515 TRUE, /* Allow an optimized regnode result */
13518 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13519 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13522 if (*RExC_parse != ']') {
13523 RExC_parse = oregcomp_parse;
13524 vFAIL("Unmatched [");
13526 nextchar(pRExC_state);
13527 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13531 nextchar(pRExC_state);
13532 ret = reg(pRExC_state, 2, &flags, depth+1);
13534 if (flags & TRYAGAIN) {
13535 if (RExC_parse >= RExC_end) {
13536 /* Make parent create an empty node if needed. */
13537 *flagp |= TRYAGAIN;
13542 RETURN_FAIL_ON_RESTART(flags, flagp);
13543 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13546 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13550 if (flags & TRYAGAIN) {
13551 *flagp |= TRYAGAIN;
13554 vFAIL("Internal urp");
13555 /* Supposed to be caught earlier. */
13561 vFAIL("Quantifier follows nothing");
13566 This switch handles escape sequences that resolve to some kind
13567 of special regop and not to literal text. Escape sequences that
13568 resolve to literal text are handled below in the switch marked
13571 Every entry in this switch *must* have a corresponding entry
13572 in the literal escape switch. However, the opposite is not
13573 required, as the default for this switch is to jump to the
13574 literal text handling code.
13577 switch ((U8)*RExC_parse) {
13578 /* Special Escapes */
13580 RExC_seen_zerolen++;
13581 /* Under wildcards, this is changed to match \n; should be
13582 * invisible to the user, as they have to compile under /m */
13583 if (RExC_pm_flags & PMf_WILDCARD) {
13584 ret = reg_node(pRExC_state, MBOL);
13587 ret = reg_node(pRExC_state, SBOL);
13588 /* SBOL is shared with /^/ so we set the flags so we can tell
13589 * /\A/ from /^/ in split. */
13590 FLAGS(REGNODE_p(ret)) = 1;
13592 goto finish_meta_pat;
13594 if (RExC_pm_flags & PMf_WILDCARD) {
13596 /* diag_listed_as: Use of %s is not allowed in Unicode property
13597 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13599 vFAIL("Use of '\\G' is not allowed in Unicode property"
13600 " wildcard subpatterns");
13602 ret = reg_node(pRExC_state, GPOS);
13603 RExC_seen |= REG_GPOS_SEEN;
13604 goto finish_meta_pat;
13606 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13607 RExC_seen_zerolen++;
13608 ret = reg_node(pRExC_state, KEEPS);
13609 /* XXX:dmq : disabling in-place substitution seems to
13610 * be necessary here to avoid cases of memory corruption, as
13611 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13613 RExC_seen |= REG_LOOKBEHIND_SEEN;
13614 goto finish_meta_pat;
13617 ++RExC_parse; /* advance past the 'K' */
13618 vFAIL("\\K not permitted in lookahead/lookbehind");
13621 if (RExC_pm_flags & PMf_WILDCARD) {
13622 /* See comment under \A above */
13623 ret = reg_node(pRExC_state, MEOL);
13626 ret = reg_node(pRExC_state, SEOL);
13628 RExC_seen_zerolen++; /* Do not optimize RE away */
13629 goto finish_meta_pat;
13631 if (RExC_pm_flags & PMf_WILDCARD) {
13632 /* See comment under \A above */
13633 ret = reg_node(pRExC_state, MEOL);
13636 ret = reg_node(pRExC_state, EOS);
13638 RExC_seen_zerolen++; /* Do not optimize RE away */
13639 goto finish_meta_pat;
13641 vFAIL("\\C no longer supported");
13643 ret = reg_node(pRExC_state, CLUMP);
13644 *flagp |= HASWIDTH;
13645 goto finish_meta_pat;
13653 regex_charset charset = get_regex_charset(RExC_flags);
13655 RExC_seen_zerolen++;
13656 RExC_seen |= REG_LOOKBEHIND_SEEN;
13657 op = BOUND + charset;
13659 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13660 flags = TRADITIONAL_BOUND;
13661 if (op > BOUNDA) { /* /aa is same as /a */
13667 char name = *RExC_parse;
13668 char * endbrace = NULL;
13670 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13673 vFAIL2("Missing right brace on \\%c{}", name);
13675 /* XXX Need to decide whether to take spaces or not. Should be
13676 * consistent with \p{}, but that currently is SPACE, which
13677 * means vertical too, which seems wrong
13678 * while (isBLANK(*RExC_parse)) {
13681 if (endbrace == RExC_parse) {
13682 RExC_parse++; /* After the '}' */
13683 vFAIL2("Empty \\%c{}", name);
13685 length = endbrace - RExC_parse;
13686 /*while (isBLANK(*(RExC_parse + length - 1))) {
13689 switch (*RExC_parse) {
13692 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13694 goto bad_bound_type;
13699 if (length != 2 || *(RExC_parse + 1) != 'b') {
13700 goto bad_bound_type;
13705 if (length != 2 || *(RExC_parse + 1) != 'b') {
13706 goto bad_bound_type;
13711 if (length != 2 || *(RExC_parse + 1) != 'b') {
13712 goto bad_bound_type;
13718 RExC_parse = endbrace;
13720 "'%" UTF8f "' is an unknown bound type",
13721 UTF8fARG(UTF, length, endbrace - length));
13722 NOT_REACHED; /*NOTREACHED*/
13724 RExC_parse = endbrace;
13725 REQUIRE_UNI_RULES(flagp, 0);
13730 else if (op >= BOUNDA) { /* /aa is same as /a */
13734 /* Don't have to worry about UTF-8, in this message because
13735 * to get here the contents of the \b must be ASCII */
13736 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13737 "Using /u for '%.*s' instead of /%s",
13739 endbrace - length + 1,
13740 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13741 ? ASCII_RESTRICT_PAT_MODS
13742 : ASCII_MORE_RESTRICT_PAT_MODS);
13747 RExC_seen_d_op = TRUE;
13749 else if (op == BOUNDL) {
13750 RExC_contains_locale = 1;
13754 op += NBOUND - BOUND;
13757 ret = reg_node(pRExC_state, op);
13758 FLAGS(REGNODE_p(ret)) = flags;
13760 goto finish_meta_pat;
13764 ret = reg_node(pRExC_state, LNBREAK);
13765 *flagp |= HASWIDTH|SIMPLE;
13766 goto finish_meta_pat;
13780 /* These all have the same meaning inside [brackets], and it knows
13781 * how to do the best optimizations for them. So, pretend we found
13782 * these within brackets, and let it do the work */
13785 ret = regclass(pRExC_state, flagp, depth+1,
13786 TRUE, /* means just parse this element */
13787 FALSE, /* don't allow multi-char folds */
13788 FALSE, /* don't silence non-portable warnings. It
13789 would be a bug if these returned
13791 (bool) RExC_strict,
13792 TRUE, /* Allow an optimized regnode result */
13794 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13795 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13796 * multi-char folds are allowed. */
13798 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13801 RExC_parse--; /* regclass() leaves this one too far ahead */
13804 /* The escapes above that don't take a parameter can't be
13805 * followed by a '{'. But 'pX', 'p{foo}' and
13806 * correspondingly 'P' can be */
13807 if ( RExC_parse - parse_start == 1
13808 && UCHARAT(RExC_parse + 1) == '{'
13809 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13812 vFAIL("Unescaped left brace in regex is illegal here");
13814 Set_Node_Offset(REGNODE_p(ret), parse_start);
13815 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13816 nextchar(pRExC_state);
13819 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13820 * \N{...} evaluates to a sequence of more than one code points).
13821 * The function call below returns a regnode, which is our result.
13822 * The parameters cause it to fail if the \N{} evaluates to a
13823 * single code point; we handle those like any other literal. The
13824 * reason that the multicharacter case is handled here and not as
13825 * part of the EXACtish code is because of quantifiers. In
13826 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13827 * this way makes that Just Happen. dmq.
13828 * join_exact() will join this up with adjacent EXACTish nodes
13829 * later on, if appropriate. */
13831 if (grok_bslash_N(pRExC_state,
13832 &ret, /* Want a regnode returned */
13833 NULL, /* Fail if evaluates to a single code
13835 NULL, /* Don't need a count of how many code
13844 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13846 /* Here, evaluates to a single code point. Go get that */
13847 RExC_parse = parse_start;
13850 case 'k': /* Handle \k<NAME> and \k'NAME' */
13854 if ( RExC_parse >= RExC_end - 1
13855 || (( ch = RExC_parse[1]) != '<'
13860 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13861 vFAIL2("Sequence %.2s... not terminated", parse_start);
13864 ret = handle_named_backref(pRExC_state,
13876 case '1': case '2': case '3': case '4':
13877 case '5': case '6': case '7': case '8': case '9':
13882 if (*RExC_parse == 'g') {
13886 if (*RExC_parse == '{') {
13890 if (*RExC_parse == '-') {
13894 if (hasbrace && !isDIGIT(*RExC_parse)) {
13895 if (isrel) RExC_parse--;
13897 goto parse_named_seq;
13900 if (RExC_parse >= RExC_end) {
13901 goto unterminated_g;
13903 num = S_backref_value(RExC_parse, RExC_end);
13905 vFAIL("Reference to invalid group 0");
13906 else if (num == I32_MAX) {
13907 if (isDIGIT(*RExC_parse))
13908 vFAIL("Reference to nonexistent group");
13911 vFAIL("Unterminated \\g... pattern");
13915 num = RExC_npar - num;
13917 vFAIL("Reference to nonexistent or unclosed group");
13921 num = S_backref_value(RExC_parse, RExC_end);
13922 /* bare \NNN might be backref or octal - if it is larger
13923 * than or equal RExC_npar then it is assumed to be an
13924 * octal escape. Note RExC_npar is +1 from the actual
13925 * number of parens. */
13926 /* Note we do NOT check if num == I32_MAX here, as that is
13927 * handled by the RExC_npar check */
13930 /* any numeric escape < 10 is always a backref */
13932 /* any numeric escape < RExC_npar is a backref */
13933 && num >= RExC_npar
13934 /* cannot be an octal escape if it starts with [89] */
13935 && ! inRANGE(*RExC_parse, '8', '9')
13937 /* Probably not meant to be a backref, instead likely
13938 * to be an octal character escape, e.g. \35 or \777.
13939 * The above logic should make it obvious why using
13940 * octal escapes in patterns is problematic. - Yves */
13941 RExC_parse = parse_start;
13946 /* At this point RExC_parse points at a numeric escape like
13947 * \12 or \88 or something similar, which we should NOT treat
13948 * as an octal escape. It may or may not be a valid backref
13949 * escape. For instance \88888888 is unlikely to be a valid
13951 while (isDIGIT(*RExC_parse))
13954 if (*RExC_parse != '}')
13955 vFAIL("Unterminated \\g{...} pattern");
13958 if (num >= (I32)RExC_npar) {
13960 /* It might be a forward reference; we can't fail until we
13961 * know, by completing the parse to get all the groups, and
13962 * then reparsing */
13963 if (ALL_PARENS_COUNTED) {
13964 if (num >= RExC_total_parens) {
13965 vFAIL("Reference to nonexistent group");
13969 REQUIRE_PARENS_PASS;
13973 ret = reganode(pRExC_state,
13976 : (ASCII_FOLD_RESTRICTED)
13978 : (AT_LEAST_UNI_SEMANTICS)
13984 if (OP(REGNODE_p(ret)) == REFF) {
13985 RExC_seen_d_op = TRUE;
13987 *flagp |= HASWIDTH;
13989 /* override incorrect value set in reganode MJD */
13990 Set_Node_Offset(REGNODE_p(ret), parse_start);
13991 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13992 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13993 FALSE /* Don't force to /x */ );
13997 if (RExC_parse >= RExC_end)
13998 FAIL("Trailing \\");
14001 /* Do not generate "unrecognized" warnings here, we fall
14002 back into the quick-grab loop below */
14003 RExC_parse = parse_start;
14005 } /* end of switch on a \foo sequence */
14010 /* '#' comments should have been spaced over before this function was
14012 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14014 if (RExC_flags & RXf_PMf_EXTENDED) {
14015 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14016 if (RExC_parse < RExC_end)
14026 /* Here, we have determined that the next thing is probably a
14027 * literal character. RExC_parse points to the first byte of its
14028 * definition. (It still may be an escape sequence that evaluates
14029 * to a single character) */
14034 char *s, *old_s = NULL, *old_old_s = NULL;
14036 U32 max_string_len = 255;
14038 /* We may have to reparse the node, artificially stopping filling
14039 * it early, based on info gleaned in the first parse. This
14040 * variable gives where we stop. Make it above the normal stopping
14041 * place first time through; otherwise it would stop too early */
14042 U32 upper_fill = max_string_len + 1;
14044 /* We start out as an EXACT node, even if under /i, until we find a
14045 * character which is in a fold. The algorithm now segregates into
14046 * separate nodes, characters that fold from those that don't under
14047 * /i. (This hopefully will create nodes that are fixed strings
14048 * even under /i, giving the optimizer something to grab on to.)
14049 * So, if a node has something in it and the next character is in
14050 * the opposite category, that node is closed up, and the function
14051 * returns. Then regatom is called again, and a new node is
14052 * created for the new category. */
14053 U8 node_type = EXACT;
14055 /* Assume the node will be fully used; the excess is given back at
14056 * the end. Under /i, we may need to temporarily add the fold of
14057 * an extra character or two at the end to check for splitting
14058 * multi-char folds, so allocate extra space for that. We can't
14059 * make any other length assumptions, as a byte input sequence
14060 * could shrink down. */
14061 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14065 ? UTF8_MAXBYTES_CASE
14066 /* Max non-UTF-8 expansion is 2 */ : 2)));
14068 bool next_is_quantifier;
14069 char * oldp = NULL;
14071 /* We can convert EXACTF nodes to EXACTFU if they contain only
14072 * characters that match identically regardless of the target
14073 * string's UTF8ness. The reason to do this is that EXACTF is not
14074 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14077 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14078 * contain only above-Latin1 characters (hence must be in UTF8),
14079 * which don't participate in folds with Latin1-range characters,
14080 * as the latter's folds aren't known until runtime. */
14081 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14083 /* Single-character EXACTish nodes are almost always SIMPLE. This
14084 * allows us to override this as encountered */
14085 U8 maybe_SIMPLE = SIMPLE;
14087 /* Does this node contain something that can't match unless the
14088 * target string is (also) in UTF-8 */
14089 bool requires_utf8_target = FALSE;
14091 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14092 bool has_ss = FALSE;
14094 /* So is the MICRO SIGN */
14095 bool has_micro_sign = FALSE;
14097 /* Set when we fill up the current node and there is still more
14098 * text to process */
14101 /* Allocate an EXACT node. The node_type may change below to
14102 * another EXACTish node, but since the size of the node doesn't
14103 * change, it works */
14104 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14106 FILL_NODE(ret, node_type);
14109 s = STRING(REGNODE_p(ret));
14120 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14121 maybe_SIMPLE = SIMPLE;
14122 requires_utf8_target = FALSE;
14124 has_micro_sign = FALSE;
14128 /* This breaks under rare circumstances. If folding, we do not
14129 * want to split a node at a character that is a non-final in a
14130 * multi-char fold, as an input string could just happen to want to
14131 * match across the node boundary. The code at the end of the loop
14132 * looks for this, and backs off until it finds not such a
14133 * character, but it is possible (though extremely, extremely
14134 * unlikely) for all characters in the node to be non-final fold
14135 * ones, in which case we just leave the node fully filled, and
14136 * hope that it doesn't match the string in just the wrong place */
14138 assert( ! UTF /* Is at the beginning of a character */
14139 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14140 || UTF8_IS_START(UCHARAT(RExC_parse)));
14142 overflowed = FALSE;
14144 /* Here, we have a literal character. Find the maximal string of
14145 * them in the input that we can fit into a single EXACTish node.
14146 * We quit at the first non-literal or when the node gets full, or
14147 * under /i the categorization of folding/non-folding character
14149 while (p < RExC_end && len < upper_fill) {
14151 /* In most cases each iteration adds one byte to the output.
14152 * The exceptions override this */
14153 Size_t added_len = 1;
14159 /* White space has already been ignored */
14160 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14161 || ! is_PATWS_safe((p), RExC_end, UTF));
14164 const char* message;
14177 /* Literal Escapes Switch
14179 This switch is meant to handle escape sequences that
14180 resolve to a literal character.
14182 Every escape sequence that represents something
14183 else, like an assertion or a char class, is handled
14184 in the switch marked 'Special Escapes' above in this
14185 routine, but also has an entry here as anything that
14186 isn't explicitly mentioned here will be treated as
14187 an unescaped equivalent literal.
14190 switch ((U8)*++p) {
14192 /* These are all the special escapes. */
14193 case 'A': /* Start assertion */
14194 case 'b': case 'B': /* Word-boundary assertion*/
14195 case 'C': /* Single char !DANGEROUS! */
14196 case 'd': case 'D': /* digit class */
14197 case 'g': case 'G': /* generic-backref, pos assertion */
14198 case 'h': case 'H': /* HORIZWS */
14199 case 'k': case 'K': /* named backref, keep marker */
14200 case 'p': case 'P': /* Unicode property */
14201 case 'R': /* LNBREAK */
14202 case 's': case 'S': /* space class */
14203 case 'v': case 'V': /* VERTWS */
14204 case 'w': case 'W': /* word class */
14205 case 'X': /* eXtended Unicode "combining
14206 character sequence" */
14207 case 'z': case 'Z': /* End of line/string assertion */
14211 /* Anything after here is an escape that resolves to a
14212 literal. (Except digits, which may or may not)
14218 case 'N': /* Handle a single-code point named character. */
14219 RExC_parse = p + 1;
14220 if (! grok_bslash_N(pRExC_state,
14221 NULL, /* Fail if evaluates to
14222 anything other than a
14223 single code point */
14224 &ender, /* The returned single code
14226 NULL, /* Don't need a count of
14227 how many code points */
14232 if (*flagp & NEED_UTF8)
14233 FAIL("panic: grok_bslash_N set NEED_UTF8");
14234 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14236 /* Here, it wasn't a single code point. Go close
14237 * up this EXACTish node. The switch() prior to
14238 * this switch handles the other cases */
14239 RExC_parse = p = oldp;
14243 RExC_parse = parse_start;
14245 /* The \N{} means the pattern, if previously /d,
14246 * becomes /u. That means it can't be an EXACTF node,
14247 * but an EXACTFU */
14248 if (node_type == EXACTF) {
14249 node_type = EXACTFU;
14251 /* If the node already contains something that
14252 * differs between EXACTF and EXACTFU, reparse it
14254 if (! maybe_exactfu) {
14275 ender = ESC_NATIVE;
14283 if (! grok_bslash_o(&p,
14288 (bool) RExC_strict,
14289 FALSE, /* No illegal cp's */
14292 RExC_parse = p; /* going to die anyway; point to
14293 exact spot of failure */
14297 if (message && TO_OUTPUT_WARNINGS(p)) {
14298 warn_non_literal_string(p, packed_warn, message);
14302 if (! grok_bslash_x(&p,
14307 (bool) RExC_strict,
14308 FALSE, /* No illegal cp's */
14311 RExC_parse = p; /* going to die anyway; point
14312 to exact spot of failure */
14316 if (message && TO_OUTPUT_WARNINGS(p)) {
14317 warn_non_literal_string(p, packed_warn, message);
14321 if (ender < 0x100) {
14322 if (RExC_recode_x_to_native) {
14323 ender = LATIN1_TO_NATIVE(ender);
14330 if (! grok_bslash_c(*p, &grok_c_char,
14331 &message, &packed_warn))
14333 /* going to die anyway; point to exact spot of
14335 RExC_parse = p + ((UTF)
14336 ? UTF8_SAFE_SKIP(p, RExC_end)
14341 ender = grok_c_char;
14343 if (message && TO_OUTPUT_WARNINGS(p)) {
14344 warn_non_literal_string(p, packed_warn, message);
14348 case '8': case '9': /* must be a backreference */
14350 /* we have an escape like \8 which cannot be an octal escape
14351 * so we exit the loop, and let the outer loop handle this
14352 * escape which may or may not be a legitimate backref. */
14354 case '1': case '2': case '3':case '4':
14355 case '5': case '6': case '7':
14356 /* When we parse backslash escapes there is ambiguity
14357 * between backreferences and octal escapes. Any escape
14358 * from \1 - \9 is a backreference, any multi-digit
14359 * escape which does not start with 0 and which when
14360 * evaluated as decimal could refer to an already
14361 * parsed capture buffer is a back reference. Anything
14364 * Note this implies that \118 could be interpreted as
14365 * 118 OR as "\11" . "8" depending on whether there
14366 * were 118 capture buffers defined already in the
14369 /* NOTE, RExC_npar is 1 more than the actual number of
14370 * parens we have seen so far, hence the "<" as opposed
14372 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14373 { /* Not to be treated as an octal constant, go
14381 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14382 | PERL_SCAN_NOTIFY_ILLDIGIT;
14384 ender = grok_oct(p, &numlen, &flags, NULL);
14386 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14387 && isDIGIT(*p) /* like \08, \178 */
14388 && ckWARN(WARN_REGEXP))
14390 reg_warn_non_literal_string(
14392 form_alien_digit_msg(8, numlen, p,
14393 RExC_end, UTF, FALSE));
14399 FAIL("Trailing \\");
14402 if (isALPHANUMERIC(*p)) {
14403 /* An alpha followed by '{' is going to fail next
14404 * iteration, so don't output this warning in that
14406 if (! isALPHA(*p) || *(p + 1) != '{') {
14407 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14408 " passed through", p);
14411 goto normal_default;
14412 } /* End of switch on '\' */
14415 /* Trying to gain new uses for '{' without breaking too
14416 * much existing code is hard. The solution currently
14418 * 1) If there is no ambiguity that a '{' should always
14419 * be taken literally, at the start of a construct, we
14421 * 2) If the literal '{' conflicts with our desired use
14422 * of it as a metacharacter, we die. The deprecation
14423 * cycles for this have come and gone.
14424 * 3) If there is ambiguity, we raise a simple warning.
14425 * This could happen, for example, if the user
14426 * intended it to introduce a quantifier, but slightly
14427 * misspelled the quantifier. Without this warning,
14428 * the quantifier would silently be taken as a literal
14429 * string of characters instead of a meta construct */
14430 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14432 || ( p > parse_start + 1
14433 && isALPHA_A(*(p - 1))
14434 && *(p - 2) == '\\')
14435 || new_regcurly(p, RExC_end))
14437 RExC_parse = p + 1;
14438 vFAIL("Unescaped left brace in regex is "
14441 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14442 " passed through");
14444 goto normal_default;
14447 if (p > RExC_parse && RExC_strict) {
14448 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14451 default: /* A literal character */
14453 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14455 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14456 &numlen, UTF8_ALLOW_DEFAULT);
14462 } /* End of switch on the literal */
14464 /* Here, have looked at the literal character, and <ender>
14465 * contains its ordinal; <p> points to the character after it.
14469 REQUIRE_UTF8(flagp);
14470 if ( UNICODE_IS_PERL_EXTENDED(ender)
14471 && TO_OUTPUT_WARNINGS(p))
14473 ckWARN2_non_literal_string(p,
14474 packWARN(WARN_PORTABLE),
14475 PL_extended_cp_format,
14480 /* We need to check if the next non-ignored thing is a
14481 * quantifier. Move <p> to after anything that should be
14482 * ignored, which, as a side effect, positions <p> for the next
14483 * loop iteration */
14484 skip_to_be_ignored_text(pRExC_state, &p,
14485 FALSE /* Don't force to /x */ );
14487 /* If the next thing is a quantifier, it applies to this
14488 * character only, which means that this character has to be in
14489 * its own node and can't just be appended to the string in an
14490 * existing node, so if there are already other characters in
14491 * the node, close the node with just them, and set up to do
14492 * this character again next time through, when it will be the
14493 * only thing in its new node */
14495 next_is_quantifier = LIKELY(p < RExC_end)
14496 && UNLIKELY(ISMULT2(p));
14498 if (next_is_quantifier && LIKELY(len)) {
14503 /* Ready to add 'ender' to the node */
14505 if (! FOLD) { /* The simple case, just append the literal */
14508 /* Don't output if it would overflow */
14509 if (UNLIKELY(len > max_string_len - ((UTF)
14510 ? UVCHR_SKIP(ender)
14517 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14518 *(s++) = (char) ender;
14521 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14522 added_len = (char *) new_s - s;
14523 s = (char *) new_s;
14526 requires_utf8_target = TRUE;
14530 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14532 /* Here are folding under /l, and the code point is
14533 * problematic. If this is the first character in the
14534 * node, change the node type to folding. Otherwise, if
14535 * this is the first problematic character, close up the
14536 * existing node, so can start a new node with this one */
14538 node_type = EXACTFL;
14539 RExC_contains_locale = 1;
14541 else if (node_type == EXACT) {
14546 /* This problematic code point means we can't simplify
14548 maybe_exactfu = FALSE;
14550 /* Here, we are adding a problematic fold character.
14551 * "Problematic" in this context means that its fold isn't
14552 * known until runtime. (The non-problematic code points
14553 * are the above-Latin1 ones that fold to also all
14554 * above-Latin1. Their folds don't vary no matter what the
14555 * locale is.) But here we have characters whose fold
14556 * depends on the locale. We just add in the unfolded
14557 * character, and wait until runtime to fold it */
14558 goto not_fold_common;
14560 else /* regular fold; see if actually is in a fold */
14561 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14563 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14565 /* Here, folding, but the character isn't in a fold.
14567 * Start a new node if previous characters in the node were
14569 if (len && node_type != EXACT) {
14574 /* Here, continuing a node with non-folded characters. Add
14576 goto not_fold_common;
14578 else { /* Here, does participate in some fold */
14580 /* If this is the first character in the node, change its
14581 * type to folding. Otherwise, if this is the first
14582 * folding character in the node, close up the existing
14583 * node, so can start a new node with this one. */
14585 node_type = compute_EXACTish(pRExC_state);
14587 else if (node_type == EXACT) {
14592 if (UTF) { /* Alway use the folded value for UTF-8
14594 if (UVCHR_IS_INVARIANT(ender)) {
14595 if (UNLIKELY(len + 1 > max_string_len)) {
14600 *(s)++ = (U8) toFOLD(ender);
14603 UV folded = _to_uni_fold_flags(
14605 (U8 *) s, /* We have allocated extra space
14606 in 's' so can't run off the
14609 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14610 ? FOLD_FLAGS_NOMIX_ASCII
14612 if (UNLIKELY(len + added_len > max_string_len)) {
14620 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14622 /* U+B5 folds to the MU, so its possible for a
14623 * non-UTF-8 target to match it */
14624 requires_utf8_target = TRUE;
14628 else { /* Here is non-UTF8. */
14630 /* The fold will be one or (rarely) two characters.
14631 * Check that there's room for at least a single one
14632 * before setting any flags, etc. Because otherwise an
14633 * overflowing character could cause a flag to be set
14634 * even though it doesn't end up in this node. (For
14635 * the two character fold, we check again, before
14636 * setting any flags) */
14637 if (UNLIKELY(len + 1 > max_string_len)) {
14642 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14643 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14644 || UNICODE_DOT_DOT_VERSION > 0)
14646 /* On non-ancient Unicodes, check for the only possible
14647 * multi-char fold */
14648 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14650 /* This potential multi-char fold means the node
14651 * can't be simple (because it could match more
14652 * than a single char). And in some cases it will
14653 * match 'ss', so set that flag */
14657 /* It can't change to be an EXACTFU (unless already
14658 * is one). We fold it iff under /u rules. */
14659 if (node_type != EXACTFU) {
14660 maybe_exactfu = FALSE;
14663 if (UNLIKELY(len + 2 > max_string_len)) {
14672 goto done_with_this_char;
14675 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14677 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14679 /* Also, the sequence 'ss' is special when not
14680 * under /u. If the target string is UTF-8, it
14681 * should match SHARP S; otherwise it won't. So,
14682 * here we have to exclude the possibility of this
14683 * node moving to /u.*/
14685 maybe_exactfu = FALSE;
14688 /* Here, the fold will be a single character */
14690 if (UNLIKELY(ender == MICRO_SIGN)) {
14691 has_micro_sign = TRUE;
14693 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14695 /* If the character's fold differs between /d and
14696 * /u, this can't change to be an EXACTFU node */
14697 maybe_exactfu = FALSE;
14700 *(s++) = (DEPENDS_SEMANTICS)
14701 ? (char) toFOLD(ender)
14703 /* Under /u, the fold of any character in
14704 * the 0-255 range happens to be its
14705 * lowercase equivalent, except for LATIN
14706 * SMALL LETTER SHARP S, which was handled
14707 * above, and the MICRO SIGN, whose fold
14708 * requires UTF-8 to represent. */
14709 : (char) toLOWER_L1(ender);
14711 } /* End of adding current character to the node */
14713 done_with_this_char:
14717 if (next_is_quantifier) {
14719 /* Here, the next input is a quantifier, and to get here,
14720 * the current character is the only one in the node. */
14724 } /* End of loop through literal characters */
14726 /* Here we have either exhausted the input or run out of room in
14727 * the node. If the former, we are done. (If we encountered a
14728 * character that can't be in the node, transfer is made directly
14729 * to <loopdone>, and so we wouldn't have fallen off the end of the
14731 if (LIKELY(! overflowed)) {
14735 /* Here we have run out of room. We can grow plain EXACT and
14736 * LEXACT nodes. If the pattern is gigantic enough, though,
14737 * eventually we'll have to artificially chunk the pattern into
14738 * multiple nodes. */
14739 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14740 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14741 Size_t overhead_expansion = 0;
14743 Size_t max_nodes_for_string;
14747 /* Here we couldn't fit the final character in the current
14748 * node, so it will have to be reparsed, no matter what else we
14752 /* If would have overflowed a regular EXACT node, switch
14753 * instead to an LEXACT. The code below is structured so that
14754 * the actual growing code is common to changing from an EXACT
14755 * or just increasing the LEXACT size. This means that we have
14756 * to save the string in the EXACT case before growing, and
14757 * then copy it afterwards to its new location */
14758 if (node_type == EXACT) {
14759 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14760 RExC_emit += overhead_expansion;
14761 Copy(s0, temp, len, char);
14764 /* Ready to grow. If it was a plain EXACT, the string was
14765 * saved, and the first few bytes of it overwritten by adding
14766 * an argument field. We assume, as we do elsewhere in this
14767 * file, that one byte of remaining input will translate into
14768 * one byte of output, and if that's too small, we grow again,
14769 * if too large the excess memory is freed at the end */
14771 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14772 achievable = MIN(max_nodes_for_string,
14773 current_string_nodes + STR_SZ(RExC_end - p));
14774 delta = achievable - current_string_nodes;
14776 /* If there is just no more room, go finish up this chunk of
14782 change_engine_size(pRExC_state, delta + overhead_expansion);
14783 current_string_nodes += delta;
14785 = sizeof(struct regnode) * current_string_nodes;
14786 upper_fill = max_string_len + 1;
14788 /* If the length was small, we know this was originally an
14789 * EXACT node now converted to LEXACT, and the string has to be
14790 * restored. Otherwise the string was untouched. 260 is just
14791 * a number safely above 255 so don't have to worry about
14792 * getting it precise */
14794 node_type = LEXACT;
14795 FILL_NODE(ret, node_type);
14796 s0 = STRING(REGNODE_p(ret));
14797 Copy(temp, s0, len, char);
14801 goto continue_parse;
14804 bool splittable = FALSE;
14805 bool backed_up = FALSE;
14806 char * e; /* should this be U8? */
14807 char * s_start; /* should this be U8? */
14809 /* Here is /i. Running out of room creates a problem if we are
14810 * folding, and the split happens in the middle of a
14811 * multi-character fold, as a match that should have occurred,
14812 * won't, due to the way nodes are matched, and our artificial
14813 * boundary. So back off until we aren't splitting such a
14814 * fold. If there is no such place to back off to, we end up
14815 * taking the entire node as-is. This can happen if the node
14816 * consists entirely of 'f' or entirely of 's' characters (or
14817 * things that fold to them) as 'ff' and 'ss' are
14818 * multi-character folds.
14820 * The Unicode standard says that multi character folds consist
14821 * of either two or three characters. That means we would be
14822 * splitting one if the final character in the node is at the
14823 * beginning of either type, or is the second of a three
14827 * ender is the code point of the character that won't fit
14829 * s points to just beyond the final byte in the node.
14830 * It's where we would place ender if there were
14831 * room, and where in fact we do place ender's fold
14832 * in the code below, as we've over-allocated space
14833 * for s0 (hence s) to allow for this
14834 * e starts at 's' and advances as we append things.
14835 * old_s is the same as 's'. (If ender had fit, 's' would
14836 * have been advanced to beyond it).
14837 * old_old_s points to the beginning byte of the final
14838 * character in the node
14839 * p points to the beginning byte in the input of the
14840 * character beyond 'ender'.
14841 * oldp points to the beginning byte in the input of
14844 * In the case of /il, we haven't folded anything that could be
14845 * affected by the locale. That means only above-Latin1
14846 * characters that fold to other above-latin1 characters get
14847 * folded at compile time. To check where a good place to
14848 * split nodes is, everything in it will have to be folded.
14849 * The boolean 'maybe_exactfu' keeps track in /il if there are
14850 * any unfolded characters in the node. */
14851 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14853 /* If we do need to fold the node, we need a place to store the
14854 * folded copy, and a way to map back to the unfolded original
14856 char * locfold_buf = NULL;
14857 Size_t * loc_correspondence = NULL;
14859 if (! need_to_fold_loc) { /* The normal case. Just
14860 initialize to the actual node */
14863 s = old_old_s; /* Point to the beginning of the final char
14864 that fits in the node */
14868 /* Here, we have filled a /il node, and there are unfolded
14869 * characters in it. If the runtime locale turns out to be
14870 * UTF-8, there are possible multi-character folds, just
14871 * like when not under /l. The node hence can't terminate
14872 * in the middle of such a fold. To determine this, we
14873 * have to create a folded copy of this node. That means
14874 * reparsing the node, folding everything assuming a UTF-8
14875 * locale. (If at runtime it isn't such a locale, the
14876 * actions here wouldn't have been necessary, but we have
14877 * to assume the worst case.) If we find we need to back
14878 * off the folded string, we do so, and then map that
14879 * position back to the original unfolded node, which then
14880 * gets output, truncated at that spot */
14882 char * redo_p = RExC_parse;
14886 /* Allow enough space assuming a single byte input folds to
14887 * a single byte output, plus assume that the two unparsed
14888 * characters (that we may need) fold to the largest number
14889 * of bytes possible, plus extra for one more worst case
14890 * scenario. In the loop below, if we start eating into
14891 * that final spare space, we enlarge this initial space */
14892 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14894 Newxz(locfold_buf, size, char);
14895 Newxz(loc_correspondence, size, Size_t);
14897 /* Redo this node's parse, folding into 'locfold_buf' */
14898 redo_p = RExC_parse;
14899 old_redo_e = redo_e = locfold_buf;
14900 while (redo_p <= oldp) {
14902 old_redo_e = redo_e;
14903 loc_correspondence[redo_e - locfold_buf]
14904 = redo_p - RExC_parse;
14909 (void) _to_utf8_fold_flags((U8 *) redo_p,
14914 redo_e += added_len;
14915 redo_p += UTF8SKIP(redo_p);
14919 /* Note that if this code is run on some ancient
14920 * Unicode versions, SHARP S doesn't fold to 'ss',
14921 * but rather than clutter the code with #ifdef's,
14922 * as is done above, we ignore that possibility.
14923 * This is ok because this code doesn't affect what
14924 * gets matched, but merely where the node gets
14926 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14927 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14937 /* If we're getting so close to the end that a
14938 * worst-case fold in the next character would cause us
14939 * to overflow, increase, assuming one byte output byte
14940 * per one byte input one, plus room for another worst
14942 if ( redo_p <= oldp
14943 && redo_e > locfold_buf + size
14944 - (UTF8_MAXBYTES_CASE + 1))
14946 Size_t new_size = size
14948 + UTF8_MAXBYTES_CASE + 1;
14949 Ptrdiff_t e_offset = redo_e - locfold_buf;
14951 Renew(locfold_buf, new_size, char);
14952 Renew(loc_correspondence, new_size, Size_t);
14955 redo_e = locfold_buf + e_offset;
14959 /* Set so that things are in terms of the folded, temporary
14962 s_start = locfold_buf;
14967 /* Here, we have 's', 's_start' and 'e' set up to point to the
14968 * input that goes into the node, folded.
14970 * If the final character of the node and the fold of ender
14971 * form the first two characters of a three character fold, we
14972 * need to peek ahead at the next (unparsed) character in the
14973 * input to determine if the three actually do form such a
14974 * fold. Just looking at that character is not generally
14975 * sufficient, as it could be, for example, an escape sequence
14976 * that evaluates to something else, and it needs to be folded.
14978 * khw originally thought to just go through the parse loop one
14979 * extra time, but that doesn't work easily as that iteration
14980 * could cause things to think that the parse is over and to
14981 * goto loopdone. The character could be a '$' for example, or
14982 * the character beyond could be a quantifier, and other
14983 * glitches as well.
14985 * The solution used here for peeking ahead is to look at that
14986 * next character. If it isn't ASCII punctuation, then it will
14987 * be something that would continue on in an EXACTish node if
14988 * there were space. We append the fold of it to s, having
14989 * reserved enough room in s0 for the purpose. If we can't
14990 * reasonably peek ahead, we instead assume the worst case:
14991 * that it is something that would form the completion of a
14994 * If we can't split between s and ender, we work backwards
14995 * character-by-character down to s0. At each current point
14996 * see if we are at the beginning of a multi-char fold. If so,
14997 * that means we would be splitting the fold across nodes, and
14998 * so we back up one and try again.
15000 * If we're not at the beginning, we still could be at the
15001 * final two characters of a (rare) three character fold. We
15002 * check if the sequence starting at the character before the
15003 * current position (and including the current and next
15004 * characters) is a three character fold. If not, the node can
15005 * be split here. If it is, we have to backup two characters
15008 * Otherwise, the node can be split at the current position.
15010 * The same logic is used for UTF-8 patterns and not */
15014 /* Append the fold of ender */
15015 (void) _to_uni_fold_flags(
15019 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15020 ? FOLD_FLAGS_NOMIX_ASCII
15024 /* 's' and the character folded to by ender may be the
15025 * first two of a three-character fold, in which case the
15026 * node should not be split here. That may mean examining
15027 * the so-far unparsed character starting at 'p'. But if
15028 * ender folded to more than one character, we already have
15029 * three characters to look at. Also, we first check if
15030 * the sequence consisting of s and the next character form
15031 * the first two of some three character fold. If not,
15032 * there's no need to peek ahead. */
15033 if ( added_len <= UTF8SKIP(e - added_len)
15034 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15036 /* Here, the two do form the beginning of a potential
15037 * three character fold. The unexamined character may
15038 * or may not complete it. Peek at it. It might be
15039 * something that ends the node or an escape sequence,
15040 * in which case we don't know without a lot of work
15041 * what it evaluates to, so we have to assume the worst
15042 * case: that it does complete the fold, and so we
15043 * can't split here. All such instances will have
15044 * that character be an ASCII punctuation character,
15045 * like a backslash. So, for that case, backup one and
15046 * drop down to try at that position */
15048 s = (char *) utf8_hop_back((U8 *) s, -1,
15053 /* Here, since it's not punctuation, it must be a
15054 * real character, and we can append its fold to
15055 * 'e' (having deliberately reserved enough space
15056 * for this eventuality) and drop down to check if
15057 * the three actually do form a folded sequence */
15058 (void) _to_utf8_fold_flags(
15059 (U8 *) p, (U8 *) RExC_end,
15062 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15063 ? FOLD_FLAGS_NOMIX_ASCII
15069 /* Here, we either have three characters available in
15070 * sequence starting at 's', or we have two characters and
15071 * know that the following one can't possibly be part of a
15072 * three character fold. We go through the node backwards
15073 * until we find a place where we can split it without
15074 * breaking apart a multi-character fold. At any given
15075 * point we have to worry about if such a fold begins at
15076 * the current 's', and also if a three-character fold
15077 * begins at s-1, (containing s and s+1). Splitting in
15078 * either case would break apart a fold */
15080 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15083 /* If is a multi-char fold, can't split here. Backup
15084 * one char and try again */
15085 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15091 /* If the two characters beginning at 's' are part of a
15092 * three character fold starting at the character
15093 * before s, we can't split either before or after s.
15094 * Backup two chars and try again */
15095 if ( LIKELY(s > s_start)
15096 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15099 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15104 /* Here there's no multi-char fold between s and the
15105 * next character following it. We can split */
15109 } while (s > s_start); /* End of loops backing up through the node */
15111 /* Here we either couldn't find a place to split the node,
15112 * or else we broke out of the loop setting 'splittable' to
15113 * true. In the latter case, the place to split is between
15114 * the first and second characters in the sequence starting
15120 else { /* Pattern not UTF-8 */
15121 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15122 || ASCII_FOLD_RESTRICTED)
15124 assert( toLOWER_L1(ender) < 256 );
15125 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15133 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15140 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15141 || ASCII_FOLD_RESTRICTED)
15143 assert( toLOWER_L1(ender) < 256 );
15144 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15154 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15160 if ( LIKELY(s > s_start)
15161 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15171 } while (s > s_start);
15178 /* Here, we are done backing up. If we didn't backup at all
15179 * (the likely case), just proceed */
15182 /* If we did find a place to split, reparse the entire node
15183 * stopping where we have calculated. */
15186 /* If we created a temporary folded string under /l, we
15187 * have to map that back to the original */
15188 if (need_to_fold_loc) {
15189 upper_fill = loc_correspondence[s - s_start];
15190 if (upper_fill == 0) {
15191 FAIL2("panic: loc_correspondence[%d] is 0",
15192 (int) (s - s_start));
15194 Safefree(locfold_buf);
15195 Safefree(loc_correspondence);
15198 upper_fill = s - s0;
15203 /* Here the node consists entirely of non-final multi-char
15204 * folds. (Likely it is all 'f's or all 's's.) There's no
15205 * decent place to split it, so give up and just take the
15210 if (need_to_fold_loc) {
15211 Safefree(locfold_buf);
15212 Safefree(loc_correspondence);
15214 } /* End of verifying node ends with an appropriate char */
15216 /* We need to start the next node at the character that didn't fit
15220 loopdone: /* Jumped to when encounters something that shouldn't be
15223 /* Free up any over-allocated space; cast is to silence bogus
15224 * warning in MS VC */
15225 change_engine_size(pRExC_state,
15226 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15228 /* I (khw) don't know if you can get here with zero length, but the
15229 * old code handled this situation by creating a zero-length EXACT
15230 * node. Might as well be NOTHING instead */
15232 OP(REGNODE_p(ret)) = NOTHING;
15236 /* If the node type is EXACT here, check to see if it
15237 * should be EXACTL, or EXACT_REQ8. */
15238 if (node_type == EXACT) {
15240 node_type = EXACTL;
15242 else if (requires_utf8_target) {
15243 node_type = EXACT_REQ8;
15246 else if (node_type == LEXACT) {
15247 if (requires_utf8_target) {
15248 node_type = LEXACT_REQ8;
15252 if ( UNLIKELY(has_micro_sign || has_ss)
15253 && (node_type == EXACTFU || ( node_type == EXACTF
15254 && maybe_exactfu)))
15255 { /* These two conditions are problematic in non-UTF-8
15258 node_type = EXACTFUP;
15260 else if (node_type == EXACTFL) {
15262 /* 'maybe_exactfu' is deliberately set above to
15263 * indicate this node type, where all code points in it
15265 if (maybe_exactfu) {
15266 node_type = EXACTFLU8;
15269 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15271 /* A character that folds to more than one will
15272 * match multiple characters, so can't be SIMPLE.
15273 * We don't have to worry about this with EXACTFLU8
15274 * nodes just above, as they have already been
15275 * folded (since the fold doesn't vary at run
15276 * time). Here, if the final character in the node
15277 * folds to multiple, it can't be simple. (This
15278 * only has an effect if the node has only a single
15279 * character, hence the final one, as elsewhere we
15280 * turn off simple for nodes whose length > 1 */
15284 else if (node_type == EXACTF) { /* Means is /di */
15286 /* This intermediate variable is needed solely because
15287 * the asserts in the macro where used exceed Win32's
15288 * literal string capacity */
15289 char first_char = * STRING(REGNODE_p(ret));
15291 /* If 'maybe_exactfu' is clear, then we need to stay
15292 * /di. If it is set, it means there are no code
15293 * points that match differently depending on UTF8ness
15294 * of the target string, so it can become an EXACTFU
15296 if (! maybe_exactfu) {
15297 RExC_seen_d_op = TRUE;
15299 else if ( isALPHA_FOLD_EQ(first_char, 's')
15300 || isALPHA_FOLD_EQ(ender, 's'))
15302 /* But, if the node begins or ends in an 's' we
15303 * have to defer changing it into an EXACTFU, as
15304 * the node could later get joined with another one
15305 * that ends or begins with 's' creating an 'ss'
15306 * sequence which would then wrongly match the
15307 * sharp s without the target being UTF-8. We
15308 * create a special node that we resolve later when
15309 * we join nodes together */
15311 node_type = EXACTFU_S_EDGE;
15314 node_type = EXACTFU;
15318 if (requires_utf8_target && node_type == EXACTFU) {
15319 node_type = EXACTFU_REQ8;
15323 OP(REGNODE_p(ret)) = node_type;
15324 setSTR_LEN(REGNODE_p(ret), len);
15325 RExC_emit += STR_SZ(len);
15327 /* If the node isn't a single character, it can't be SIMPLE */
15328 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15332 *flagp |= HASWIDTH | maybe_SIMPLE;
15335 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15339 /* len is STRLEN which is unsigned, need to copy to signed */
15342 vFAIL("Internal disaster");
15345 } /* End of label 'defchar:' */
15347 } /* End of giant switch on input character */
15349 /* Position parse to next real character */
15350 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15351 FALSE /* Don't force to /x */ );
15352 if ( *RExC_parse == '{'
15353 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15355 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15357 vFAIL("Unescaped left brace in regex is illegal here");
15359 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15360 " passed through");
15368 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15370 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15371 * sets up the bitmap and any flags, removing those code points from the
15372 * inversion list, setting it to NULL should it become completely empty */
15375 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15376 assert(PL_regkind[OP(node)] == ANYOF);
15378 /* There is no bitmap for this node type */
15379 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15383 ANYOF_BITMAP_ZERO(node);
15384 if (*invlist_ptr) {
15386 /* This gets set if we actually need to modify things */
15387 bool change_invlist = FALSE;
15391 /* Start looking through *invlist_ptr */
15392 invlist_iterinit(*invlist_ptr);
15393 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15397 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15398 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15401 /* Quit if are above what we should change */
15402 if (start >= NUM_ANYOF_CODE_POINTS) {
15406 change_invlist = TRUE;
15408 /* Set all the bits in the range, up to the max that we are doing */
15409 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15411 : NUM_ANYOF_CODE_POINTS - 1;
15412 for (i = start; i <= (int) high; i++) {
15413 ANYOF_BITMAP_SET(node, i);
15416 invlist_iterfinish(*invlist_ptr);
15418 /* Done with loop; remove any code points that are in the bitmap from
15419 * *invlist_ptr; similarly for code points above the bitmap if we have
15420 * a flag to match all of them anyways */
15421 if (change_invlist) {
15422 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15424 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15425 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15428 /* If have completely emptied it, remove it completely */
15429 if (_invlist_len(*invlist_ptr) == 0) {
15430 SvREFCNT_dec_NN(*invlist_ptr);
15431 *invlist_ptr = NULL;
15436 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15437 Character classes ([:foo:]) can also be negated ([:^foo:]).
15438 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15439 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15440 but trigger failures because they are currently unimplemented. */
15442 #define POSIXCC_DONE(c) ((c) == ':')
15443 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15444 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15445 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15447 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15448 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15449 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15451 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15453 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15455 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15456 if (posix_warnings) { \
15457 if (! RExC_warn_text ) RExC_warn_text = \
15458 (AV *) sv_2mortal((SV *) newAV()); \
15459 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15463 REPORT_LOCATION_ARGS(p))); \
15466 #define CLEAR_POSIX_WARNINGS() \
15468 if (posix_warnings && RExC_warn_text) \
15469 av_clear(RExC_warn_text); \
15472 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15474 CLEAR_POSIX_WARNINGS(); \
15479 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15481 const char * const s, /* Where the putative posix class begins.
15482 Normally, this is one past the '['. This
15483 parameter exists so it can be somewhere
15484 besides RExC_parse. */
15485 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15487 AV ** posix_warnings, /* Where to place any generated warnings, or
15489 const bool check_only /* Don't die if error */
15492 /* This parses what the caller thinks may be one of the three POSIX
15494 * 1) a character class, like [:blank:]
15495 * 2) a collating symbol, like [. .]
15496 * 3) an equivalence class, like [= =]
15497 * In the latter two cases, it croaks if it finds a syntactically legal
15498 * one, as these are not handled by Perl.
15500 * The main purpose is to look for a POSIX character class. It returns:
15501 * a) the class number
15502 * if it is a completely syntactically and semantically legal class.
15503 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15504 * closing ']' of the class
15505 * b) OOB_NAMEDCLASS
15506 * if it appears that one of the three POSIX constructs was meant, but
15507 * its specification was somehow defective. 'updated_parse_ptr', if
15508 * not NULL, is set to point to the character just after the end
15509 * character of the class. See below for handling of warnings.
15510 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15511 * if it doesn't appear that a POSIX construct was intended.
15512 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15515 * In b) there may be errors or warnings generated. If 'check_only' is
15516 * TRUE, then any errors are discarded. Warnings are returned to the
15517 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15518 * instead it is NULL, warnings are suppressed.
15520 * The reason for this function, and its complexity is that a bracketed
15521 * character class can contain just about anything. But it's easy to
15522 * mistype the very specific posix class syntax but yielding a valid
15523 * regular bracketed class, so it silently gets compiled into something
15524 * quite unintended.
15526 * The solution adopted here maintains backward compatibility except that
15527 * it adds a warning if it looks like a posix class was intended but
15528 * improperly specified. The warning is not raised unless what is input
15529 * very closely resembles one of the 14 legal posix classes. To do this,
15530 * it uses fuzzy parsing. It calculates how many single-character edits it
15531 * would take to transform what was input into a legal posix class. Only
15532 * if that number is quite small does it think that the intention was a
15533 * posix class. Obviously these are heuristics, and there will be cases
15534 * where it errs on one side or another, and they can be tweaked as
15535 * experience informs.
15537 * The syntax for a legal posix class is:
15539 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15541 * What this routine considers syntactically to be an intended posix class
15542 * is this (the comments indicate some restrictions that the pattern
15545 * qr/(?x: \[? # The left bracket, possibly
15547 * \h* # possibly followed by blanks
15548 * (?: \^ \h* )? # possibly a misplaced caret
15549 * [:;]? # The opening class character,
15550 * # possibly omitted. A typo
15551 * # semi-colon can also be used.
15553 * \^? # possibly a correctly placed
15554 * # caret, but not if there was also
15555 * # a misplaced one
15557 * .{3,15} # The class name. If there are
15558 * # deviations from the legal syntax,
15559 * # its edit distance must be close
15560 * # to a real class name in order
15561 * # for it to be considered to be
15562 * # an intended posix class.
15564 * [[:punct:]]? # The closing class character,
15565 * # possibly omitted. If not a colon
15566 * # nor semi colon, the class name
15567 * # must be even closer to a valid
15570 * \]? # The right bracket, possibly
15574 * In the above, \h must be ASCII-only.
15576 * These are heuristics, and can be tweaked as field experience dictates.
15577 * There will be cases when someone didn't intend to specify a posix class
15578 * that this warns as being so. The goal is to minimize these, while
15579 * maximizing the catching of things intended to be a posix class that
15580 * aren't parsed as such.
15584 const char * const e = RExC_end;
15585 unsigned complement = 0; /* If to complement the class */
15586 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15587 bool has_opening_bracket = FALSE;
15588 bool has_opening_colon = FALSE;
15589 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15591 const char * possible_end = NULL; /* used for a 2nd parse pass */
15592 const char* name_start; /* ptr to class name first char */
15594 /* If the number of single-character typos the input name is away from a
15595 * legal name is no more than this number, it is considered to have meant
15596 * the legal name */
15597 int max_distance = 2;
15599 /* to store the name. The size determines the maximum length before we
15600 * decide that no posix class was intended. Should be at least
15601 * sizeof("alphanumeric") */
15603 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15605 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15607 CLEAR_POSIX_WARNINGS();
15610 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15613 if (*(p - 1) != '[') {
15614 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15615 found_problem = TRUE;
15618 has_opening_bracket = TRUE;
15621 /* They could be confused and think you can put spaces between the
15624 found_problem = TRUE;
15628 } while (p < e && isBLANK(*p));
15630 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15633 /* For [. .] and [= =]. These are quite different internally from [: :],
15634 * so they are handled separately. */
15635 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15636 and 1 for at least one char in it
15639 const char open_char = *p;
15640 const char * temp_ptr = p + 1;
15642 /* These two constructs are not handled by perl, and if we find a
15643 * syntactically valid one, we croak. khw, who wrote this code, finds
15644 * this explanation of them very unclear:
15645 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15646 * And searching the rest of the internet wasn't very helpful either.
15647 * It looks like just about any byte can be in these constructs,
15648 * depending on the locale. But unless the pattern is being compiled
15649 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15650 * In that case, it looks like [= =] isn't allowed at all, and that
15651 * [. .] could be any single code point, but for longer strings the
15652 * constituent characters would have to be the ASCII alphabetics plus
15653 * the minus-hyphen. Any sensible locale definition would limit itself
15654 * to these. And any portable one definitely should. Trying to parse
15655 * the general case is a nightmare (see [perl #127604]). So, this code
15656 * looks only for interiors of these constructs that match:
15658 * Using \w relaxes the apparent rules a little, without adding much
15659 * danger of mistaking something else for one of these constructs.
15661 * [. .] in some implementations described on the internet is usable to
15662 * escape a character that otherwise is special in bracketed character
15663 * classes. For example [.].] means a literal right bracket instead of
15664 * the ending of the class
15666 * [= =] can legitimately contain a [. .] construct, but we don't
15667 * handle this case, as that [. .] construct will later get parsed
15668 * itself and croak then. And [= =] is checked for even when not under
15669 * /l, as Perl has long done so.
15671 * The code below relies on there being a trailing NUL, so it doesn't
15672 * have to keep checking if the parse ptr < e.
15674 if (temp_ptr[1] == open_char) {
15677 else while ( temp_ptr < e
15678 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15683 if (*temp_ptr == open_char) {
15685 if (*temp_ptr == ']') {
15687 if (! found_problem && ! check_only) {
15688 RExC_parse = (char *) temp_ptr;
15689 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15690 "extensions", open_char, open_char);
15693 /* Here, the syntax wasn't completely valid, or else the call
15694 * is to check-only */
15695 if (updated_parse_ptr) {
15696 *updated_parse_ptr = (char *) temp_ptr;
15699 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15703 /* If we find something that started out to look like one of these
15704 * constructs, but isn't, we continue below so that it can be checked
15705 * for being a class name with a typo of '.' or '=' instead of a colon.
15709 /* Here, we think there is a possibility that a [: :] class was meant, and
15710 * we have the first real character. It could be they think the '^' comes
15713 found_problem = TRUE;
15714 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15719 found_problem = TRUE;
15723 } while (p < e && isBLANK(*p));
15725 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15729 /* But the first character should be a colon, which they could have easily
15730 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15731 * distinguish from a colon, so treat that as a colon). */
15734 has_opening_colon = TRUE;
15736 else if (*p == ';') {
15737 found_problem = TRUE;
15739 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15740 has_opening_colon = TRUE;
15743 found_problem = TRUE;
15744 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15746 /* Consider an initial punctuation (not one of the recognized ones) to
15747 * be a left terminator */
15748 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15753 /* They may think that you can put spaces between the components */
15755 found_problem = TRUE;
15759 } while (p < e && isBLANK(*p));
15761 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15766 /* We consider something like [^:^alnum:]] to not have been intended to
15767 * be a posix class, but XXX maybe we should */
15769 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15776 /* Again, they may think that you can put spaces between the components */
15778 found_problem = TRUE;
15782 } while (p < e && isBLANK(*p));
15784 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15789 /* XXX This ']' may be a typo, and something else was meant. But
15790 * treating it as such creates enough complications, that that
15791 * possibility isn't currently considered here. So we assume that the
15792 * ']' is what is intended, and if we've already found an initial '[',
15793 * this leaves this construct looking like [:] or [:^], which almost
15794 * certainly weren't intended to be posix classes */
15795 if (has_opening_bracket) {
15796 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15799 /* But this function can be called when we parse the colon for
15800 * something like qr/[alpha:]]/, so we back up to look for the
15805 found_problem = TRUE;
15806 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15808 else if (*p != ':') {
15810 /* XXX We are currently very restrictive here, so this code doesn't
15811 * consider the possibility that, say, /[alpha.]]/ was intended to
15812 * be a posix class. */
15813 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15816 /* Here we have something like 'foo:]'. There was no initial colon,
15817 * and we back up over 'foo. XXX Unlike the going forward case, we
15818 * don't handle typos of non-word chars in the middle */
15819 has_opening_colon = FALSE;
15822 while (p > RExC_start && isWORDCHAR(*p)) {
15827 /* Here, we have positioned ourselves to where we think the first
15828 * character in the potential class is */
15831 /* Now the interior really starts. There are certain key characters that
15832 * can end the interior, or these could just be typos. To catch both
15833 * cases, we may have to do two passes. In the first pass, we keep on
15834 * going unless we come to a sequence that matches
15835 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15836 * This means it takes a sequence to end the pass, so two typos in a row if
15837 * that wasn't what was intended. If the class is perfectly formed, just
15838 * this one pass is needed. We also stop if there are too many characters
15839 * being accumulated, but this number is deliberately set higher than any
15840 * real class. It is set high enough so that someone who thinks that
15841 * 'alphanumeric' is a correct name would get warned that it wasn't.
15842 * While doing the pass, we keep track of where the key characters were in
15843 * it. If we don't find an end to the class, and one of the key characters
15844 * was found, we redo the pass, but stop when we get to that character.
15845 * Thus the key character was considered a typo in the first pass, but a
15846 * terminator in the second. If two key characters are found, we stop at
15847 * the second one in the first pass. Again this can miss two typos, but
15848 * catches a single one
15850 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15851 * point to the first key character. For the second pass, it starts as -1.
15857 bool has_blank = FALSE;
15858 bool has_upper = FALSE;
15859 bool has_terminating_colon = FALSE;
15860 bool has_terminating_bracket = FALSE;
15861 bool has_semi_colon = FALSE;
15862 unsigned int name_len = 0;
15863 int punct_count = 0;
15867 /* Squeeze out blanks when looking up the class name below */
15868 if (isBLANK(*p) ) {
15870 found_problem = TRUE;
15875 /* The name will end with a punctuation */
15877 const char * peek = p + 1;
15879 /* Treat any non-']' punctuation followed by a ']' (possibly
15880 * with intervening blanks) as trying to terminate the class.
15881 * ']]' is very likely to mean a class was intended (but
15882 * missing the colon), but the warning message that gets
15883 * generated shows the error position better if we exit the
15884 * loop at the bottom (eventually), so skip it here. */
15886 if (peek < e && isBLANK(*peek)) {
15888 found_problem = TRUE;
15891 } while (peek < e && isBLANK(*peek));
15894 if (peek < e && *peek == ']') {
15895 has_terminating_bracket = TRUE;
15897 has_terminating_colon = TRUE;
15899 else if (*p == ';') {
15900 has_semi_colon = TRUE;
15901 has_terminating_colon = TRUE;
15904 found_problem = TRUE;
15911 /* Here we have punctuation we thought didn't end the class.
15912 * Keep track of the position of the key characters that are
15913 * more likely to have been class-enders */
15914 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15916 /* Allow just one such possible class-ender not actually
15917 * ending the class. */
15918 if (possible_end) {
15924 /* If we have too many punctuation characters, no use in
15926 if (++punct_count > max_distance) {
15930 /* Treat the punctuation as a typo. */
15931 input_text[name_len++] = *p;
15934 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15935 input_text[name_len++] = toLOWER(*p);
15937 found_problem = TRUE;
15939 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15940 input_text[name_len++] = *p;
15944 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15948 /* The declaration of 'input_text' is how long we allow a potential
15949 * class name to be, before saying they didn't mean a class name at
15951 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15956 /* We get to here when the possible class name hasn't been properly
15957 * terminated before:
15958 * 1) we ran off the end of the pattern; or
15959 * 2) found two characters, each of which might have been intended to
15960 * be the name's terminator
15961 * 3) found so many punctuation characters in the purported name,
15962 * that the edit distance to a valid one is exceeded
15963 * 4) we decided it was more characters than anyone could have
15964 * intended to be one. */
15966 found_problem = TRUE;
15968 /* In the final two cases, we know that looking up what we've
15969 * accumulated won't lead to a match, even a fuzzy one. */
15970 if ( name_len >= C_ARRAY_LENGTH(input_text)
15971 || punct_count > max_distance)
15973 /* If there was an intermediate key character that could have been
15974 * an intended end, redo the parse, but stop there */
15975 if (possible_end && possible_end != (char *) -1) {
15976 possible_end = (char *) -1; /* Special signal value to say
15977 we've done a first pass */
15982 /* Otherwise, it can't have meant to have been a class */
15983 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15986 /* If we ran off the end, and the final character was a punctuation
15987 * one, back up one, to look at that final one just below. Later, we
15988 * will restore the parse pointer if appropriate */
15989 if (name_len && p == e && isPUNCT(*(p-1))) {
15994 if (p < e && isPUNCT(*p)) {
15996 has_terminating_bracket = TRUE;
15998 /* If this is a 2nd ']', and the first one is just below this
15999 * one, consider that to be the real terminator. This gives a
16000 * uniform and better positioning for the warning message */
16002 && possible_end != (char *) -1
16003 && *possible_end == ']'
16004 && name_len && input_text[name_len - 1] == ']')
16009 /* And this is actually equivalent to having done the 2nd
16010 * pass now, so set it to not try again */
16011 possible_end = (char *) -1;
16016 has_terminating_colon = TRUE;
16018 else if (*p == ';') {
16019 has_semi_colon = TRUE;
16020 has_terminating_colon = TRUE;
16028 /* Here, we have a class name to look up. We can short circuit the
16029 * stuff below for short names that can't possibly be meant to be a
16030 * class name. (We can do this on the first pass, as any second pass
16031 * will yield an even shorter name) */
16032 if (name_len < 3) {
16033 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16036 /* Find which class it is. Initially switch on the length of the name.
16038 switch (name_len) {
16040 if (memEQs(name_start, 4, "word")) {
16041 /* this is not POSIX, this is the Perl \w */
16042 class_number = ANYOF_WORDCHAR;
16046 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16047 * graph lower print punct space upper
16048 * Offset 4 gives the best switch position. */
16049 switch (name_start[4]) {
16051 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16052 class_number = ANYOF_ALPHA;
16055 if (memBEGINs(name_start, 5, "spac")) /* space */
16056 class_number = ANYOF_SPACE;
16059 if (memBEGINs(name_start, 5, "grap")) /* graph */
16060 class_number = ANYOF_GRAPH;
16063 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16064 class_number = ANYOF_ASCII;
16067 if (memBEGINs(name_start, 5, "blan")) /* blank */
16068 class_number = ANYOF_BLANK;
16071 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16072 class_number = ANYOF_CNTRL;
16075 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16076 class_number = ANYOF_ALPHANUMERIC;
16079 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16080 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16081 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16082 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16085 if (memBEGINs(name_start, 5, "digi")) /* digit */
16086 class_number = ANYOF_DIGIT;
16087 else if (memBEGINs(name_start, 5, "prin")) /* print */
16088 class_number = ANYOF_PRINT;
16089 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16090 class_number = ANYOF_PUNCT;
16095 if (memEQs(name_start, 6, "xdigit"))
16096 class_number = ANYOF_XDIGIT;
16100 /* If the name exactly matches a posix class name the class number will
16101 * here be set to it, and the input almost certainly was meant to be a
16102 * posix class, so we can skip further checking. If instead the syntax
16103 * is exactly correct, but the name isn't one of the legal ones, we
16104 * will return that as an error below. But if neither of these apply,
16105 * it could be that no posix class was intended at all, or that one
16106 * was, but there was a typo. We tease these apart by doing fuzzy
16107 * matching on the name */
16108 if (class_number == OOB_NAMEDCLASS && found_problem) {
16109 const UV posix_names[][6] = {
16110 { 'a', 'l', 'n', 'u', 'm' },
16111 { 'a', 'l', 'p', 'h', 'a' },
16112 { 'a', 's', 'c', 'i', 'i' },
16113 { 'b', 'l', 'a', 'n', 'k' },
16114 { 'c', 'n', 't', 'r', 'l' },
16115 { 'd', 'i', 'g', 'i', 't' },
16116 { 'g', 'r', 'a', 'p', 'h' },
16117 { 'l', 'o', 'w', 'e', 'r' },
16118 { 'p', 'r', 'i', 'n', 't' },
16119 { 'p', 'u', 'n', 'c', 't' },
16120 { 's', 'p', 'a', 'c', 'e' },
16121 { 'u', 'p', 'p', 'e', 'r' },
16122 { 'w', 'o', 'r', 'd' },
16123 { 'x', 'd', 'i', 'g', 'i', 't' }
16125 /* The names of the above all have added NULs to make them the same
16126 * size, so we need to also have the real lengths */
16127 const UV posix_name_lengths[] = {
16128 sizeof("alnum") - 1,
16129 sizeof("alpha") - 1,
16130 sizeof("ascii") - 1,
16131 sizeof("blank") - 1,
16132 sizeof("cntrl") - 1,
16133 sizeof("digit") - 1,
16134 sizeof("graph") - 1,
16135 sizeof("lower") - 1,
16136 sizeof("print") - 1,
16137 sizeof("punct") - 1,
16138 sizeof("space") - 1,
16139 sizeof("upper") - 1,
16140 sizeof("word") - 1,
16141 sizeof("xdigit")- 1
16144 int temp_max = max_distance; /* Use a temporary, so if we
16145 reparse, we haven't changed the
16148 /* Use a smaller max edit distance if we are missing one of the
16150 if ( has_opening_bracket + has_opening_colon < 2
16151 || has_terminating_bracket + has_terminating_colon < 2)
16156 /* See if the input name is close to a legal one */
16157 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16159 /* Short circuit call if the lengths are too far apart to be
16161 if (abs( (int) (name_len - posix_name_lengths[i]))
16167 if (edit_distance(input_text,
16170 posix_name_lengths[i],
16174 { /* If it is close, it probably was intended to be a class */
16175 goto probably_meant_to_be;
16179 /* Here the input name is not close enough to a valid class name
16180 * for us to consider it to be intended to be a posix class. If
16181 * we haven't already done so, and the parse found a character that
16182 * could have been terminators for the name, but which we absorbed
16183 * as typos during the first pass, repeat the parse, signalling it
16184 * to stop at that character */
16185 if (possible_end && possible_end != (char *) -1) {
16186 possible_end = (char *) -1;
16191 /* Here neither pass found a close-enough class name */
16192 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16195 probably_meant_to_be:
16197 /* Here we think that a posix specification was intended. Update any
16199 if (updated_parse_ptr) {
16200 *updated_parse_ptr = (char *) p;
16203 /* If a posix class name was intended but incorrectly specified, we
16204 * output or return the warnings */
16205 if (found_problem) {
16207 /* We set flags for these issues in the parse loop above instead of
16208 * adding them to the list of warnings, because we can parse it
16209 * twice, and we only want one warning instance */
16211 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16214 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16216 if (has_semi_colon) {
16217 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16219 else if (! has_terminating_colon) {
16220 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16222 if (! has_terminating_bracket) {
16223 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16226 if ( posix_warnings
16228 && av_count(RExC_warn_text) > 0)
16230 *posix_warnings = RExC_warn_text;
16233 else if (class_number != OOB_NAMEDCLASS) {
16234 /* If it is a known class, return the class. The class number
16235 * #defines are structured so each complement is +1 to the normal
16237 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16239 else if (! check_only) {
16241 /* Here, it is an unrecognized class. This is an error (unless the
16242 * call is to check only, which we've already handled above) */
16243 const char * const complement_string = (complement)
16246 RExC_parse = (char *) p;
16247 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16249 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16253 return OOB_NAMEDCLASS;
16255 #undef ADD_POSIX_WARNING
16257 STATIC unsigned int
16258 S_regex_set_precedence(const U8 my_operator) {
16260 /* Returns the precedence in the (?[...]) construct of the input operator,
16261 * specified by its character representation. The precedence follows
16262 * general Perl rules, but it extends this so that ')' and ']' have (low)
16263 * precedence even though they aren't really operators */
16265 switch (my_operator) {
16281 NOT_REACHED; /* NOTREACHED */
16282 return 0; /* Silence compiler warning */
16285 STATIC regnode_offset
16286 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16287 I32 *flagp, U32 depth,
16288 char * const oregcomp_parse)
16290 /* Handle the (?[...]) construct to do set operations */
16292 U8 curchar; /* Current character being parsed */
16293 UV start, end; /* End points of code point ranges */
16294 SV* final = NULL; /* The end result inversion list */
16295 SV* result_string; /* 'final' stringified */
16296 AV* stack; /* stack of operators and operands not yet
16298 AV* fence_stack = NULL; /* A stack containing the positions in
16299 'stack' of where the undealt-with left
16300 parens would be if they were actually
16302 /* The 'volatile' is a workaround for an optimiser bug
16303 * in Solaris Studio 12.3. See RT #127455 */
16304 volatile IV fence = 0; /* Position of where most recent undealt-
16305 with left paren in stack is; -1 if none.
16307 STRLEN len; /* Temporary */
16308 regnode_offset node; /* Temporary, and final regnode returned by
16310 const bool save_fold = FOLD; /* Temporary */
16311 char *save_end, *save_parse; /* Temporaries */
16312 const bool in_locale = LOC; /* we turn off /l during processing */
16314 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16316 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16317 PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16319 DEBUG_PARSE("xcls");
16322 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16325 /* The use of this operator implies /u. This is required so that the
16326 * compile time values are valid in all runtime cases */
16327 REQUIRE_UNI_RULES(flagp, 0);
16329 ckWARNexperimental(RExC_parse,
16330 WARN_EXPERIMENTAL__REGEX_SETS,
16331 "The regex_sets feature is experimental");
16333 /* Everything in this construct is a metacharacter. Operands begin with
16334 * either a '\' (for an escape sequence), or a '[' for a bracketed
16335 * character class. Any other character should be an operator, or
16336 * parenthesis for grouping. Both types of operands are handled by calling
16337 * regclass() to parse them. It is called with a parameter to indicate to
16338 * return the computed inversion list. The parsing here is implemented via
16339 * a stack. Each entry on the stack is a single character representing one
16340 * of the operators; or else a pointer to an operand inversion list. */
16342 #define IS_OPERATOR(a) SvIOK(a)
16343 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16345 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16346 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16347 * with pronouncing it called it Reverse Polish instead, but now that YOU
16348 * know how to pronounce it you can use the correct term, thus giving due
16349 * credit to the person who invented it, and impressing your geek friends.
16350 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16351 * it is now more like an English initial W (as in wonk) than an L.)
16353 * This means that, for example, 'a | b & c' is stored on the stack as
16361 * where the numbers in brackets give the stack [array] element number.
16362 * In this implementation, parentheses are not stored on the stack.
16363 * Instead a '(' creates a "fence" so that the part of the stack below the
16364 * fence is invisible except to the corresponding ')' (this allows us to
16365 * replace testing for parens, by using instead subtraction of the fence
16366 * position). As new operands are processed they are pushed onto the stack
16367 * (except as noted in the next paragraph). New operators of higher
16368 * precedence than the current final one are inserted on the stack before
16369 * the lhs operand (so that when the rhs is pushed next, everything will be
16370 * in the correct positions shown above. When an operator of equal or
16371 * lower precedence is encountered in parsing, all the stacked operations
16372 * of equal or higher precedence are evaluated, leaving the result as the
16373 * top entry on the stack. This makes higher precedence operations
16374 * evaluate before lower precedence ones, and causes operations of equal
16375 * precedence to left associate.
16377 * The only unary operator '!' is immediately pushed onto the stack when
16378 * encountered. When an operand is encountered, if the top of the stack is
16379 * a '!", the complement is immediately performed, and the '!' popped. The
16380 * resulting value is treated as a new operand, and the logic in the
16381 * previous paragraph is executed. Thus in the expression
16383 * the stack looks like
16389 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16396 * A ')' is treated as an operator with lower precedence than all the
16397 * aforementioned ones, which causes all operations on the stack above the
16398 * corresponding '(' to be evaluated down to a single resultant operand.
16399 * Then the fence for the '(' is removed, and the operand goes through the
16400 * algorithm above, without the fence.
16402 * A separate stack is kept of the fence positions, so that the position of
16403 * the latest so-far unbalanced '(' is at the top of it.
16405 * The ']' ending the construct is treated as the lowest operator of all,
16406 * so that everything gets evaluated down to a single operand, which is the
16409 sv_2mortal((SV *)(stack = newAV()));
16410 sv_2mortal((SV *)(fence_stack = newAV()));
16412 while (RExC_parse < RExC_end) {
16413 I32 top_index; /* Index of top-most element in 'stack' */
16414 SV** top_ptr; /* Pointer to top 'stack' element */
16415 SV* current = NULL; /* To contain the current inversion list
16417 SV* only_to_avoid_leaks;
16419 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16420 TRUE /* Force /x */ );
16421 if (RExC_parse >= RExC_end) { /* Fail */
16425 curchar = UCHARAT(RExC_parse);
16429 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16430 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16431 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16432 stack, fence, fence_stack));
16435 top_index = av_tindex_skip_len_mg(stack);
16438 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16439 char stacked_operator; /* The topmost operator on the 'stack'. */
16440 SV* lhs; /* Operand to the left of the operator */
16441 SV* rhs; /* Operand to the right of the operator */
16442 SV* fence_ptr; /* Pointer to top element of the fence
16446 if ( RExC_parse < RExC_end - 2
16447 && UCHARAT(RExC_parse + 1) == '?'
16448 && UCHARAT(RExC_parse + 2) == '^')
16450 const regnode_offset orig_emit = RExC_emit;
16451 SV * resultant_invlist;
16453 /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16454 * This happens when we have some thing like
16456 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16458 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16460 * Here we would be handling the interpolated
16461 * '$thai_or_lao'. We handle this by a recursive call to
16462 * reg which returns the inversion list the
16463 * interpolated expression evaluates to. Actually, the
16464 * return is a special regnode containing a pointer to that
16465 * inversion list. If the return isn't that regnode alone,
16466 * we know that this wasn't such an interpolation, which is
16467 * an error: we need to get a single inversion list back
16468 * from the recursion */
16473 node = reg(pRExC_state, 2, flagp, depth+1);
16474 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16476 if ( OP(REGNODE_p(node)) != REGEX_SET
16477 /* If more than a single node returned, the nested
16478 * parens evaluated to more than just a (?[...]),
16479 * which isn't legal */
16480 || RExC_emit != orig_emit
16481 + NODE_STEP_REGNODE
16482 + regarglen[REGEX_SET])
16484 vFAIL("Expecting interpolated extended charclass");
16486 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16487 current = invlist_clone(resultant_invlist, NULL);
16488 SvREFCNT_dec(resultant_invlist);
16491 RExC_emit = orig_emit;
16492 goto handle_operand;
16495 /* A regular '('. Look behind for illegal syntax */
16496 if (top_index - fence >= 0) {
16497 /* If the top entry on the stack is an operator, it had
16498 * better be a '!', otherwise the entry below the top
16499 * operand should be an operator */
16500 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16501 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16502 || ( IS_OPERAND(*top_ptr)
16503 && ( top_index - fence < 1
16504 || ! (stacked_ptr = av_fetch(stack,
16507 || ! IS_OPERATOR(*stacked_ptr))))
16510 vFAIL("Unexpected '(' with no preceding operator");
16514 /* Stack the position of this undealt-with left paren */
16515 av_push(fence_stack, newSViv(fence));
16516 fence = top_index + 1;
16520 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16521 * multi-char folds are allowed. */
16522 if (!regclass(pRExC_state, flagp, depth+1,
16523 TRUE, /* means parse just the next thing */
16524 FALSE, /* don't allow multi-char folds */
16525 FALSE, /* don't silence non-portable warnings. */
16527 FALSE, /* Require return to be an ANYOF */
16530 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16531 goto regclass_failed;
16536 /* regclass() will return with parsing just the \ sequence,
16537 * leaving the parse pointer at the next thing to parse */
16539 goto handle_operand;
16541 case '[': /* Is a bracketed character class */
16543 /* See if this is a [:posix:] class. */
16544 bool is_posix_class = (OOB_NAMEDCLASS
16545 < handle_possible_posix(pRExC_state,
16549 TRUE /* checking only */));
16550 /* If it is a posix class, leave the parse pointer at the '['
16551 * to fool regclass() into thinking it is part of a
16552 * '[[:posix:]]'. */
16553 if (! is_posix_class) {
16557 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16558 * multi-char folds are allowed. */
16559 if (!regclass(pRExC_state, flagp, depth+1,
16560 is_posix_class, /* parse the whole char
16561 class only if not a
16563 FALSE, /* don't allow multi-char folds */
16564 TRUE, /* silence non-portable warnings. */
16566 FALSE, /* Require return to be an ANYOF */
16569 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16570 goto regclass_failed;
16575 /* function call leaves parse pointing to the ']', except if we
16577 if (is_posix_class) {
16581 goto handle_operand;
16585 if (top_index >= 1) {
16586 goto join_operators;
16589 /* Only a single operand on the stack: are done */
16593 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16594 if (UCHARAT(RExC_parse - 1) == ']') {
16598 vFAIL("Unexpected ')'");
16601 /* If nothing after the fence, is missing an operand */
16602 if (top_index - fence < 0) {
16606 /* If at least two things on the stack, treat this as an
16608 if (top_index - fence >= 1) {
16609 goto join_operators;
16612 /* Here only a single thing on the fenced stack, and there is a
16613 * fence. Get rid of it */
16614 fence_ptr = av_pop(fence_stack);
16616 fence = SvIV(fence_ptr);
16617 SvREFCNT_dec_NN(fence_ptr);
16624 /* Having gotten rid of the fence, we pop the operand at the
16625 * stack top and process it as a newly encountered operand */
16626 current = av_pop(stack);
16627 if (IS_OPERAND(current)) {
16628 goto handle_operand;
16640 /* These binary operators should have a left operand already
16642 if ( top_index - fence < 0
16643 || top_index - fence == 1
16644 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16645 || ! IS_OPERAND(*top_ptr))
16647 goto unexpected_binary;
16650 /* If only the one operand is on the part of the stack visible
16651 * to us, we just place this operator in the proper position */
16652 if (top_index - fence < 2) {
16654 /* Place the operator before the operand */
16656 SV* lhs = av_pop(stack);
16657 av_push(stack, newSVuv(curchar));
16658 av_push(stack, lhs);
16662 /* But if there is something else on the stack, we need to
16663 * process it before this new operator if and only if the
16664 * stacked operation has equal or higher precedence than the
16669 /* The operator on the stack is supposed to be below both its
16671 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16672 || IS_OPERAND(*stacked_ptr))
16674 /* But if not, it's legal and indicates we are completely
16675 * done if and only if we're currently processing a ']',
16676 * which should be the final thing in the expression */
16677 if (curchar == ']') {
16683 vFAIL2("Unexpected binary operator '%c' with no "
16684 "preceding operand", curchar);
16686 stacked_operator = (char) SvUV(*stacked_ptr);
16688 if (regex_set_precedence(curchar)
16689 > regex_set_precedence(stacked_operator))
16691 /* Here, the new operator has higher precedence than the
16692 * stacked one. This means we need to add the new one to
16693 * the stack to await its rhs operand (and maybe more
16694 * stuff). We put it before the lhs operand, leaving
16695 * untouched the stacked operator and everything below it
16697 lhs = av_pop(stack);
16698 assert(IS_OPERAND(lhs));
16700 av_push(stack, newSVuv(curchar));
16701 av_push(stack, lhs);
16705 /* Here, the new operator has equal or lower precedence than
16706 * what's already there. This means the operation already
16707 * there should be performed now, before the new one. */
16709 rhs = av_pop(stack);
16710 if (! IS_OPERAND(rhs)) {
16712 /* This can happen when a ! is not followed by an operand,
16713 * like in /(?[\t &!])/ */
16717 lhs = av_pop(stack);
16719 if (! IS_OPERAND(lhs)) {
16721 /* This can happen when there is an empty (), like in
16722 * /(?[[0]+()+])/ */
16726 switch (stacked_operator) {
16728 _invlist_intersection(lhs, rhs, &rhs);
16733 _invlist_union(lhs, rhs, &rhs);
16737 _invlist_subtract(lhs, rhs, &rhs);
16740 case '^': /* The union minus the intersection */
16745 _invlist_union(lhs, rhs, &u);
16746 _invlist_intersection(lhs, rhs, &i);
16747 _invlist_subtract(u, i, &rhs);
16748 SvREFCNT_dec_NN(i);
16749 SvREFCNT_dec_NN(u);
16755 /* Here, the higher precedence operation has been done, and the
16756 * result is in 'rhs'. We overwrite the stacked operator with
16757 * the result. Then we redo this code to either push the new
16758 * operator onto the stack or perform any higher precedence
16759 * stacked operation */
16760 only_to_avoid_leaks = av_pop(stack);
16761 SvREFCNT_dec(only_to_avoid_leaks);
16762 av_push(stack, rhs);
16765 case '!': /* Highest priority, right associative */
16767 /* If what's already at the top of the stack is another '!",
16768 * they just cancel each other out */
16769 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16770 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16772 only_to_avoid_leaks = av_pop(stack);
16773 SvREFCNT_dec(only_to_avoid_leaks);
16775 else { /* Otherwise, since it's right associative, just push
16777 av_push(stack, newSVuv(curchar));
16782 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16783 if (RExC_parse >= RExC_end) {
16786 vFAIL("Unexpected character");
16790 /* Here 'current' is the operand. If something is already on the
16791 * stack, we have to check if it is a !. But first, the code above
16792 * may have altered the stack in the time since we earlier set
16795 top_index = av_tindex_skip_len_mg(stack);
16796 if (top_index - fence >= 0) {
16797 /* If the top entry on the stack is an operator, it had better
16798 * be a '!', otherwise the entry below the top operand should
16799 * be an operator */
16800 top_ptr = av_fetch(stack, top_index, FALSE);
16802 if (IS_OPERATOR(*top_ptr)) {
16804 /* The only permissible operator at the top of the stack is
16805 * '!', which is applied immediately to this operand. */
16806 curchar = (char) SvUV(*top_ptr);
16807 if (curchar != '!') {
16808 SvREFCNT_dec(current);
16809 vFAIL2("Unexpected binary operator '%c' with no "
16810 "preceding operand", curchar);
16813 _invlist_invert(current);
16815 only_to_avoid_leaks = av_pop(stack);
16816 SvREFCNT_dec(only_to_avoid_leaks);
16818 /* And we redo with the inverted operand. This allows
16819 * handling multiple ! in a row */
16820 goto handle_operand;
16822 /* Single operand is ok only for the non-binary ')'
16824 else if ((top_index - fence == 0 && curchar != ')')
16825 || (top_index - fence > 0
16826 && (! (stacked_ptr = av_fetch(stack,
16829 || IS_OPERAND(*stacked_ptr))))
16831 SvREFCNT_dec(current);
16832 vFAIL("Operand with no preceding operator");
16836 /* Here there was nothing on the stack or the top element was
16837 * another operand. Just add this new one */
16838 av_push(stack, current);
16840 } /* End of switch on next parse token */
16842 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16843 } /* End of loop parsing through the construct */
16845 vFAIL("Syntax error in (?[...])");
16849 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16850 if (RExC_parse < RExC_end) {
16854 vFAIL("Unexpected ']' with no following ')' in (?[...");
16857 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16858 vFAIL("Unmatched (");
16861 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16862 || ((final = av_pop(stack)) == NULL)
16863 || ! IS_OPERAND(final)
16864 || ! is_invlist(final)
16865 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16868 SvREFCNT_dec(final);
16869 vFAIL("Incomplete expression within '(?[ ])'");
16872 /* Here, 'final' is the resultant inversion list from evaluating the
16873 * expression. Return it if so requested */
16874 if (return_invlist) {
16875 *return_invlist = final;
16879 if (RExC_sets_depth) { /* If within a recursive call, return in a special
16882 node = regpnode(pRExC_state, REGEX_SET, final);
16886 /* Otherwise generate a resultant node, based on 'final'. regclass()
16887 * is expecting a string of ranges and individual code points */
16888 invlist_iterinit(final);
16889 result_string = newSVpvs("");
16890 while (invlist_iternext(final, &start, &end)) {
16891 if (start == end) {
16892 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16895 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16896 UVXf "}", start, end);
16900 /* About to generate an ANYOF (or similar) node from the inversion list
16901 * we have calculated */
16902 save_parse = RExC_parse;
16903 RExC_parse = SvPV(result_string, len);
16904 save_end = RExC_end;
16905 RExC_end = RExC_parse + len;
16906 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16908 /* We turn off folding around the call, as the class we have
16909 * constructed already has all folding taken into consideration, and we
16910 * don't want regclass() to add to that */
16911 RExC_flags &= ~RXf_PMf_FOLD;
16912 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16913 * folds are allowed. */
16914 node = regclass(pRExC_state, flagp, depth+1,
16915 FALSE, /* means parse the whole char class */
16916 FALSE, /* don't allow multi-char folds */
16917 TRUE, /* silence non-portable warnings. The above may
16918 very well have generated non-portable code
16919 points, but they're valid on this machine */
16920 FALSE, /* similarly, no need for strict */
16922 /* We can optimize into something besides an ANYOF,
16923 * except under /l, which needs to be ANYOF because of
16924 * runtime checks for locale sanity, etc */
16930 RExC_parse = save_parse + 1;
16931 RExC_end = save_end;
16932 SvREFCNT_dec_NN(final);
16933 SvREFCNT_dec_NN(result_string);
16936 RExC_flags |= RXf_PMf_FOLD;
16940 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16941 goto regclass_failed;
16944 /* Fix up the node type if we are in locale. (We have pretended we are
16945 * under /u for the purposes of regclass(), as this construct will only
16946 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
16947 * (so as to cause any warnings about bad locales to be output in
16948 * regexec.c), and add the flag that indicates to check if not in a
16949 * UTF-8 locale. The reason we above forbid optimization into
16950 * something other than an ANYOF node is simply to minimize the number
16951 * of code changes in regexec.c. Otherwise we would have to create new
16952 * EXACTish node types and deal with them. This decision could be
16953 * revisited should this construct become popular.
16955 * (One might think we could look at the resulting ANYOF node and
16956 * suppress the flag if everything is above 255, as those would be
16957 * UTF-8 only, but this isn't true, as the components that led to that
16958 * result could have been locale-affected, and just happen to cancel
16959 * each other out under UTF-8 locales.) */
16961 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16963 assert(OP(REGNODE_p(node)) == ANYOF);
16965 OP(REGNODE_p(node)) = ANYOFL;
16966 ANYOF_FLAGS(REGNODE_p(node))
16967 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16971 nextchar(pRExC_state);
16972 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16976 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16980 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16983 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16984 AV * stack, const IV fence, AV * fence_stack)
16985 { /* Dumps the stacks in handle_regex_sets() */
16987 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16988 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16991 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16993 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16995 if (stack_top < 0) {
16996 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16999 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17000 for (i = stack_top; i >= 0; i--) {
17001 SV ** element_ptr = av_fetch(stack, i, FALSE);
17002 if (! element_ptr) {
17005 if (IS_OPERATOR(*element_ptr)) {
17006 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17007 (int) i, (int) SvIV(*element_ptr));
17010 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17011 sv_dump(*element_ptr);
17016 if (fence_stack_top < 0) {
17017 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17020 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17021 for (i = fence_stack_top; i >= 0; i--) {
17022 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17023 if (! element_ptr) {
17026 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17027 (int) i, (int) SvIV(*element_ptr));
17038 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17040 /* This adds the Latin1/above-Latin1 folding rules.
17042 * This should be called only for a Latin1-range code points, cp, which is
17043 * known to be involved in a simple fold with other code points above
17044 * Latin1. It would give false results if /aa has been specified.
17045 * Multi-char folds are outside the scope of this, and must be handled
17048 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17050 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17052 /* The rules that are valid for all Unicode versions are hard-coded in */
17057 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17061 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17064 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17065 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17067 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17068 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17069 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17071 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17072 *invlist = add_cp_to_invlist(*invlist,
17073 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17076 default: /* Other code points are checked against the data for the
17077 current Unicode version */
17079 Size_t folds_count;
17081 const U32 * remaining_folds;
17085 folded_cp = toFOLD(cp);
17088 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17090 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17093 if (folded_cp > 255) {
17094 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17097 folds_count = _inverse_folds(folded_cp, &first_fold,
17099 if (folds_count == 0) {
17101 /* Use deprecated warning to increase the chances of this being
17103 ckWARN2reg_d(RExC_parse,
17104 "Perl folding rules are not up-to-date for 0x%02X;"
17105 " please use the perlbug utility to report;", cp);
17110 if (first_fold > 255) {
17111 *invlist = add_cp_to_invlist(*invlist, first_fold);
17113 for (i = 0; i < folds_count - 1; i++) {
17114 if (remaining_folds[i] > 255) {
17115 *invlist = add_cp_to_invlist(*invlist,
17116 remaining_folds[i]);
17126 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17128 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17132 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17134 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17136 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17137 CLEAR_POSIX_WARNINGS();
17141 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17142 if (first_is_fatal) { /* Avoid leaking this */
17143 av_undef(posix_warnings); /* This isn't necessary if the
17144 array is mortal, but is a
17146 (void) sv_2mortal(msg);
17149 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17150 SvREFCNT_dec_NN(msg);
17153 UPDATE_WARNINGS_LOC(RExC_parse);
17156 PERL_STATIC_INLINE Size_t
17157 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17159 const U8 * const start = s1;
17160 const U8 * const send = start + max;
17162 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17164 while (s1 < send && *s1 == *s2) {
17173 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17175 /* This adds the string scalar <multi_string> to the array
17176 * <multi_char_matches>. <multi_string> is known to have exactly
17177 * <cp_count> code points in it. This is used when constructing a
17178 * bracketed character class and we find something that needs to match more
17179 * than a single character.
17181 * <multi_char_matches> is actually an array of arrays. Each top-level
17182 * element is an array that contains all the strings known so far that are
17183 * the same length. And that length (in number of code points) is the same
17184 * as the index of the top-level array. Hence, the [2] element is an
17185 * array, each element thereof is a string containing TWO code points;
17186 * while element [3] is for strings of THREE characters, and so on. Since
17187 * this is for multi-char strings there can never be a [0] nor [1] element.
17189 * When we rewrite the character class below, we will do so such that the
17190 * longest strings are written first, so that it prefers the longest
17191 * matching strings first. This is done even if it turns out that any
17192 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17193 * Christiansen has agreed that this is ok. This makes the test for the
17194 * ligature 'ffi' come before the test for 'ff', for example */
17197 AV** this_array_ptr;
17199 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17201 if (! multi_char_matches) {
17202 multi_char_matches = newAV();
17205 if (av_exists(multi_char_matches, cp_count)) {
17206 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17207 this_array = *this_array_ptr;
17210 this_array = newAV();
17211 av_store(multi_char_matches, cp_count,
17214 av_push(this_array, multi_string);
17216 return multi_char_matches;
17219 /* The names of properties whose definitions are not known at compile time are
17220 * stored in this SV, after a constant heading. So if the length has been
17221 * changed since initialization, then there is a run-time definition. */
17222 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17223 (SvCUR(listsv) != initial_listsv_len)
17225 /* There is a restricted set of white space characters that are legal when
17226 * ignoring white space in a bracketed character class. This generates the
17227 * code to skip them.
17229 * There is a line below that uses the same white space criteria but is outside
17230 * this macro. Both here and there must use the same definition */
17231 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
17234 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
17241 STATIC regnode_offset
17242 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17243 const bool stop_at_1, /* Just parse the next thing, don't
17244 look for a full character class */
17245 bool allow_mutiple_chars,
17246 const bool silence_non_portable, /* Don't output warnings
17250 bool optimizable, /* ? Allow a non-ANYOF return
17252 SV** ret_invlist /* Return an inversion list, not a node */
17255 /* parse a bracketed class specification. Most of these will produce an
17256 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17257 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17258 * under /i with multi-character folds: it will be rewritten following the
17259 * paradigm of this example, where the <multi-fold>s are characters which
17260 * fold to multiple character sequences:
17261 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17262 * gets effectively rewritten as:
17263 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17264 * reg() gets called (recursively) on the rewritten version, and this
17265 * function will return what it constructs. (Actually the <multi-fold>s
17266 * aren't physically removed from the [abcdefghi], it's just that they are
17267 * ignored in the recursion by means of a flag:
17268 * <RExC_in_multi_char_class>.)
17270 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17271 * characters, with the corresponding bit set if that character is in the
17272 * list. For characters above this, an inversion list is used. There
17273 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17274 * determinable at compile time
17276 * On success, returns the offset at which any next node should be placed
17277 * into the regex engine program being compiled.
17279 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17280 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17284 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17286 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17287 regnode_offset ret = -1; /* Initialized to an illegal value */
17289 int namedclass = OOB_NAMEDCLASS;
17290 char *rangebegin = NULL;
17291 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17292 aren't available at the time this was called */
17293 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17294 than just initialized. */
17295 SV* properties = NULL; /* Code points that match \p{} \P{} */
17296 SV* posixes = NULL; /* Code points that match classes like [:word:],
17297 extended beyond the Latin1 range. These have to
17298 be kept separate from other code points for much
17299 of this function because their handling is
17300 different under /i, and for most classes under
17302 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17303 separate for a while from the non-complemented
17304 versions because of complications with /d
17306 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17307 treated more simply than the general case,
17308 leading to less compilation and execution
17310 UV element_count = 0; /* Number of distinct elements in the class.
17311 Optimizations may be possible if this is tiny */
17312 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17313 character; used under /i */
17315 char * stop_ptr = RExC_end; /* where to stop parsing */
17317 /* ignore unescaped whitespace? */
17318 const bool skip_white = cBOOL( ret_invlist
17319 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17321 /* inversion list of code points this node matches only when the target
17322 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17324 SV* upper_latin1_only_utf8_matches = NULL;
17326 /* Inversion list of code points this node matches regardless of things
17327 * like locale, folding, utf8ness of the target string */
17328 SV* cp_list = NULL;
17330 /* Like cp_list, but code points on this list need to be checked for things
17331 * that fold to/from them under /i */
17332 SV* cp_foldable_list = NULL;
17334 /* Like cp_list, but code points on this list are valid only when the
17335 * runtime locale is UTF-8 */
17336 SV* only_utf8_locale_list = NULL;
17338 /* In a range, if one of the endpoints is non-character-set portable,
17339 * meaning that it hard-codes a code point that may mean a different
17340 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17341 * mnemonic '\t' which each mean the same character no matter which
17342 * character set the platform is on. */
17343 unsigned int non_portable_endpoint = 0;
17345 /* Is the range unicode? which means on a platform that isn't 1-1 native
17346 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17347 * to be a Unicode value. */
17348 bool unicode_range = FALSE;
17349 bool invert = FALSE; /* Is this class to be complemented */
17351 bool warn_super = ALWAYS_WARN_SUPER;
17353 const char * orig_parse = RExC_parse;
17355 /* This variable is used to mark where the end in the input is of something
17356 * that looks like a POSIX construct but isn't. During the parse, when
17357 * something looks like it could be such a construct is encountered, it is
17358 * checked for being one, but not if we've already checked this area of the
17359 * input. Only after this position is reached do we check again */
17360 char *not_posix_region_end = RExC_parse - 1;
17362 AV* posix_warnings = NULL;
17363 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17364 U8 op = END; /* The returned node-type, initialized to an impossible
17366 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17367 U32 posixl = 0; /* bit field of posix classes matched under /l */
17370 /* Flags as to what things aren't knowable until runtime. (Note that these are
17371 * mutually exclusive.) */
17372 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17373 haven't been defined as of yet */
17374 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17376 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17377 what gets folded */
17378 U32 has_runtime_dependency = 0; /* OR of the above flags */
17380 DECLARE_AND_GET_RE_DEBUG_FLAGS;
17382 PERL_ARGS_ASSERT_REGCLASS;
17384 PERL_UNUSED_ARG(depth);
17387 assert(! (ret_invlist && allow_mutiple_chars));
17389 /* If wants an inversion list returned, we can't optimize to something
17392 optimizable = FALSE;
17395 DEBUG_PARSE("clas");
17397 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17398 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17399 && UNICODE_DOT_DOT_VERSION == 0)
17400 allow_mutiple_chars = FALSE;
17403 /* We include the /i status at the beginning of this so that we can
17404 * know it at runtime */
17405 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17406 initial_listsv_len = SvCUR(listsv);
17407 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17409 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17411 assert(RExC_parse <= RExC_end);
17413 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17416 allow_mutiple_chars = FALSE;
17418 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17421 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17422 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17423 int maybe_class = handle_possible_posix(pRExC_state,
17425 ¬_posix_region_end,
17427 TRUE /* checking only */);
17428 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17429 ckWARN4reg(not_posix_region_end,
17430 "POSIX syntax [%c %c] belongs inside character classes%s",
17431 *RExC_parse, *RExC_parse,
17432 (maybe_class == OOB_NAMEDCLASS)
17433 ? ((POSIXCC_NOTYET(*RExC_parse))
17434 ? " (but this one isn't implemented)"
17435 : " (but this one isn't fully valid)")
17441 /* If the caller wants us to just parse a single element, accomplish this
17442 * by faking the loop ending condition */
17443 if (stop_at_1 && RExC_end > RExC_parse) {
17444 stop_ptr = RExC_parse + 1;
17447 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17448 if (UCHARAT(RExC_parse) == ']')
17449 goto charclassloop;
17453 if ( posix_warnings
17454 && av_tindex_skip_len_mg(posix_warnings) >= 0
17455 && RExC_parse > not_posix_region_end)
17457 /* Warnings about posix class issues are considered tentative until
17458 * we are far enough along in the parse that we can no longer
17459 * change our mind, at which point we output them. This is done
17460 * each time through the loop so that a later class won't zap them
17461 * before they have been dealt with. */
17462 output_posix_warnings(pRExC_state, posix_warnings);
17465 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17467 if (RExC_parse >= stop_ptr) {
17471 if (UCHARAT(RExC_parse) == ']') {
17477 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17478 save_value = value;
17479 save_prevvalue = prevvalue;
17482 rangebegin = RExC_parse;
17484 non_portable_endpoint = 0;
17486 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17487 value = utf8n_to_uvchr((U8*)RExC_parse,
17488 RExC_end - RExC_parse,
17489 &numlen, UTF8_ALLOW_DEFAULT);
17490 RExC_parse += numlen;
17493 value = UCHARAT(RExC_parse++);
17495 if (value == '[') {
17496 char * posix_class_end;
17497 namedclass = handle_possible_posix(pRExC_state,
17500 do_posix_warnings ? &posix_warnings : NULL,
17501 FALSE /* die if error */);
17502 if (namedclass > OOB_NAMEDCLASS) {
17504 /* If there was an earlier attempt to parse this particular
17505 * posix class, and it failed, it was a false alarm, as this
17506 * successful one proves */
17507 if ( posix_warnings
17508 && av_tindex_skip_len_mg(posix_warnings) >= 0
17509 && not_posix_region_end >= RExC_parse
17510 && not_posix_region_end <= posix_class_end)
17512 av_undef(posix_warnings);
17515 RExC_parse = posix_class_end;
17517 else if (namedclass == OOB_NAMEDCLASS) {
17518 not_posix_region_end = posix_class_end;
17521 namedclass = OOB_NAMEDCLASS;
17524 else if ( RExC_parse - 1 > not_posix_region_end
17525 && MAYBE_POSIXCC(value))
17527 (void) handle_possible_posix(
17529 RExC_parse - 1, /* -1 because parse has already been
17531 ¬_posix_region_end,
17532 do_posix_warnings ? &posix_warnings : NULL,
17533 TRUE /* checking only */);
17535 else if ( strict && ! skip_white
17536 && ( _generic_isCC(value, _CC_VERTSPACE)
17537 || is_VERTWS_cp_high(value)))
17539 vFAIL("Literal vertical space in [] is illegal except under /x");
17541 else if (value == '\\') {
17542 /* Is a backslash; get the code point of the char after it */
17544 if (RExC_parse >= RExC_end) {
17545 vFAIL("Unmatched [");
17548 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17549 value = utf8n_to_uvchr((U8*)RExC_parse,
17550 RExC_end - RExC_parse,
17551 &numlen, UTF8_ALLOW_DEFAULT);
17552 RExC_parse += numlen;
17555 value = UCHARAT(RExC_parse++);
17557 /* Some compilers cannot handle switching on 64-bit integer
17558 * values, therefore value cannot be an UV. Yes, this will
17559 * be a problem later if we want switch on Unicode.
17560 * A similar issue a little bit later when switching on
17561 * namedclass. --jhi */
17563 /* If the \ is escaping white space when white space is being
17564 * skipped, it means that that white space is wanted literally, and
17565 * is already in 'value'. Otherwise, need to translate the escape
17566 * into what it signifies. */
17567 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17568 const char * message;
17572 case 'w': namedclass = ANYOF_WORDCHAR; break;
17573 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17574 case 's': namedclass = ANYOF_SPACE; break;
17575 case 'S': namedclass = ANYOF_NSPACE; break;
17576 case 'd': namedclass = ANYOF_DIGIT; break;
17577 case 'D': namedclass = ANYOF_NDIGIT; break;
17578 case 'v': namedclass = ANYOF_VERTWS; break;
17579 case 'V': namedclass = ANYOF_NVERTWS; break;
17580 case 'h': namedclass = ANYOF_HORIZWS; break;
17581 case 'H': namedclass = ANYOF_NHORIZWS; break;
17582 case 'N': /* Handle \N{NAME} in class */
17584 const char * const backslash_N_beg = RExC_parse - 2;
17587 if (! grok_bslash_N(pRExC_state,
17588 NULL, /* No regnode */
17589 &value, /* Yes single value */
17590 &cp_count, /* Multiple code pt count */
17596 if (*flagp & NEED_UTF8)
17597 FAIL("panic: grok_bslash_N set NEED_UTF8");
17599 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17601 if (cp_count < 0) {
17602 vFAIL("\\N in a character class must be a named character: \\N{...}");
17604 else if (cp_count == 0) {
17605 ckWARNreg(RExC_parse,
17606 "Ignoring zero length \\N{} in character class");
17608 else { /* cp_count > 1 */
17609 assert(cp_count > 1);
17610 if (! RExC_in_multi_char_class) {
17611 if ( ! allow_mutiple_chars
17614 || *RExC_parse == '-')
17618 vFAIL("\\N{} here is restricted to one character");
17620 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17621 break; /* <value> contains the first code
17622 point. Drop out of the switch to
17626 SV * multi_char_N = newSVpvn(backslash_N_beg,
17627 RExC_parse - backslash_N_beg);
17629 = add_multi_match(multi_char_matches,
17634 } /* End of cp_count != 1 */
17636 /* This element should not be processed further in this
17639 value = save_value;
17640 prevvalue = save_prevvalue;
17641 continue; /* Back to top of loop to get next char */
17644 /* Here, is a single code point, and <value> contains it */
17645 unicode_range = TRUE; /* \N{} are Unicode */
17653 if (RExC_pm_flags & PMf_WILDCARD) {
17655 /* diag_listed_as: Use of %s is not allowed in Unicode
17656 property wildcard subpatterns in regex; marked by <--
17658 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17659 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17662 /* \p means they want Unicode semantics */
17663 REQUIRE_UNI_RULES(flagp, 0);
17665 if (RExC_parse >= RExC_end)
17666 vFAIL2("Empty \\%c", (U8)value);
17667 if (*RExC_parse == '{') {
17668 const U8 c = (U8)value;
17669 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17672 vFAIL2("Missing right brace on \\%c{}", c);
17677 /* White space is allowed adjacent to the braces and after
17678 * any '^', even when not under /x */
17679 while (isSPACE(*RExC_parse)) {
17683 if (UCHARAT(RExC_parse) == '^') {
17685 /* toggle. (The rhs xor gets the single bit that
17686 * differs between P and p; the other xor inverts just
17688 value ^= 'P' ^ 'p';
17691 while (isSPACE(*RExC_parse)) {
17696 if (e == RExC_parse)
17697 vFAIL2("Empty \\%c{}", c);
17699 n = e - RExC_parse;
17700 while (isSPACE(*(RExC_parse + n - 1)))
17703 } /* The \p isn't immediately followed by a '{' */
17704 else if (! isALPHA(*RExC_parse)) {
17705 RExC_parse += (UTF)
17706 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17708 vFAIL2("Character following \\%c must be '{' or a "
17709 "single-character Unicode property name",
17717 char* name = RExC_parse;
17719 /* Any message returned about expanding the definition */
17720 SV* msg = newSVpvs_flags("", SVs_TEMP);
17722 /* If set TRUE, the property is user-defined as opposed to
17723 * official Unicode */
17724 bool user_defined = FALSE;
17725 AV * strings = NULL;
17727 SV * prop_definition = parse_uniprop_string(
17728 name, n, UTF, FOLD,
17729 FALSE, /* This is compile-time */
17731 /* We can't defer this defn when
17732 * the full result is required in
17734 ! cBOOL(ret_invlist),
17741 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17742 assert(prop_definition == NULL);
17743 RExC_parse = e + 1;
17744 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17745 thing so, or else the display is
17749 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17750 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17751 SvCUR(msg), SvPVX(msg)));
17754 assert(prop_definition || strings);
17758 if (! prop_definition) {
17759 RExC_parse = e + 1;
17760 vFAIL("Unicode string properties are not implemented in (?[...])");
17764 "Using just the single character results"
17765 " returned by \\p{} in (?[...])");
17768 else if (! RExC_in_multi_char_class) {
17769 if (invert ^ (value == 'P')) {
17770 RExC_parse = e + 1;
17771 vFAIL("Inverting a character class which contains"
17772 " a multi-character sequence is illegal");
17775 /* For each multi-character string ... */
17776 while (av_count(strings) > 0) {
17777 /* ... Each entry is itself an array of code
17779 AV * this_string = (AV *) av_shift( strings);
17780 STRLEN cp_count = av_count(this_string);
17781 SV * final = newSV(cp_count * 4);
17784 /* Create another string of sequences of \x{...} */
17785 while (av_count(this_string) > 0) {
17786 SV * character = av_shift(this_string);
17787 UV cp = SvUV(character);
17790 REQUIRE_UTF8(flagp);
17792 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17794 SvREFCNT_dec_NN(character);
17796 SvREFCNT_dec_NN(this_string);
17798 /* And add that to the list of such things */
17800 = add_multi_match(multi_char_matches,
17805 SvREFCNT_dec_NN(strings);
17808 if (! prop_definition) { /* If we got only a string,
17809 this iteration didn't really
17810 find a character */
17813 else if (! is_invlist(prop_definition)) {
17815 /* Here, the definition isn't known, so we have gotten
17816 * returned a string that will be evaluated if and when
17817 * encountered at runtime. We add it to the list of
17818 * such properties, along with whether it should be
17819 * complemented or not */
17820 if (value == 'P') {
17821 sv_catpvs(listsv, "!");
17824 sv_catpvs(listsv, "+");
17826 sv_catsv(listsv, prop_definition);
17828 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17830 /* We don't know yet what this matches, so have to flag
17832 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17835 assert (prop_definition && is_invlist(prop_definition));
17837 /* Here we do have the complete property definition
17839 * Temporary workaround for [perl #133136]. For this
17840 * precise input that is in the .t that is failing,
17841 * load utf8.pm, which is what the test wants, so that
17842 * that .t passes */
17843 if ( memEQs(RExC_start, e + 1 - RExC_start,
17845 && ! hv_common(GvHVn(PL_incgv),
17847 "utf8.pm", sizeof("utf8.pm") - 1,
17848 0, HV_FETCH_ISEXISTS, NULL, 0))
17850 require_pv("utf8.pm");
17853 if (! user_defined &&
17854 /* We warn on matching an above-Unicode code point
17855 * if the match would return true, except don't
17856 * warn for \p{All}, which has exactly one element
17858 (_invlist_contains_cp(prop_definition, 0x110000)
17859 && (! (_invlist_len(prop_definition) == 1
17860 && *invlist_array(prop_definition) == 0))))
17865 /* Invert if asking for the complement */
17866 if (value == 'P') {
17867 _invlist_union_complement_2nd(properties,
17872 _invlist_union(properties, prop_definition, &properties);
17877 RExC_parse = e + 1;
17878 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17882 case 'n': value = '\n'; break;
17883 case 'r': value = '\r'; break;
17884 case 't': value = '\t'; break;
17885 case 'f': value = '\f'; break;
17886 case 'b': value = '\b'; break;
17887 case 'e': value = ESC_NATIVE; break;
17888 case 'a': value = '\a'; break;
17890 RExC_parse--; /* function expects to be pointed at the 'o' */
17891 if (! grok_bslash_o(&RExC_parse,
17897 cBOOL(range), /* MAX_UV allowed for range
17903 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17904 warn_non_literal_string(RExC_parse, packed_warn, message);
17908 non_portable_endpoint++;
17912 RExC_parse--; /* function expects to be pointed at the 'x' */
17913 if (! grok_bslash_x(&RExC_parse,
17919 cBOOL(range), /* MAX_UV allowed for range
17925 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17926 warn_non_literal_string(RExC_parse, packed_warn, message);
17930 non_portable_endpoint++;
17934 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17937 /* going to die anyway; point to exact spot of
17939 RExC_parse += (UTF)
17940 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17945 value = grok_c_char;
17947 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17948 warn_non_literal_string(RExC_parse, packed_warn, message);
17951 non_portable_endpoint++;
17953 case '0': case '1': case '2': case '3': case '4':
17954 case '5': case '6': case '7':
17956 /* Take 1-3 octal digits */
17957 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17958 | PERL_SCAN_NOTIFY_ILLDIGIT;
17959 numlen = (strict) ? 4 : 3;
17960 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17961 RExC_parse += numlen;
17964 RExC_parse += (UTF)
17965 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17967 vFAIL("Need exactly 3 octal digits");
17969 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17970 && RExC_parse < RExC_end
17971 && isDIGIT(*RExC_parse)
17972 && ckWARN(WARN_REGEXP))
17974 reg_warn_non_literal_string(
17976 form_alien_digit_msg(8, numlen, RExC_parse,
17977 RExC_end, UTF, FALSE));
17981 non_portable_endpoint++;
17986 /* Allow \_ to not give an error */
17987 if (isWORDCHAR(value) && value != '_') {
17989 vFAIL2("Unrecognized escape \\%c in character class",
17993 ckWARN2reg(RExC_parse,
17994 "Unrecognized escape \\%c in character class passed through",
17999 } /* End of switch on char following backslash */
18000 } /* end of handling backslash escape sequences */
18002 /* Here, we have the current token in 'value' */
18004 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18007 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
18008 * literal, as is the character that began the false range, i.e.
18009 * the 'a' in the examples */
18011 const int w = (RExC_parse >= rangebegin)
18012 ? RExC_parse - rangebegin
18016 "False [] range \"%" UTF8f "\"",
18017 UTF8fARG(UTF, w, rangebegin));
18020 ckWARN2reg(RExC_parse,
18021 "False [] range \"%" UTF8f "\"",
18022 UTF8fARG(UTF, w, rangebegin));
18023 cp_list = add_cp_to_invlist(cp_list, '-');
18024 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18028 range = 0; /* this was not a true range */
18029 element_count += 2; /* So counts for three values */
18032 classnum = namedclass_to_classnum(namedclass);
18034 if (LOC && namedclass < ANYOF_POSIXL_MAX
18035 #ifndef HAS_ISASCII
18036 && classnum != _CC_ASCII
18039 SV* scratch_list = NULL;
18041 /* What the Posix classes (like \w, [:space:]) match isn't
18042 * generally knowable under locale until actual match time. A
18043 * special node is used for these which has extra space for a
18044 * bitmap, with a bit reserved for each named class that is to
18045 * be matched against. (This isn't needed for \p{} and
18046 * pseudo-classes, as they are not affected by locale, and
18047 * hence are dealt with separately.) However, if a named class
18048 * and its complement are both present, then it matches
18049 * everything, and there is no runtime dependency. Odd numbers
18050 * are the complements of the next lower number, so xor works.
18051 * (Note that something like [\w\D] should match everything,
18052 * because \d should be a proper subset of \w. But rather than
18053 * trust that the locale is well behaved, we leave this to
18054 * runtime to sort out) */
18055 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18056 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18057 POSIXL_ZERO(posixl);
18058 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18059 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18060 continue; /* We could ignore the rest of the class, but
18061 best to parse it for any errors */
18063 else { /* Here, isn't the complement of any already parsed
18065 POSIXL_SET(posixl, namedclass);
18066 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18067 anyof_flags |= ANYOF_MATCHES_POSIXL;
18069 /* The above-Latin1 characters are not subject to locale
18070 * rules. Just add them to the unconditionally-matched
18073 /* Get the list of the above-Latin1 code points this
18075 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18076 PL_XPosix_ptrs[classnum],
18078 /* Odd numbers are complements,
18079 * like NDIGIT, NASCII, ... */
18080 namedclass % 2 != 0,
18082 /* Checking if 'cp_list' is NULL first saves an extra
18083 * clone. Its reference count will be decremented at the
18084 * next union, etc, or if this is the only instance, at the
18085 * end of the routine */
18087 cp_list = scratch_list;
18090 _invlist_union(cp_list, scratch_list, &cp_list);
18091 SvREFCNT_dec_NN(scratch_list);
18093 continue; /* Go get next character */
18098 /* Here, is not /l, or is a POSIX class for which /l doesn't
18099 * matter (or is a Unicode property, which is skipped here). */
18100 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18101 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18103 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18104 * nor /l make a difference in what these match,
18105 * therefore we just add what they match to cp_list. */
18106 if (classnum != _CC_VERTSPACE) {
18107 assert( namedclass == ANYOF_HORIZWS
18108 || namedclass == ANYOF_NHORIZWS);
18110 /* It turns out that \h is just a synonym for
18112 classnum = _CC_BLANK;
18115 _invlist_union_maybe_complement_2nd(
18117 PL_XPosix_ptrs[classnum],
18118 namedclass % 2 != 0, /* Complement if odd
18119 (NHORIZWS, NVERTWS)
18124 else if ( AT_LEAST_UNI_SEMANTICS
18125 || classnum == _CC_ASCII
18126 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
18127 || classnum == _CC_XDIGIT)))
18129 /* We usually have to worry about /d affecting what POSIX
18130 * classes match, with special code needed because we won't
18131 * know until runtime what all matches. But there is no
18132 * extra work needed under /u and /a; and [:ascii:] is
18133 * unaffected by /d; and :digit: and :xdigit: don't have
18134 * runtime differences under /d. So we can special case
18135 * these, and avoid some extra work below, and at runtime.
18137 _invlist_union_maybe_complement_2nd(
18139 ((AT_LEAST_ASCII_RESTRICTED)
18140 ? PL_Posix_ptrs[classnum]
18141 : PL_XPosix_ptrs[classnum]),
18142 namedclass % 2 != 0,
18145 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18146 complement and use nposixes */
18147 SV** posixes_ptr = namedclass % 2 == 0
18150 _invlist_union_maybe_complement_2nd(
18152 PL_XPosix_ptrs[classnum],
18153 namedclass % 2 != 0,
18157 } /* end of namedclass \blah */
18159 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18161 /* If 'range' is set, 'value' is the ending of a range--check its
18162 * validity. (If value isn't a single code point in the case of a
18163 * range, we should have figured that out above in the code that
18164 * catches false ranges). Later, we will handle each individual code
18165 * point in the range. If 'range' isn't set, this could be the
18166 * beginning of a range, so check for that by looking ahead to see if
18167 * the next real character to be processed is the range indicator--the
18172 /* For unicode ranges, we have to test that the Unicode as opposed
18173 * to the native values are not decreasing. (Above 255, there is
18174 * no difference between native and Unicode) */
18175 if (unicode_range && prevvalue < 255 && value < 255) {
18176 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18177 goto backwards_range;
18182 if (prevvalue > value) /* b-a */ {
18187 w = RExC_parse - rangebegin;
18189 "Invalid [] range \"%" UTF8f "\"",
18190 UTF8fARG(UTF, w, rangebegin));
18191 NOT_REACHED; /* NOTREACHED */
18195 prevvalue = value; /* save the beginning of the potential range */
18196 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18197 && *RExC_parse == '-')
18199 char* next_char_ptr = RExC_parse + 1;
18201 /* Get the next real char after the '-' */
18202 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18204 /* If the '-' is at the end of the class (just before the ']',
18205 * it is a literal minus; otherwise it is a range */
18206 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18207 RExC_parse = next_char_ptr;
18209 /* a bad range like \w-, [:word:]- ? */
18210 if (namedclass > OOB_NAMEDCLASS) {
18211 if (strict || ckWARN(WARN_REGEXP)) {
18212 const int w = RExC_parse >= rangebegin
18213 ? RExC_parse - rangebegin
18216 vFAIL4("False [] range \"%*.*s\"",
18221 "False [] range \"%*.*s\"",
18225 cp_list = add_cp_to_invlist(cp_list, '-');
18228 range = 1; /* yeah, it's a range! */
18229 continue; /* but do it the next time */
18234 if (namedclass > OOB_NAMEDCLASS) {
18238 /* Here, we have a single value this time through the loop, and
18239 * <prevvalue> is the beginning of the range, if any; or <value> if
18242 /* non-Latin1 code point implies unicode semantics. */
18244 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18245 || prevvalue > MAX_LEGAL_CP))
18247 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18249 REQUIRE_UNI_RULES(flagp, 0);
18250 if ( ! silence_non_portable
18251 && UNICODE_IS_PERL_EXTENDED(value)
18252 && TO_OUTPUT_WARNINGS(RExC_parse))
18254 ckWARN2_non_literal_string(RExC_parse,
18255 packWARN(WARN_PORTABLE),
18256 PL_extended_cp_format,
18261 /* Ready to process either the single value, or the completed range.
18262 * For single-valued non-inverted ranges, we consider the possibility
18263 * of multi-char folds. (We made a conscious decision to not do this
18264 * for the other cases because it can often lead to non-intuitive
18265 * results. For example, you have the peculiar case that:
18266 * "s s" =~ /^[^\xDF]+$/i => Y
18267 * "ss" =~ /^[^\xDF]+$/i => N
18269 * See [perl #89750] */
18270 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18271 if ( value == LATIN_SMALL_LETTER_SHARP_S
18272 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18275 /* Here <value> is indeed a multi-char fold. Get what it is */
18277 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18280 UV folded = _to_uni_fold_flags(
18284 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18285 ? FOLD_FLAGS_NOMIX_ASCII
18289 /* Here, <folded> should be the first character of the
18290 * multi-char fold of <value>, with <foldbuf> containing the
18291 * whole thing. But, if this fold is not allowed (because of
18292 * the flags), <fold> will be the same as <value>, and should
18293 * be processed like any other character, so skip the special
18295 if (folded != value) {
18297 /* Skip if we are recursed, currently parsing the class
18298 * again. Otherwise add this character to the list of
18299 * multi-char folds. */
18300 if (! RExC_in_multi_char_class) {
18301 STRLEN cp_count = utf8_length(foldbuf,
18302 foldbuf + foldlen);
18303 SV* multi_fold = sv_2mortal(newSVpvs(""));
18305 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18308 = add_multi_match(multi_char_matches,
18314 /* This element should not be processed further in this
18317 value = save_value;
18318 prevvalue = save_prevvalue;
18324 if (strict && ckWARN(WARN_REGEXP)) {
18327 /* If the range starts above 255, everything is portable and
18328 * likely to be so for any forseeable character set, so don't
18330 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18331 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18333 else if (prevvalue != value) {
18335 /* Under strict, ranges that stop and/or end in an ASCII
18336 * printable should have each end point be a portable value
18337 * for it (preferably like 'A', but we don't warn if it is
18338 * a (portable) Unicode name or code point), and the range
18339 * must be all digits or all letters of the same case.
18340 * Otherwise, the range is non-portable and unclear as to
18341 * what it contains */
18342 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18343 && ( non_portable_endpoint
18344 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18345 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18346 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18348 vWARN(RExC_parse, "Ranges of ASCII printables should"
18349 " be some subset of \"0-9\","
18350 " \"A-Z\", or \"a-z\"");
18352 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18353 SSize_t index_start;
18354 SSize_t index_final;
18356 /* But the nature of Unicode and languages mean we
18357 * can't do the same checks for above-ASCII ranges,
18358 * except in the case of digit ones. These should
18359 * contain only digits from the same group of 10. The
18360 * ASCII case is handled just above. Hence here, the
18361 * range could be a range of digits. First some
18362 * unlikely special cases. Grandfather in that a range
18363 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18364 * if its starting value is one of the 10 digits prior
18365 * to it. This is because it is an alternate way of
18366 * writing 19D1, and some people may expect it to be in
18367 * that group. But it is bad, because it won't give
18368 * the expected results. In Unicode 5.2 it was
18369 * considered to be in that group (of 11, hence), but
18370 * this was fixed in the next version */
18372 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18373 goto warn_bad_digit_range;
18375 else if (UNLIKELY( prevvalue >= 0x1D7CE
18376 && value <= 0x1D7FF))
18378 /* This is the only other case currently in Unicode
18379 * where the algorithm below fails. The code
18380 * points just above are the end points of a single
18381 * range containing only decimal digits. It is 5
18382 * different series of 0-9. All other ranges of
18383 * digits currently in Unicode are just a single
18384 * series. (And mktables will notify us if a later
18385 * Unicode version breaks this.)
18387 * If the range being checked is at most 9 long,
18388 * and the digit values represented are in
18389 * numerical order, they are from the same series.
18391 if ( value - prevvalue > 9
18392 || ((( value - 0x1D7CE) % 10)
18393 <= (prevvalue - 0x1D7CE) % 10))
18395 goto warn_bad_digit_range;
18400 /* For all other ranges of digits in Unicode, the
18401 * algorithm is just to check if both end points
18402 * are in the same series, which is the same range.
18404 index_start = _invlist_search(
18405 PL_XPosix_ptrs[_CC_DIGIT],
18408 /* Warn if the range starts and ends with a digit,
18409 * and they are not in the same group of 10. */
18410 if ( index_start >= 0
18411 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18413 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18414 value)) != index_start
18415 && index_final >= 0
18416 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18418 warn_bad_digit_range:
18419 vWARN(RExC_parse, "Ranges of digits should be"
18420 " from the same group of"
18427 if ((! range || prevvalue == value) && non_portable_endpoint) {
18428 if (isPRINT_A(value)) {
18431 if (isBACKSLASHED_PUNCT(value)) {
18432 literal[d++] = '\\';
18434 literal[d++] = (char) value;
18435 literal[d++] = '\0';
18438 "\"%.*s\" is more clearly written simply as \"%s\"",
18439 (int) (RExC_parse - rangebegin),
18444 else if (isMNEMONIC_CNTRL(value)) {
18446 "\"%.*s\" is more clearly written simply as \"%s\"",
18447 (int) (RExC_parse - rangebegin),
18449 cntrl_to_mnemonic((U8) value)
18455 /* Deal with this element of the class */
18458 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18461 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18462 * that don't require special handling, we can just add the range like
18463 * we do for ASCII platforms */
18464 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18465 || ! (prevvalue < 256
18467 || (! non_portable_endpoint
18468 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18469 || (isUPPER_A(prevvalue)
18470 && isUPPER_A(value)))))))
18472 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18476 /* Here, requires special handling. This can be because it is a
18477 * range whose code points are considered to be Unicode, and so
18478 * must be individually translated into native, or because its a
18479 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18480 * EBCDIC, but we have defined them to include only the "expected"
18481 * upper or lower case ASCII alphabetics. Subranges above 255 are
18482 * the same in native and Unicode, so can be added as a range */
18483 U8 start = NATIVE_TO_LATIN1(prevvalue);
18485 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18486 for (j = start; j <= end; j++) {
18487 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18490 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18496 range = 0; /* this range (if it was one) is done now */
18497 } /* End of loop through all the text within the brackets */
18499 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18500 output_posix_warnings(pRExC_state, posix_warnings);
18503 /* If anything in the class expands to more than one character, we have to
18504 * deal with them by building up a substitute parse string, and recursively
18505 * calling reg() on it, instead of proceeding */
18506 if (multi_char_matches) {
18507 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18510 char *save_end = RExC_end;
18511 char *save_parse = RExC_parse;
18512 char *save_start = RExC_start;
18513 Size_t constructed_prefix_len = 0; /* This gives the length of the
18514 constructed portion of the
18515 substitute parse. */
18516 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18521 /* Only one level of recursion allowed */
18522 assert(RExC_copy_start_in_constructed == RExC_precomp);
18524 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18525 because too confusing */
18527 sv_catpvs(substitute_parse, "(?:");
18531 /* Look at the longest strings first */
18532 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18537 if (av_exists(multi_char_matches, cp_count)) {
18538 AV** this_array_ptr;
18541 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18543 while ((this_sequence = av_pop(*this_array_ptr)) !=
18546 if (! first_time) {
18547 sv_catpvs(substitute_parse, "|");
18549 first_time = FALSE;
18551 sv_catpv(substitute_parse, SvPVX(this_sequence));
18556 /* If the character class contains anything else besides these
18557 * multi-character strings, have to include it in recursive parsing */
18558 if (element_count) {
18559 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18561 sv_catpvs(substitute_parse, "|");
18562 if (has_l_bracket) { /* Add an [ if the original had one */
18563 sv_catpvs(substitute_parse, "[");
18565 constructed_prefix_len = SvCUR(substitute_parse);
18566 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18568 /* Put in a closing ']' to match any opening one, but not if going
18569 * off the end, as otherwise we are adding something that really
18571 if (has_l_bracket && RExC_parse < RExC_end) {
18572 sv_catpvs(substitute_parse, "]");
18576 sv_catpvs(substitute_parse, ")");
18579 /* This is a way to get the parse to skip forward a whole named
18580 * sequence instead of matching the 2nd character when it fails the
18582 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18586 /* Set up the data structure so that any errors will be properly
18587 * reported. See the comments at the definition of
18588 * REPORT_LOCATION_ARGS for details */
18589 RExC_copy_start_in_input = (char *) orig_parse;
18590 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18591 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18592 RExC_end = RExC_parse + len;
18593 RExC_in_multi_char_class = 1;
18595 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18597 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18599 /* And restore so can parse the rest of the pattern */
18600 RExC_parse = save_parse;
18601 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18602 RExC_end = save_end;
18603 RExC_in_multi_char_class = 0;
18604 SvREFCNT_dec_NN(multi_char_matches);
18608 /* If folding, we calculate all characters that could fold to or from the
18609 * ones already on the list */
18610 if (cp_foldable_list) {
18612 UV start, end; /* End points of code point ranges */
18614 SV* fold_intersection = NULL;
18617 /* Our calculated list will be for Unicode rules. For locale
18618 * matching, we have to keep a separate list that is consulted at
18619 * runtime only when the locale indicates Unicode rules (and we
18620 * don't include potential matches in the ASCII/Latin1 range, as
18621 * any code point could fold to any other, based on the run-time
18622 * locale). For non-locale, we just use the general list */
18624 use_list = &only_utf8_locale_list;
18627 use_list = &cp_list;
18630 /* Only the characters in this class that participate in folds need
18631 * be checked. Get the intersection of this class and all the
18632 * possible characters that are foldable. This can quickly narrow
18633 * down a large class */
18634 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18635 &fold_intersection);
18637 /* Now look at the foldable characters in this class individually */
18638 invlist_iterinit(fold_intersection);
18639 while (invlist_iternext(fold_intersection, &start, &end)) {
18643 /* Look at every character in the range */
18644 for (j = start; j <= end; j++) {
18645 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18648 Size_t folds_count;
18650 const U32 * remaining_folds;
18654 /* Under /l, we don't know what code points below 256
18655 * fold to, except we do know the MICRO SIGN folds to
18656 * an above-255 character if the locale is UTF-8, so we
18657 * add it to the special list (in *use_list) Otherwise
18658 * we know now what things can match, though some folds
18659 * are valid under /d only if the target is UTF-8.
18660 * Those go in a separate list */
18661 if ( IS_IN_SOME_FOLD_L1(j)
18662 && ! (LOC && j != MICRO_SIGN))
18665 /* ASCII is always matched; non-ASCII is matched
18666 * only under Unicode rules (which could happen
18667 * under /l if the locale is a UTF-8 one */
18668 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18669 *use_list = add_cp_to_invlist(*use_list,
18670 PL_fold_latin1[j]);
18672 else if (j != PL_fold_latin1[j]) {
18673 upper_latin1_only_utf8_matches
18674 = add_cp_to_invlist(
18675 upper_latin1_only_utf8_matches,
18676 PL_fold_latin1[j]);
18680 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18681 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18683 add_above_Latin1_folds(pRExC_state,
18690 /* Here is an above Latin1 character. We don't have the
18691 * rules hard-coded for it. First, get its fold. This is
18692 * the simple fold, as the multi-character folds have been
18693 * handled earlier and separated out */
18694 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18695 (ASCII_FOLD_RESTRICTED)
18696 ? FOLD_FLAGS_NOMIX_ASCII
18699 /* Single character fold of above Latin1. Add everything
18700 * in its fold closure to the list that this node should
18702 folds_count = _inverse_folds(folded, &first_fold,
18704 for (k = 0; k <= folds_count; k++) {
18705 UV c = (k == 0) /* First time through use itself */
18707 : (k == 1) /* 2nd time use, the first fold */
18710 /* Then the remaining ones */
18711 : remaining_folds[k-2];
18713 /* /aa doesn't allow folds between ASCII and non- */
18714 if (( ASCII_FOLD_RESTRICTED
18715 && (isASCII(c) != isASCII(j))))
18720 /* Folds under /l which cross the 255/256 boundary are
18721 * added to a separate list. (These are valid only
18722 * when the locale is UTF-8.) */
18723 if (c < 256 && LOC) {
18724 *use_list = add_cp_to_invlist(*use_list, c);
18728 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18730 cp_list = add_cp_to_invlist(cp_list, c);
18733 /* Similarly folds involving non-ascii Latin1
18734 * characters under /d are added to their list */
18735 upper_latin1_only_utf8_matches
18736 = add_cp_to_invlist(
18737 upper_latin1_only_utf8_matches,
18743 SvREFCNT_dec_NN(fold_intersection);
18746 /* Now that we have finished adding all the folds, there is no reason
18747 * to keep the foldable list separate */
18748 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18749 SvREFCNT_dec_NN(cp_foldable_list);
18752 /* And combine the result (if any) with any inversion lists from posix
18753 * classes. The lists are kept separate up to now because we don't want to
18754 * fold the classes */
18755 if (simple_posixes) { /* These are the classes known to be unaffected by
18758 _invlist_union(cp_list, simple_posixes, &cp_list);
18759 SvREFCNT_dec_NN(simple_posixes);
18762 cp_list = simple_posixes;
18765 if (posixes || nposixes) {
18766 if (! DEPENDS_SEMANTICS) {
18768 /* For everything but /d, we can just add the current 'posixes' and
18769 * 'nposixes' to the main list */
18772 _invlist_union(cp_list, posixes, &cp_list);
18773 SvREFCNT_dec_NN(posixes);
18781 _invlist_union(cp_list, nposixes, &cp_list);
18782 SvREFCNT_dec_NN(nposixes);
18785 cp_list = nposixes;
18790 /* Under /d, things like \w match upper Latin1 characters only if
18791 * the target string is in UTF-8. But things like \W match all the
18792 * upper Latin1 characters if the target string is not in UTF-8.
18794 * Handle the case with something like \W separately */
18796 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18798 /* A complemented posix class matches all upper Latin1
18799 * characters if not in UTF-8. And it matches just certain
18800 * ones when in UTF-8. That means those certain ones are
18801 * matched regardless, so can just be added to the
18802 * unconditional list */
18804 _invlist_union(cp_list, nposixes, &cp_list);
18805 SvREFCNT_dec_NN(nposixes);
18809 cp_list = nposixes;
18812 /* Likewise for 'posixes' */
18813 _invlist_union(posixes, cp_list, &cp_list);
18814 SvREFCNT_dec(posixes);
18816 /* Likewise for anything else in the range that matched only
18818 if (upper_latin1_only_utf8_matches) {
18819 _invlist_union(cp_list,
18820 upper_latin1_only_utf8_matches,
18822 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18823 upper_latin1_only_utf8_matches = NULL;
18826 /* If we don't match all the upper Latin1 characters regardless
18827 * of UTF-8ness, we have to set a flag to match the rest when
18829 _invlist_subtract(only_non_utf8_list, cp_list,
18830 &only_non_utf8_list);
18831 if (_invlist_len(only_non_utf8_list) != 0) {
18832 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18834 SvREFCNT_dec_NN(only_non_utf8_list);
18837 /* Here there were no complemented posix classes. That means
18838 * the upper Latin1 characters in 'posixes' match only when the
18839 * target string is in UTF-8. So we have to add them to the
18840 * list of those types of code points, while adding the
18841 * remainder to the unconditional list.
18843 * First calculate what they are */
18844 SV* nonascii_but_latin1_properties = NULL;
18845 _invlist_intersection(posixes, PL_UpperLatin1,
18846 &nonascii_but_latin1_properties);
18848 /* And add them to the final list of such characters. */
18849 _invlist_union(upper_latin1_only_utf8_matches,
18850 nonascii_but_latin1_properties,
18851 &upper_latin1_only_utf8_matches);
18853 /* Remove them from what now becomes the unconditional list */
18854 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18857 /* And add those unconditional ones to the final list */
18859 _invlist_union(cp_list, posixes, &cp_list);
18860 SvREFCNT_dec_NN(posixes);
18867 SvREFCNT_dec(nonascii_but_latin1_properties);
18869 /* Get rid of any characters from the conditional list that we
18870 * now know are matched unconditionally, which may make that
18872 _invlist_subtract(upper_latin1_only_utf8_matches,
18874 &upper_latin1_only_utf8_matches);
18875 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18876 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18877 upper_latin1_only_utf8_matches = NULL;
18883 /* And combine the result (if any) with any inversion list from properties.
18884 * The lists are kept separate up to now so that we can distinguish the two
18885 * in regards to matching above-Unicode. A run-time warning is generated
18886 * if a Unicode property is matched against a non-Unicode code point. But,
18887 * we allow user-defined properties to match anything, without any warning,
18888 * and we also suppress the warning if there is a portion of the character
18889 * class that isn't a Unicode property, and which matches above Unicode, \W
18890 * or [\x{110000}] for example.
18891 * (Note that in this case, unlike the Posix one above, there is no
18892 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18893 * forces Unicode semantics */
18897 /* If it matters to the final outcome, see if a non-property
18898 * component of the class matches above Unicode. If so, the
18899 * warning gets suppressed. This is true even if just a single
18900 * such code point is specified, as, though not strictly correct if
18901 * another such code point is matched against, the fact that they
18902 * are using above-Unicode code points indicates they should know
18903 * the issues involved */
18905 warn_super = ! (invert
18906 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18909 _invlist_union(properties, cp_list, &cp_list);
18910 SvREFCNT_dec_NN(properties);
18913 cp_list = properties;
18918 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18920 /* Because an ANYOF node is the only one that warns, this node
18921 * can't be optimized into something else */
18922 optimizable = FALSE;
18926 /* Here, we have calculated what code points should be in the character
18929 * Now we can see about various optimizations. Fold calculation (which we
18930 * did above) needs to take place before inversion. Otherwise /[^k]/i
18931 * would invert to include K, which under /i would match k, which it
18932 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18933 * folded until runtime */
18935 /* If we didn't do folding, it's because some information isn't available
18936 * until runtime; set the run-time fold flag for these We know to set the
18937 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18938 * at least one 0-255 range code point */
18941 /* Some things on the list might be unconditionally included because of
18942 * other components. Remove them, and clean up the list if it goes to
18944 if (only_utf8_locale_list && cp_list) {
18945 _invlist_subtract(only_utf8_locale_list, cp_list,
18946 &only_utf8_locale_list);
18948 if (_invlist_len(only_utf8_locale_list) == 0) {
18949 SvREFCNT_dec_NN(only_utf8_locale_list);
18950 only_utf8_locale_list = NULL;
18953 if ( only_utf8_locale_list
18954 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18955 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18957 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18960 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18962 else if (cp_list && invlist_lowest(cp_list) < 256) {
18963 /* If nothing is below 256, has no locale dependency; otherwise it
18965 anyof_flags |= ANYOFL_FOLD;
18966 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18969 else if ( DEPENDS_SEMANTICS
18970 && ( upper_latin1_only_utf8_matches
18971 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18973 RExC_seen_d_op = TRUE;
18974 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18977 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18981 && ! has_runtime_dependency)
18983 _invlist_invert(cp_list);
18985 /* Clear the invert flag since have just done it here */
18989 /* All possible optimizations below still have these characteristics.
18990 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18992 *flagp |= HASWIDTH|SIMPLE;
18995 *ret_invlist = cp_list;
18997 return (cp_list) ? RExC_emit : 0;
19000 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19001 RExC_contains_locale = 1;
19004 /* Some character classes are equivalent to other nodes. Such nodes take
19005 * up less room, and some nodes require fewer operations to execute, than
19006 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
19007 * improve efficiency. */
19010 PERL_UINT_FAST8_T i;
19011 UV partial_cp_count = 0;
19012 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19013 UV end[MAX_FOLD_FROMS+1] = { 0 };
19014 bool single_range = FALSE;
19016 if (cp_list) { /* Count the code points in enough ranges that we would
19017 see all the ones possible in any fold in this version
19020 invlist_iterinit(cp_list);
19021 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19022 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19025 partial_cp_count += end[i] - start[i] + 1;
19029 single_range = TRUE;
19031 invlist_iterfinish(cp_list);
19034 /* If we know at compile time that this matches every possible code
19035 * point, any run-time dependencies don't matter */
19036 if (start[0] == 0 && end[0] == UV_MAX) {
19038 ret = reganode(pRExC_state, OPFAIL, 0);
19041 ret = reg_node(pRExC_state, SANY);
19047 /* Similarly, for /l posix classes, if both a class and its
19048 * complement match, any run-time dependencies don't matter */
19050 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19053 if ( POSIXL_TEST(posixl, namedclass) /* class */
19054 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19057 ret = reganode(pRExC_state, OPFAIL, 0);
19060 ret = reg_node(pRExC_state, SANY);
19067 /* For well-behaved locales, some classes are subsets of others,
19068 * so complementing the subset and including the non-complemented
19069 * superset should match everything, like [\D[:alnum:]], and
19070 * [[:^alpha:][:alnum:]], but some implementations of locales are
19071 * buggy, and khw thinks its a bad idea to have optimization change
19072 * behavior, even if it avoids an OS bug in a given case */
19074 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19076 /* If is a single posix /l class, can optimize to just that op.
19077 * Such a node will not match anything in the Latin1 range, as that
19078 * is not determinable until runtime, but will match whatever the
19079 * class does outside that range. (Note that some classes won't
19080 * match anything outside the range, like [:ascii:]) */
19081 if ( isSINGLE_BIT_SET(posixl)
19082 && (partial_cp_count == 0 || start[0] > 255))
19085 SV * class_above_latin1 = NULL;
19086 bool already_inverted;
19087 bool are_equivalent;
19089 /* Compute which bit is set, which is the same thing as, e.g.,
19090 * ANYOF_CNTRL. From
19091 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19093 static const int MultiplyDeBruijnBitPosition2[32] =
19095 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19096 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19099 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19100 * 0x077CB531U) >> 27];
19101 classnum = namedclass_to_classnum(namedclass);
19103 /* The named classes are such that the inverted number is one
19104 * larger than the non-inverted one */
19105 already_inverted = namedclass
19106 - classnum_to_namedclass(classnum);
19108 /* Create an inversion list of the official property, inverted
19109 * if the constructed node list is inverted, and restricted to
19110 * only the above latin1 code points, which are the only ones
19111 * known at compile time */
19112 _invlist_intersection_maybe_complement_2nd(
19114 PL_XPosix_ptrs[classnum],
19116 &class_above_latin1);
19117 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19119 SvREFCNT_dec_NN(class_above_latin1);
19121 if (are_equivalent) {
19123 /* Resolve the run-time inversion flag with this possibly
19124 * inverted class */
19125 invert = invert ^ already_inverted;
19127 ret = reg_node(pRExC_state,
19128 POSIXL + invert * (NPOSIXL - POSIXL));
19129 FLAGS(REGNODE_p(ret)) = classnum;
19135 /* khw can't think of any other possible transformation involving
19137 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19141 if (! has_runtime_dependency) {
19143 /* If the list is empty, nothing matches. This happens, for
19144 * example, when a Unicode property that doesn't match anything is
19145 * the only element in the character class (perluniprops.pod notes
19146 * such properties). */
19147 if (partial_cp_count == 0) {
19149 ret = reg_node(pRExC_state, SANY);
19152 ret = reganode(pRExC_state, OPFAIL, 0);
19158 /* If matches everything but \n */
19159 if ( start[0] == 0 && end[0] == '\n' - 1
19160 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19163 ret = reg_node(pRExC_state, REG_ANY);
19169 /* Next see if can optimize classes that contain just a few code points
19170 * into an EXACTish node. The reason to do this is to let the
19171 * optimizer join this node with adjacent EXACTish ones, and ANYOF
19172 * nodes require conversion to code point from UTF-8.
19174 * An EXACTFish node can be generated even if not under /i, and vice
19175 * versa. But care must be taken. An EXACTFish node has to be such
19176 * that it only matches precisely the code points in the class, but we
19177 * want to generate the least restrictive one that does that, to
19178 * increase the odds of being able to join with an adjacent node. For
19179 * example, if the class contains [kK], we have to make it an EXACTFAA
19180 * node to prevent the KELVIN SIGN from matching. Whether we are under
19181 * /i or not is irrelevant in this case. Less obvious is the pattern
19182 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
19183 * supposed to match the single character U+0149 LATIN SMALL LETTER N
19184 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
19185 * that includes \X{02BC}, there is a multi-char fold that does, and so
19186 * the node generated for it must be an EXACTFish one. On the other
19187 * hand qr/:/i should generate a plain EXACT node since the colon
19188 * participates in no fold whatsoever, and having it EXACT tells the
19189 * optimizer the target string cannot match unless it has a colon in
19195 /* Only try if there are no more code points in the class than
19196 * in the max possible fold */
19197 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19199 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19201 /* We can always make a single code point class into an
19202 * EXACTish node. */
19206 /* Here is /l: Use EXACTL, except if there is a fold not
19207 * known until runtime so shows as only a single code point
19208 * here. For code points above 255, we know which can
19209 * cause problems by having a potential fold to the Latin1
19212 || ( start[0] > 255
19213 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19221 else if (! FOLD) { /* Not /l and not /i */
19222 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19224 else if (start[0] < 256) { /* /i, not /l, and the code point is
19227 /* Under /i, it gets a little tricky. A code point that
19228 * doesn't participate in a fold should be an EXACT node.
19229 * We know this one isn't the result of a simple fold, or
19230 * there'd be more than one code point in the list, but it
19231 * could be part of a multi- character fold. In that case
19232 * we better not create an EXACT node, as we would wrongly
19233 * be telling the optimizer that this code point must be in
19234 * the target string, and that is wrong. This is because
19235 * if the sequence around this code point forms a
19236 * multi-char fold, what needs to be in the string could be
19237 * the code point that folds to the sequence.
19239 * This handles the case of below-255 code points, as we
19240 * have an easy look up for those. The next clause handles
19241 * the above-256 one */
19242 op = IS_IN_SOME_FOLD_L1(start[0])
19246 else { /* /i, larger code point. Since we are under /i, and
19247 have just this code point, we know that it can't
19248 fold to something else, so PL_InMultiCharFold
19250 op = _invlist_contains_cp(PL_InMultiCharFold,
19258 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19259 && _invlist_contains_cp(PL_in_some_fold, start[0]))
19261 /* Here, the only runtime dependency, if any, is from /d, and
19262 * the class matches more than one code point, and the lowest
19263 * code point participates in some fold. It might be that the
19264 * other code points are /i equivalent to this one, and hence
19265 * they would representable by an EXACTFish node. Above, we
19266 * eliminated classes that contain too many code points to be
19267 * EXACTFish, with the test for MAX_FOLD_FROMS
19269 * First, special case the ASCII fold pairs, like 'B' and 'b'.
19270 * We do this because we have EXACTFAA at our disposal for the
19272 if (partial_cp_count == 2 && isASCII(start[0])) {
19274 /* The only ASCII characters that participate in folds are
19276 assert(isALPHA(start[0]));
19277 if ( end[0] == start[0] /* First range is a single
19278 character, so 2nd exists */
19279 && isALPHA_FOLD_EQ(start[0], start[1]))
19282 /* Here, is part of an ASCII fold pair */
19284 if ( ASCII_FOLD_RESTRICTED
19285 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19287 /* If the second clause just above was true, it
19288 * means we can't be under /i, or else the list
19289 * would have included more than this fold pair.
19290 * Therefore we have to exclude the possibility of
19291 * whatever else it is that folds to these, by
19292 * using EXACTFAA */
19295 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19297 /* Here, there's no simple fold that start[0] is part
19298 * of, but there is a multi-character one. If we
19299 * are not under /i, we want to exclude that
19300 * possibility; if under /i, we want to include it
19302 op = (FOLD) ? EXACTFU : EXACTFAA;
19306 /* Here, the only possible fold start[0] particpates in
19307 * is with start[1]. /i or not isn't relevant */
19311 value = toFOLD(start[0]);
19314 else if ( ! upper_latin1_only_utf8_matches
19315 || ( _invlist_len(upper_latin1_only_utf8_matches)
19318 invlist_highest(upper_latin1_only_utf8_matches)]
19321 /* Here, the smallest character is non-ascii or there are
19322 * more than 2 code points matched by this node. Also, we
19323 * either don't have /d UTF-8 dependent matches, or if we
19324 * do, they look like they could be a single character that
19325 * is the fold of the lowest one in the always-match list.
19326 * This test quickly excludes most of the false positives
19327 * when there are /d UTF-8 depdendent matches. These are
19328 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19329 * SMALL LETTER A WITH GRAVE iff the target string is
19330 * UTF-8. (We don't have to worry above about exceeding
19331 * the array bounds of PL_fold_latin1[] because any code
19332 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19334 * EXACTFAA would apply only to pairs (hence exactly 2 code
19335 * points) in the ASCII range, so we can't use it here to
19336 * artificially restrict the fold domain, so we check if
19337 * the class does or does not match some EXACTFish node.
19338 * Further, if we aren't under /i, and the folded-to
19339 * character is part of a multi-character fold, we can't do
19340 * this optimization, as the sequence around it could be
19341 * that multi-character fold, and we don't here know the
19342 * context, so we have to assume it is that multi-char
19343 * fold, to prevent potential bugs.
19345 * To do the general case, we first find the fold of the
19346 * lowest code point (which may be higher than the lowest
19347 * one), then find everything that folds to it. (The data
19348 * structure we have only maps from the folded code points,
19349 * so we have to do the earlier step.) */
19352 U8 foldbuf[UTF8_MAXBYTES_CASE];
19353 UV folded = _to_uni_fold_flags(start[0],
19354 foldbuf, &foldlen, 0);
19356 const U32 * remaining_folds;
19357 Size_t folds_to_this_cp_count = _inverse_folds(
19361 Size_t folds_count = folds_to_this_cp_count + 1;
19362 SV * fold_list = _new_invlist(folds_count);
19365 /* If there are UTF-8 dependent matches, create a temporary
19366 * list of what this node matches, including them. */
19367 SV * all_cp_list = NULL;
19368 SV ** use_this_list = &cp_list;
19370 if (upper_latin1_only_utf8_matches) {
19371 all_cp_list = _new_invlist(0);
19372 use_this_list = &all_cp_list;
19373 _invlist_union(cp_list,
19374 upper_latin1_only_utf8_matches,
19378 /* Having gotten everything that participates in the fold
19379 * containing the lowest code point, we turn that into an
19380 * inversion list, making sure everything is included. */
19381 fold_list = add_cp_to_invlist(fold_list, start[0]);
19382 fold_list = add_cp_to_invlist(fold_list, folded);
19383 if (folds_to_this_cp_count > 0) {
19384 fold_list = add_cp_to_invlist(fold_list, first_fold);
19385 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19386 fold_list = add_cp_to_invlist(fold_list,
19387 remaining_folds[i]);
19391 /* If the fold list is identical to what's in this ANYOF
19392 * node, the node can be represented by an EXACTFish one
19394 if (_invlistEQ(*use_this_list, fold_list,
19395 0 /* Don't complement */ )
19398 /* But, we have to be careful, as mentioned above.
19399 * Just the right sequence of characters could match
19400 * this if it is part of a multi-character fold. That
19401 * IS what we want if we are under /i. But it ISN'T
19402 * what we want if not under /i, as it could match when
19403 * it shouldn't. So, when we aren't under /i and this
19404 * character participates in a multi-char fold, we
19405 * don't optimize into an EXACTFish node. So, for each
19406 * case below we have to check if we are folding
19407 * and if not, if it is not part of a multi-char fold.
19409 if (start[0] > 255) { /* Highish code point */
19410 if (FOLD || ! _invlist_contains_cp(
19411 PL_InMultiCharFold, folded))
19415 : (ASCII_FOLD_RESTRICTED)
19420 } /* Below, the lowest code point < 256 */
19423 && DEPENDS_SEMANTICS)
19424 { /* An EXACTF node containing a single character
19425 's', can be an EXACTFU if it doesn't get
19426 joined with an adjacent 's' */
19427 op = EXACTFU_S_EDGE;
19431 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19433 if (upper_latin1_only_utf8_matches) {
19436 /* We can't use the fold, as that only matches
19440 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19442 { /* EXACTFUP is a special node for this
19444 op = (ASCII_FOLD_RESTRICTED)
19447 value = MICRO_SIGN;
19449 else if ( ASCII_FOLD_RESTRICTED
19450 && ! isASCII(start[0]))
19451 { /* For ASCII under /iaa, we can use EXACTFU
19463 SvREFCNT_dec_NN(fold_list);
19464 SvREFCNT_dec(all_cp_list);
19471 /* Here, we have calculated what EXACTish node to use. Have to
19472 * convert to UTF-8 if not already there */
19475 SvREFCNT_dec(cp_list);;
19476 REQUIRE_UTF8(flagp);
19479 /* This is a kludge to the special casing issues with this
19480 * ligature under /aa. FB05 should fold to FB06, but the
19481 * call above to _to_uni_fold_flags() didn't find this, as
19482 * it didn't use the /aa restriction in order to not miss
19483 * other folds that would be affected. This is the only
19484 * instance likely to ever be a problem in all of Unicode.
19485 * So special case it. */
19486 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19487 && ASCII_FOLD_RESTRICTED)
19489 value = LATIN_SMALL_LIGATURE_ST;
19493 len = (UTF) ? UVCHR_SKIP(value) : 1;
19495 ret = regnode_guts(pRExC_state, op, len, "exact");
19496 FILL_NODE(ret, op);
19497 RExC_emit += 1 + STR_SZ(len);
19498 setSTR_LEN(REGNODE_p(ret), len);
19500 *STRINGs(REGNODE_p(ret)) = (U8) value;
19503 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19509 if (! has_runtime_dependency) {
19511 /* See if this can be turned into an ANYOFM node. Think about the
19512 * bit patterns in two different bytes. In some positions, the
19513 * bits in each will be 1; and in other positions both will be 0;
19514 * and in some positions the bit will be 1 in one byte, and 0 in
19515 * the other. Let 'n' be the number of positions where the bits
19516 * differ. We create a mask which has exactly 'n' 0 bits, each in
19517 * a position where the two bytes differ. Now take the set of all
19518 * bytes that when ANDed with the mask yield the same result. That
19519 * set has 2**n elements, and is representable by just two 8 bit
19520 * numbers: the result and the mask. Importantly, matching the set
19521 * can be vectorized by creating a word full of the result bytes,
19522 * and a word full of the mask bytes, yielding a significant speed
19523 * up. Here, see if this node matches such a set. As a concrete
19524 * example consider [01], and the byte representing '0' which is
19525 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19526 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19527 * 0x30. Any other bytes ANDed yield something else. So [01],
19528 * which is a common usage, is optimizable into ANYOFM, and can
19529 * benefit from the speed up. We can only do this on UTF-8
19530 * invariant bytes, because they have the same bit patterns under
19532 PERL_UINT_FAST8_T inverted = 0;
19534 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19536 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19538 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19539 * If that works we will instead later generate an NANYOFM, and
19540 * invert back when through */
19541 if (invlist_highest(cp_list) > max_permissible) {
19542 _invlist_invert(cp_list);
19546 if (invlist_highest(cp_list) <= max_permissible) {
19547 UV this_start, this_end;
19548 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19549 U8 bits_differing = 0;
19550 Size_t full_cp_count = 0;
19551 bool first_time = TRUE;
19553 /* Go through the bytes and find the bit positions that differ
19555 invlist_iterinit(cp_list);
19556 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19557 unsigned int i = this_start;
19560 if (! UVCHR_IS_INVARIANT(i)) {
19564 first_time = FALSE;
19565 lowest_cp = this_start;
19567 /* We have set up the code point to compare with.
19568 * Don't compare it with itself */
19572 /* Find the bit positions that differ from the lowest code
19573 * point in the node. Keep track of all such positions by
19575 for (; i <= this_end; i++) {
19576 if (! UVCHR_IS_INVARIANT(i)) {
19580 bits_differing |= i ^ lowest_cp;
19583 full_cp_count += this_end - this_start + 1;
19586 /* At the end of the loop, we count how many bits differ from
19587 * the bits in lowest code point, call the count 'd'. If the
19588 * set we found contains 2**d elements, it is the closure of
19589 * all code points that differ only in those bit positions. To
19590 * convince yourself of that, first note that the number in the
19591 * closure must be a power of 2, which we test for. The only
19592 * way we could have that count and it be some differing set,
19593 * is if we got some code points that don't differ from the
19594 * lowest code point in any position, but do differ from each
19595 * other in some other position. That means one code point has
19596 * a 1 in that position, and another has a 0. But that would
19597 * mean that one of them differs from the lowest code point in
19598 * that position, which possibility we've already excluded. */
19599 if ( (inverted || full_cp_count > 1)
19600 && full_cp_count == 1U << PL_bitcount[bits_differing])
19604 op = ANYOFM + inverted;;
19606 /* We need to make the bits that differ be 0's */
19607 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19609 /* The argument is the lowest code point */
19610 ret = reganode(pRExC_state, op, lowest_cp);
19611 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19615 invlist_iterfinish(cp_list);
19619 _invlist_invert(cp_list);
19626 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19627 * all were invariants, it wasn't inverted, and there is a single
19628 * range. This would be faster than some of the posix nodes we
19629 * create below like /\d/a, but would be twice the size. Without
19630 * having actually measured the gain, khw doesn't think the
19631 * tradeoff is really worth it */
19634 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19635 PERL_UINT_FAST8_T type;
19636 SV * intersection = NULL;
19637 SV* d_invlist = NULL;
19639 /* See if this matches any of the POSIX classes. The POSIXA and
19640 * POSIXD ones are about the same speed as ANYOF ops, but take less
19641 * room; the ones that have above-Latin1 code point matches are
19642 * somewhat faster than ANYOF. */
19644 for (type = POSIXA; type >= POSIXD; type--) {
19647 if (type == POSIXL) { /* But not /l posix classes */
19651 for (posix_class = 0;
19652 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19655 SV** our_code_points = &cp_list;
19656 SV** official_code_points;
19659 if (type == POSIXA) {
19660 official_code_points = &PL_Posix_ptrs[posix_class];
19663 official_code_points = &PL_XPosix_ptrs[posix_class];
19666 /* Skip non-existent classes of this type. e.g. \v only
19667 * has an entry in PL_XPosix_ptrs */
19668 if (! *official_code_points) {
19672 /* Try both the regular class, and its inversion */
19673 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19674 bool this_inverted = invert ^ try_inverted;
19676 if (type != POSIXD) {
19678 /* This class that isn't /d can't match if we have
19679 * /d dependencies */
19680 if (has_runtime_dependency
19681 & HAS_D_RUNTIME_DEPENDENCY)
19686 else /* is /d */ if (! this_inverted) {
19688 /* /d classes don't match anything non-ASCII below
19689 * 256 unconditionally (which cp_list contains) */
19690 _invlist_intersection(cp_list, PL_UpperLatin1,
19692 if (_invlist_len(intersection) != 0) {
19696 SvREFCNT_dec(d_invlist);
19697 d_invlist = invlist_clone(cp_list, NULL);
19699 /* But under UTF-8 it turns into using /u rules.
19700 * Add the things it matches under these conditions
19701 * so that we check below that these are identical
19702 * to what the tested class should match */
19703 if (upper_latin1_only_utf8_matches) {
19706 upper_latin1_only_utf8_matches,
19709 our_code_points = &d_invlist;
19711 else { /* POSIXD, inverted. If this doesn't have this
19712 flag set, it isn't /d. */
19713 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19717 our_code_points = &cp_list;
19720 /* Here, have weeded out some things. We want to see
19721 * if the list of characters this node contains
19722 * ('*our_code_points') precisely matches those of the
19723 * class we are currently checking against
19724 * ('*official_code_points'). */
19725 if (_invlistEQ(*our_code_points,
19726 *official_code_points,
19729 /* Here, they precisely match. Optimize this ANYOF
19730 * node into its equivalent POSIX one of the
19731 * correct type, possibly inverted */
19732 ret = reg_node(pRExC_state, (try_inverted)
19736 FLAGS(REGNODE_p(ret)) = posix_class;
19737 SvREFCNT_dec(d_invlist);
19738 SvREFCNT_dec(intersection);
19744 SvREFCNT_dec(d_invlist);
19745 SvREFCNT_dec(intersection);
19748 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19749 * both in size and speed. Currently, a 20 bit range base (smallest
19750 * code point in the range), and a 12 bit maximum delta are packed into
19751 * a 32 bit word. This allows for using it on all of the Unicode code
19752 * points except for the highest plane, which is only for private use
19753 * code points. khw doubts that a bigger delta is likely in real world
19756 && ! has_runtime_dependency
19757 && anyof_flags == 0
19758 && start[0] < (1 << ANYOFR_BASE_BITS)
19759 && end[0] - start[0]
19760 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19761 * CHARBITS - ANYOFR_BASE_BITS))))
19764 U8 low_utf8[UTF8_MAXBYTES+1];
19765 U8 high_utf8[UTF8_MAXBYTES+1];
19767 ret = reganode(pRExC_state, ANYOFR,
19768 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19770 /* Place the lowest UTF-8 start byte in the flags field, so as to
19771 * allow efficient ruling out at run time of many possible inputs.
19773 (void) uvchr_to_utf8(low_utf8, start[0]);
19774 (void) uvchr_to_utf8(high_utf8, end[0]);
19776 /* If all code points share the same first byte, this can be an
19777 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19778 * quickly rule out many inputs at run-time without having to
19779 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19780 * not doing that transformation would not rule out nearly so many
19782 if (low_utf8[0] == high_utf8[0]) {
19783 OP(REGNODE_p(ret)) = ANYOFRb;
19784 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19787 ANYOF_FLAGS(REGNODE_p(ret))
19788 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19794 /* If didn't find an optimization and there is no need for a bitmap,
19795 * optimize to indicate that */
19796 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19798 && ! upper_latin1_only_utf8_matches
19799 && anyof_flags == 0)
19801 U8 low_utf8[UTF8_MAXBYTES+1];
19802 UV highest_cp = invlist_highest(cp_list);
19804 /* Currently the maximum allowed code point by the system is
19805 * IV_MAX. Higher ones are reserved for future internal use. This
19806 * particular regnode can be used for higher ones, but we can't
19807 * calculate the code point of those. IV_MAX suffices though, as
19808 * it will be a large first byte */
19809 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19812 /* We store the lowest possible first byte of the UTF-8
19813 * representation, using the flags field. This allows for quick
19814 * ruling out of some inputs without having to convert from UTF-8
19815 * to code point. For EBCDIC, we use I8, as not doing that
19816 * transformation would not rule out nearly so many things */
19817 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19821 /* If the first UTF-8 start byte for the highest code point in the
19822 * range is suitably small, we may be able to get an upper bound as
19824 if (highest_cp <= IV_MAX) {
19825 U8 high_utf8[UTF8_MAXBYTES+1];
19826 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19829 /* If the lowest and highest are the same, we can get an exact
19830 * first byte instead of a just minimum or even a sequence of
19831 * exact leading bytes. We signal these with different
19833 if (low_utf8[0] == high_utf8[0]) {
19834 Size_t len = find_first_differing_byte_pos(low_utf8,
19836 MIN(low_len, high_len));
19840 /* No need to convert to I8 for EBCDIC as this is an
19842 anyof_flags = low_utf8[0];
19847 ret = regnode_guts(pRExC_state, op,
19848 regarglen[op] + STR_SZ(len),
19850 FILL_NODE(ret, op);
19851 ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19853 Copy(low_utf8, /* Add the common bytes */
19854 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19856 RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19857 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19858 NULL, only_utf8_locale_list);
19862 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19865 /* Here, the high byte is not the same as the low, but is
19866 * small enough that its reasonable to have a loose upper
19867 * bound, which is packed in with the strict lower bound.
19868 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19869 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19870 * is the same thing as UTF-8 */
19873 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19874 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19877 if (range_diff <= max_range_diff / 8) {
19880 else if (range_diff <= max_range_diff / 4) {
19883 else if (range_diff <= max_range_diff / 2) {
19886 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19891 goto done_finding_op;
19893 } /* End of seeing if can optimize it into a different node */
19895 is_anyof: /* It's going to be an ANYOF node. */
19896 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19906 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19907 FILL_NODE(ret, op); /* We set the argument later */
19908 RExC_emit += 1 + regarglen[op];
19909 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19911 /* Here, <cp_list> contains all the code points we can determine at
19912 * compile time that match under all conditions. Go through it, and
19913 * for things that belong in the bitmap, put them there, and delete from
19914 * <cp_list>. While we are at it, see if everything above 255 is in the
19915 * list, and if so, set a flag to speed up execution */
19917 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19920 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19924 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19927 /* Here, the bitmap has been populated with all the Latin1 code points that
19928 * always match. Can now add to the overall list those that match only
19929 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19931 if (upper_latin1_only_utf8_matches) {
19933 _invlist_union(cp_list,
19934 upper_latin1_only_utf8_matches,
19936 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19939 cp_list = upper_latin1_only_utf8_matches;
19941 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19944 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19945 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19948 only_utf8_locale_list);
19949 SvREFCNT_dec(cp_list);;
19950 SvREFCNT_dec(only_utf8_locale_list);
19955 /* Here, the node is getting optimized into something that's not an ANYOF
19956 * one. Finish up. */
19958 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19959 RExC_parse - orig_parse);;
19960 SvREFCNT_dec(cp_list);;
19961 SvREFCNT_dec(only_utf8_locale_list);
19965 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19968 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19969 regnode* const node,
19971 SV* const runtime_defns,
19972 SV* const only_utf8_locale_list)
19974 /* Sets the arg field of an ANYOF-type node 'node', using information about
19975 * the node passed-in. If there is nothing outside the node's bitmap, the
19976 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19977 * the count returned by add_data(), having allocated and stored an array,
19980 * av[0] stores the inversion list defining this class as far as known at
19981 * this time, or PL_sv_undef if nothing definite is now known.
19982 * av[1] stores the inversion list of code points that match only if the
19983 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19984 * av[2], or no entry otherwise.
19985 * av[2] stores the list of user-defined properties whose subroutine
19986 * definitions aren't known at this time, or no entry if none. */
19990 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19992 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19993 assert(! (ANYOF_FLAGS(node)
19994 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19995 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19998 AV * const av = newAV();
20002 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20005 /* (Note that if any of this changes, the size calculations in
20006 * S_optimize_regclass() might need to be updated.) */
20008 if (only_utf8_locale_list) {
20009 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20010 SvREFCNT_inc_NN(only_utf8_locale_list));
20013 if (runtime_defns) {
20014 av_store(av, DEFERRED_USER_DEFINED_INDEX,
20015 SvREFCNT_inc_NN(runtime_defns));
20018 rv = newRV_noinc(MUTABLE_SV(av));
20019 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20020 RExC_rxi->data->data[n] = (void*)rv;
20027 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20028 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20030 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20034 /* For internal core use only.
20035 * Returns the inversion list for the input 'node' in the regex 'prog'.
20036 * If <doinit> is 'true', will attempt to create the inversion list if not
20038 * If <listsvp> is non-null, will return the printable contents of the
20039 * property definition. This can be used to get debugging information
20040 * even before the inversion list exists, by calling this function with
20041 * 'doinit' set to false, in which case the components that will be used
20042 * to eventually create the inversion list are returned (in a printable
20044 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20045 * store an inversion list of code points that should match only if the
20046 * execution-time locale is a UTF-8 one.
20047 * If <output_invlist> is not NULL, it is where this routine is to store an
20048 * inversion list of the code points that would be instead returned in
20049 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20050 * when this parameter is used, is just the non-code point data that
20051 * will go into creating the inversion list. This currently should be just
20052 * user-defined properties whose definitions were not known at compile
20053 * time. Using this parameter allows for easier manipulation of the
20054 * inversion list's data by the caller. It is illegal to call this
20055 * function with this parameter set, but not <listsvp>
20057 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20058 * that, in spite of this function's name, the inversion list it returns
20059 * may include the bitmap data as well */
20061 SV *si = NULL; /* Input initialization string */
20062 SV* invlist = NULL;
20064 RXi_GET_DECL(prog, progi);
20065 const struct reg_data * const data = prog ? progi->data : NULL;
20067 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20068 PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20070 PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20072 assert(! output_invlist || listsvp);
20074 if (data && data->count) {
20075 const U32 n = ARG(node);
20077 if (data->what[n] == 's') {
20078 SV * const rv = MUTABLE_SV(data->data[n]);
20079 AV * const av = MUTABLE_AV(SvRV(rv));
20080 SV **const ary = AvARRAY(av);
20082 invlist = ary[INVLIST_INDEX];
20084 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20085 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20088 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20089 si = ary[DEFERRED_USER_DEFINED_INDEX];
20092 if (doinit && (si || invlist)) {
20095 SV * msg = newSVpvs_flags("", SVs_TEMP);
20097 SV * prop_definition = handle_user_defined_property(
20098 "", 0, FALSE, /* There is no \p{}, \P{} */
20099 SvPVX_const(si)[1] - '0', /* /i or not has been
20100 stored here for just
20102 TRUE, /* run time */
20103 FALSE, /* This call must find the defn */
20104 si, /* The property definition */
20107 0 /* base level call */
20111 assert(prop_definition == NULL);
20113 Perl_croak(aTHX_ "%" UTF8f,
20114 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20118 _invlist_union(invlist, prop_definition, &invlist);
20119 SvREFCNT_dec_NN(prop_definition);
20122 invlist = prop_definition;
20125 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20126 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20128 ary[INVLIST_INDEX] = invlist;
20129 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20130 ? ONLY_LOCALE_MATCHES_INDEX
20138 /* If requested, return a printable version of what this ANYOF node matches
20141 SV* matches_string = NULL;
20143 /* This function can be called at compile-time, before everything gets
20144 * resolved, in which case we return the currently best available
20145 * information, which is the string that will eventually be used to do
20146 * that resolving, 'si' */
20148 /* Here, we only have 'si' (and possibly some passed-in data in
20149 * 'invlist', which is handled below) If the caller only wants
20150 * 'si', use that. */
20151 if (! output_invlist) {
20152 matches_string = newSVsv(si);
20155 /* But if the caller wants an inversion list of the node, we
20156 * need to parse 'si' and place as much as possible in the
20157 * desired output inversion list, making 'matches_string' only
20158 * contain the currently unresolvable things */
20159 const char *si_string = SvPVX(si);
20160 STRLEN remaining = SvCUR(si);
20164 /* Ignore everything before and including the first new-line */
20165 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20166 assert (si_string != NULL);
20168 remaining = SvPVX(si) + SvCUR(si) - si_string;
20170 while (remaining > 0) {
20172 /* The data consists of just strings defining user-defined
20173 * property names, but in prior incarnations, and perhaps
20174 * somehow from pluggable regex engines, it could still
20175 * hold hex code point definitions, all of which should be
20176 * legal (or it wouldn't have gotten this far). Each
20177 * component of a range would be separated by a tab, and
20178 * each range by a new-line. If these are found, instead
20179 * add them to the inversion list */
20180 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
20181 |PERL_SCAN_SILENT_NON_PORTABLE;
20182 STRLEN len = remaining;
20183 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20185 /* If the hex decode routine found something, it should go
20186 * up to the next \n */
20187 if ( *(si_string + len) == '\n') {
20188 if (count) { /* 2nd code point on line */
20189 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20192 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20195 goto prepare_for_next_iteration;
20198 /* If the hex decode was instead for the lower range limit,
20199 * save it, and go parse the upper range limit */
20200 if (*(si_string + len) == '\t') {
20201 assert(count == 0);
20205 prepare_for_next_iteration:
20206 si_string += len + 1;
20207 remaining -= len + 1;
20211 /* Here, didn't find a legal hex number. Just add the text
20212 * from here up to the next \n, omitting any trailing
20216 len = strcspn(si_string,
20217 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20219 if (matches_string) {
20220 sv_catpvn(matches_string, si_string, len);
20223 matches_string = newSVpvn(si_string, len);
20225 sv_catpvs(matches_string, " ");
20229 && UCHARAT(si_string)
20230 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20235 if (remaining && UCHARAT(si_string) == '\n') {
20239 } /* end of loop through the text */
20241 assert(matches_string);
20242 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
20243 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20245 } /* end of has an 'si' */
20248 /* Add the stuff that's already known */
20251 /* Again, if the caller doesn't want the output inversion list, put
20252 * everything in 'matches-string' */
20253 if (! output_invlist) {
20254 if ( ! matches_string) {
20255 matches_string = newSVpvs("\n");
20257 sv_catsv(matches_string, invlist_contents(invlist,
20258 TRUE /* traditional style */
20261 else if (! *output_invlist) {
20262 *output_invlist = invlist_clone(invlist, NULL);
20265 _invlist_union(*output_invlist, invlist, output_invlist);
20269 *listsvp = matches_string;
20275 /* reg_skipcomment()
20277 Absorbs an /x style # comment from the input stream,
20278 returning a pointer to the first character beyond the comment, or if the
20279 comment terminates the pattern without anything following it, this returns
20280 one past the final character of the pattern (in other words, RExC_end) and
20281 sets the REG_RUN_ON_COMMENT_SEEN flag.
20283 Note it's the callers responsibility to ensure that we are
20284 actually in /x mode
20288 PERL_STATIC_INLINE char*
20289 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20291 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20295 while (p < RExC_end) {
20296 if (*(++p) == '\n') {
20301 /* we ran off the end of the pattern without ending the comment, so we have
20302 * to add an \n when wrapping */
20303 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20308 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20310 const bool force_to_xmod
20313 /* If the text at the current parse position '*p' is a '(?#...)' comment,
20314 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20315 * is /x whitespace, advance '*p' so that on exit it points to the first
20316 * byte past all such white space and comments */
20318 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20320 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20322 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20325 if (RExC_end - (*p) >= 3
20327 && *(*p + 1) == '?'
20328 && *(*p + 2) == '#')
20330 while (*(*p) != ')') {
20331 if ((*p) == RExC_end)
20332 FAIL("Sequence (?#... not terminated");
20340 const char * save_p = *p;
20341 while ((*p) < RExC_end) {
20343 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20346 else if (*(*p) == '#') {
20347 (*p) = reg_skipcomment(pRExC_state, (*p));
20353 if (*p != save_p) {
20366 Advances the parse position by one byte, unless that byte is the beginning
20367 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20368 those two cases, the parse position is advanced beyond all such comments and
20371 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20375 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20377 PERL_ARGS_ASSERT_NEXTCHAR;
20379 if (RExC_parse < RExC_end) {
20381 || UTF8_IS_INVARIANT(*RExC_parse)
20382 || UTF8_IS_START(*RExC_parse));
20384 RExC_parse += (UTF)
20385 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20388 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20389 FALSE /* Don't force /x */ );
20394 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20396 /* 'size' is the delta number of smallest regnode equivalents to add or
20397 * subtract from the current memory allocated to the regex engine being
20400 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20405 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20406 /* +1 for REG_MAGIC */
20409 if ( RExC_rxi == NULL )
20410 FAIL("Regexp out of space");
20411 RXi_SET(RExC_rx, RExC_rxi);
20413 RExC_emit_start = RExC_rxi->program;
20415 Zero(REGNODE_p(RExC_emit), size, regnode);
20418 #ifdef RE_TRACK_PATTERN_OFFSETS
20419 Renew(RExC_offsets, 2*RExC_size+1, U32);
20421 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20423 RExC_offsets[0] = RExC_size;
20427 STATIC regnode_offset
20428 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20430 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20431 * equivalents space. It aligns and increments RExC_size
20433 * It returns the regnode's offset into the regex engine program */
20435 const regnode_offset ret = RExC_emit;
20437 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20439 PERL_ARGS_ASSERT_REGNODE_GUTS;
20441 SIZE_ALIGN(RExC_size);
20442 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20443 NODE_ALIGN_FILL(REGNODE_p(ret));
20444 #ifndef RE_TRACK_PATTERN_OFFSETS
20445 PERL_UNUSED_ARG(name);
20446 PERL_UNUSED_ARG(op);
20448 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20450 if (RExC_offsets) { /* MJD */
20452 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20455 (UV)(RExC_emit) > RExC_offsets[0]
20456 ? "Overwriting end of array!\n" : "OK",
20458 (UV)(RExC_parse - RExC_start),
20459 (UV)RExC_offsets[0]));
20460 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20467 - reg_node - emit a node
20469 STATIC regnode_offset /* Location. */
20470 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20472 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20473 regnode_offset ptr = ret;
20475 PERL_ARGS_ASSERT_REG_NODE;
20477 assert(regarglen[op] == 0);
20479 FILL_ADVANCE_NODE(ptr, op);
20485 - reganode - emit a node with an argument
20487 STATIC regnode_offset /* Location. */
20488 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20490 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20491 regnode_offset ptr = ret;
20493 PERL_ARGS_ASSERT_REGANODE;
20495 /* ANYOF are special cased to allow non-length 1 args */
20496 assert(regarglen[op] == 1);
20498 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20504 - regpnode - emit a temporary node with a SV* argument
20506 STATIC regnode_offset /* Location. */
20507 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20509 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20510 regnode_offset ptr = ret;
20512 PERL_ARGS_ASSERT_REGPNODE;
20514 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20519 STATIC regnode_offset
20520 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20522 /* emit a node with U32 and I32 arguments */
20524 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20525 regnode_offset ptr = ret;
20527 PERL_ARGS_ASSERT_REG2LANODE;
20529 assert(regarglen[op] == 2);
20531 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20537 - reginsert - insert an operator in front of already-emitted operand
20539 * That means that on exit 'operand' is the offset of the newly inserted
20540 * operator, and the original operand has been relocated.
20542 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20543 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20545 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20546 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20548 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20551 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20552 const regnode_offset operand, const U32 depth)
20557 const int offset = regarglen[(U8)op];
20558 const int size = NODE_STEP_REGNODE + offset;
20559 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20561 PERL_ARGS_ASSERT_REGINSERT;
20562 PERL_UNUSED_CONTEXT;
20563 PERL_UNUSED_ARG(depth);
20564 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20565 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20566 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20567 studying. If this is wrong then we need to adjust RExC_recurse
20568 below like we do with RExC_open_parens/RExC_close_parens. */
20569 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20570 src = REGNODE_p(RExC_emit);
20572 dst = REGNODE_p(RExC_emit);
20574 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20575 * and [perl #133871] shows this can lead to problems, so skip this
20576 * realignment of parens until a later pass when they are reliable */
20577 if (! IN_PARENS_PASS && RExC_open_parens) {
20579 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20580 /* remember that RExC_npar is rex->nparens + 1,
20581 * iow it is 1 more than the number of parens seen in
20582 * the pattern so far. */
20583 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20584 /* note, RExC_open_parens[0] is the start of the
20585 * regex, it can't move. RExC_close_parens[0] is the end
20586 * of the regex, it *can* move. */
20587 if ( paren && RExC_open_parens[paren] >= operand ) {
20588 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20589 RExC_open_parens[paren] += size;
20591 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20593 if ( RExC_close_parens[paren] >= operand ) {
20594 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20595 RExC_close_parens[paren] += size;
20597 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20602 RExC_end_op += size;
20604 while (src > REGNODE_p(operand)) {
20605 StructCopy(--src, --dst, regnode);
20606 #ifdef RE_TRACK_PATTERN_OFFSETS
20607 if (RExC_offsets) { /* MJD 20010112 */
20609 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20613 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20614 ? "Overwriting end of array!\n" : "OK",
20615 (UV)REGNODE_OFFSET(src),
20616 (UV)REGNODE_OFFSET(dst),
20617 (UV)RExC_offsets[0]));
20618 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20619 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20624 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20625 #ifdef RE_TRACK_PATTERN_OFFSETS
20626 if (RExC_offsets) { /* MJD */
20628 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20632 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20633 ? "Overwriting end of array!\n" : "OK",
20634 (UV)REGNODE_OFFSET(place),
20635 (UV)(RExC_parse - RExC_start),
20636 (UV)RExC_offsets[0]));
20637 Set_Node_Offset(place, RExC_parse);
20638 Set_Node_Length(place, 1);
20641 src = NEXTOPER(place);
20643 FILL_NODE(operand, op);
20645 /* Zero out any arguments in the new node */
20646 Zero(src, offset, regnode);
20650 - regtail - set the next-pointer at the end of a node chain of p to val. If
20651 that value won't fit in the space available, instead returns FALSE.
20652 (Except asserts if we can't fit in the largest space the regex
20653 engine is designed for.)
20654 - SEE ALSO: regtail_study
20657 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20658 const regnode_offset p,
20659 const regnode_offset val,
20662 regnode_offset scan;
20663 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20665 PERL_ARGS_ASSERT_REGTAIL;
20667 PERL_UNUSED_ARG(depth);
20670 /* The final node in the chain is the first one with a nonzero next pointer
20672 scan = (regnode_offset) p;
20674 regnode * const temp = regnext(REGNODE_p(scan));
20676 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20677 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20678 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
20679 SvPV_nolen_const(RExC_mysv), scan,
20680 (temp == NULL ? "->" : ""),
20681 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20686 scan = REGNODE_OFFSET(temp);
20689 /* Populate this node's next pointer */
20690 assert(val >= scan);
20691 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20692 assert((UV) (val - scan) <= U32_MAX);
20693 ARG_SET(REGNODE_p(scan), val - scan);
20696 if (val - scan > U16_MAX) {
20697 /* Populate this with something that won't loop and will likely
20698 * lead to a crash if the caller ignores the failure return, and
20699 * execution continues */
20700 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20703 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20711 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20712 - Look for optimizable sequences at the same time.
20713 - currently only looks for EXACT chains.
20715 This is experimental code. The idea is to use this routine to perform
20716 in place optimizations on branches and groups as they are constructed,
20717 with the long term intention of removing optimization from study_chunk so
20718 that it is purely analytical.
20720 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20721 to control which is which.
20723 This used to return a value that was ignored. It was a problem that it is
20724 #ifdef'd to be another function that didn't return a value. khw has changed it
20725 so both currently return a pass/fail return.
20728 /* TODO: All four parms should be const */
20731 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20732 const regnode_offset val, U32 depth)
20734 regnode_offset scan;
20736 #ifdef EXPERIMENTAL_INPLACESCAN
20739 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20741 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20744 /* Find last node. */
20748 regnode * const temp = regnext(REGNODE_p(scan));
20749 #ifdef EXPERIMENTAL_INPLACESCAN
20750 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20751 bool unfolded_multi_char; /* Unexamined in this routine */
20752 if (join_exact(pRExC_state, scan, &min,
20753 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20754 return TRUE; /* Was return EXACT */
20758 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20759 if (exact == PSEUDO )
20760 exact= OP(REGNODE_p(scan));
20761 else if (exact != OP(REGNODE_p(scan)) )
20764 else if (OP(REGNODE_p(scan)) != NOTHING) {
20769 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20770 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20771 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
20772 SvPV_nolen_const(RExC_mysv),
20774 PL_reg_name[exact]);
20778 scan = REGNODE_OFFSET(temp);
20781 DEBUG_PARSE_MSG("");
20782 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20783 Perl_re_printf( aTHX_
20784 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20785 SvPV_nolen_const(RExC_mysv),
20790 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20791 assert((UV) (val - scan) <= U32_MAX);
20792 ARG_SET(REGNODE_p(scan), val - scan);
20795 if (val - scan > U16_MAX) {
20796 /* Populate this with something that won't loop and will likely
20797 * lead to a crash if the caller ignores the failure return, and
20798 * execution continues */
20799 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20802 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20805 return TRUE; /* Was 'return exact' */
20810 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20812 /* Returns an inversion list of all the code points matched by the
20813 * ANYOFM/NANYOFM node 'n' */
20815 SV * cp_list = _new_invlist(-1);
20816 const U8 lowest = (U8) ARG(n);
20819 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20821 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20823 /* Starting with the lowest code point, any code point that ANDed with the
20824 * mask yields the lowest code point is in the set */
20825 for (i = lowest; i <= 0xFF; i++) {
20826 if ((i & FLAGS(n)) == ARG(n)) {
20827 cp_list = add_cp_to_invlist(cp_list, i);
20830 /* We know how many code points (a power of two) that are in the
20831 * set. No use looking once we've got that number */
20832 if (count >= needed) break;
20836 if (OP(n) == NANYOFM) {
20837 _invlist_invert(cp_list);
20843 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20848 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20853 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20855 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20856 if (flags & (1<<bit)) {
20857 if (!set++ && lead)
20858 Perl_re_printf( aTHX_ "%s", lead);
20859 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20864 Perl_re_printf( aTHX_ "\n");
20866 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20871 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20877 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20879 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20880 if (flags & (1<<bit)) {
20881 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20884 if (!set++ && lead)
20885 Perl_re_printf( aTHX_ "%s", lead);
20886 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20889 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20890 if (!set++ && lead) {
20891 Perl_re_printf( aTHX_ "%s", lead);
20894 case REGEX_UNICODE_CHARSET:
20895 Perl_re_printf( aTHX_ "UNICODE");
20897 case REGEX_LOCALE_CHARSET:
20898 Perl_re_printf( aTHX_ "LOCALE");
20900 case REGEX_ASCII_RESTRICTED_CHARSET:
20901 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20903 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20904 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20907 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20913 Perl_re_printf( aTHX_ "\n");
20915 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20921 Perl_regdump(pTHX_ const regexp *r)
20925 SV * const sv = sv_newmortal();
20926 SV *dsv= sv_newmortal();
20927 RXi_GET_DECL(r, ri);
20928 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20930 PERL_ARGS_ASSERT_REGDUMP;
20932 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20934 /* Header fields of interest. */
20935 for (i = 0; i < 2; i++) {
20936 if (r->substrs->data[i].substr) {
20937 RE_PV_QUOTED_DECL(s, 0, dsv,
20938 SvPVX_const(r->substrs->data[i].substr),
20939 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20940 PL_dump_re_max_len);
20941 Perl_re_printf( aTHX_
20942 "%s %s%s at %" IVdf "..%" UVuf " ",
20943 i ? "floating" : "anchored",
20945 RE_SV_TAIL(r->substrs->data[i].substr),
20946 (IV)r->substrs->data[i].min_offset,
20947 (UV)r->substrs->data[i].max_offset);
20949 else if (r->substrs->data[i].utf8_substr) {
20950 RE_PV_QUOTED_DECL(s, 1, dsv,
20951 SvPVX_const(r->substrs->data[i].utf8_substr),
20952 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20954 Perl_re_printf( aTHX_
20955 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20956 i ? "floating" : "anchored",
20958 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20959 (IV)r->substrs->data[i].min_offset,
20960 (UV)r->substrs->data[i].max_offset);
20964 if (r->check_substr || r->check_utf8)
20965 Perl_re_printf( aTHX_
20967 ( r->check_substr == r->substrs->data[1].substr
20968 && r->check_utf8 == r->substrs->data[1].utf8_substr
20969 ? "(checking floating" : "(checking anchored"));
20970 if (r->intflags & PREGf_NOSCAN)
20971 Perl_re_printf( aTHX_ " noscan");
20972 if (r->extflags & RXf_CHECK_ALL)
20973 Perl_re_printf( aTHX_ " isall");
20974 if (r->check_substr || r->check_utf8)
20975 Perl_re_printf( aTHX_ ") ");
20977 if (ri->regstclass) {
20978 regprop(r, sv, ri->regstclass, NULL, NULL);
20979 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20981 if (r->intflags & PREGf_ANCH) {
20982 Perl_re_printf( aTHX_ "anchored");
20983 if (r->intflags & PREGf_ANCH_MBOL)
20984 Perl_re_printf( aTHX_ "(MBOL)");
20985 if (r->intflags & PREGf_ANCH_SBOL)
20986 Perl_re_printf( aTHX_ "(SBOL)");
20987 if (r->intflags & PREGf_ANCH_GPOS)
20988 Perl_re_printf( aTHX_ "(GPOS)");
20989 Perl_re_printf( aTHX_ " ");
20991 if (r->intflags & PREGf_GPOS_SEEN)
20992 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
20993 if (r->intflags & PREGf_SKIP)
20994 Perl_re_printf( aTHX_ "plus ");
20995 if (r->intflags & PREGf_IMPLICIT)
20996 Perl_re_printf( aTHX_ "implicit ");
20997 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
20998 if (r->extflags & RXf_EVAL_SEEN)
20999 Perl_re_printf( aTHX_ "with eval ");
21000 Perl_re_printf( aTHX_ "\n");
21002 regdump_extflags("r->extflags: ", r->extflags);
21003 regdump_intflags("r->intflags: ", r->intflags);
21006 PERL_ARGS_ASSERT_REGDUMP;
21007 PERL_UNUSED_CONTEXT;
21008 PERL_UNUSED_ARG(r);
21009 #endif /* DEBUGGING */
21012 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21015 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
21016 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
21017 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
21018 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
21019 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
21020 || _CC_VERTSPACE != 15
21021 # error Need to adjust order of anyofs[]
21023 static const char * const anyofs[] = {
21060 - regprop - printable representation of opcode, with run time support
21064 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21068 RXi_GET_DECL(prog, progi);
21069 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21071 PERL_ARGS_ASSERT_REGPROP;
21075 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
21076 if (pRExC_state) { /* This gives more info, if we have it */
21077 FAIL3("panic: corrupted regexp opcode %d > %d",
21078 (int)OP(o), (int)REGNODE_MAX);
21081 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21082 (int)OP(o), (int)REGNODE_MAX);
21085 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21087 k = PL_regkind[OP(o)];
21090 sv_catpvs(sv, " ");
21091 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21092 * is a crude hack but it may be the best for now since
21093 * we have no flag "this EXACTish node was UTF-8"
21095 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21096 PL_colors[0], PL_colors[1],
21097 PERL_PV_ESCAPE_UNI_DETECT |
21098 PERL_PV_ESCAPE_NONASCII |
21099 PERL_PV_PRETTY_ELLIPSES |
21100 PERL_PV_PRETTY_LTGT |
21101 PERL_PV_PRETTY_NOCLEAR
21103 } else if (k == TRIE) {
21104 /* print the details of the trie in dumpuntil instead, as
21105 * progi->data isn't available here */
21106 const char op = OP(o);
21107 const U32 n = ARG(o);
21108 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21109 (reg_ac_data *)progi->data->data[n] :
21111 const reg_trie_data * const trie
21112 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21114 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21115 DEBUG_TRIE_COMPILE_r({
21117 sv_catpvs(sv, "(JUMP)");
21118 Perl_sv_catpvf(aTHX_ sv,
21119 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21120 (UV)trie->startstate,
21121 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21122 (UV)trie->wordcount,
21125 (UV)TRIE_CHARCOUNT(trie),
21126 (UV)trie->uniquecharcount
21129 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21130 sv_catpvs(sv, "[");
21131 (void) put_charclass_bitmap_innards(sv,
21132 ((IS_ANYOF_TRIE(op))
21134 : TRIE_BITMAP(trie)),
21141 sv_catpvs(sv, "]");
21143 } else if (k == CURLY) {
21144 U32 lo = ARG1(o), hi = ARG2(o);
21145 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21146 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21147 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21148 if (hi == REG_INFTY)
21149 sv_catpvs(sv, "INFTY");
21151 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21152 sv_catpvs(sv, "}");
21154 else if (k == WHILEM && o->flags) /* Ordinal/of */
21155 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21156 else if (k == REF || k == OPEN || k == CLOSE
21157 || k == GROUPP || OP(o)==ACCEPT)
21159 AV *name_list= NULL;
21160 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21161 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21162 if ( RXp_PAREN_NAMES(prog) ) {
21163 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21164 } else if ( pRExC_state ) {
21165 name_list= RExC_paren_name_list;
21168 if ( k != REF || (OP(o) < REFN)) {
21169 SV **name= av_fetch(name_list, parno, 0 );
21171 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21174 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21175 I32 *nums=(I32*)SvPVX(sv_dat);
21176 SV **name= av_fetch(name_list, nums[0], 0 );
21179 for ( n=0; n<SvIVX(sv_dat); n++ ) {
21180 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21181 (n ? "," : ""), (IV)nums[n]);
21183 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21187 if ( k == REF && reginfo) {
21188 U32 n = ARG(o); /* which paren pair */
21189 I32 ln = prog->offs[n].start;
21190 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21191 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21192 else if (ln == prog->offs[n].end)
21193 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21195 const char *s = reginfo->strbeg + ln;
21196 Perl_sv_catpvf(aTHX_ sv, ": ");
21197 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21198 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21201 } else if (k == GOSUB) {
21202 AV *name_list= NULL;
21203 if ( RXp_PAREN_NAMES(prog) ) {
21204 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21205 } else if ( pRExC_state ) {
21206 name_list= RExC_paren_name_list;
21209 /* Paren and offset */
21210 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21211 (int)((o + (int)ARG2L(o)) - progi->program) );
21213 SV **name= av_fetch(name_list, ARG(o), 0 );
21215 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21218 else if (k == LOGICAL)
21219 /* 2: embedded, otherwise 1 */
21220 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21221 else if (k == ANYOF || k == ANYOFR) {
21225 bool do_sep = FALSE; /* Do we need to separate various components of
21227 /* Set if there is still an unresolved user-defined property */
21228 SV *unresolved = NULL;
21230 /* Things that are ignored except when the runtime locale is UTF-8 */
21231 SV *only_utf8_locale_invlist = NULL;
21233 /* Code points that don't fit in the bitmap */
21234 SV *nonbitmap_invlist = NULL;
21236 /* And things that aren't in the bitmap, but are small enough to be */
21237 SV* bitmap_range_not_in_bitmap = NULL;
21241 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21247 flags = ANYOF_FLAGS(o);
21248 bitmap = ANYOF_BITMAP(o);
21252 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21253 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21254 sv_catpvs(sv, "{utf8-locale-reqd}");
21256 if (flags & ANYOFL_FOLD) {
21257 sv_catpvs(sv, "{i}");
21261 inverted = flags & ANYOF_INVERT;
21263 /* If there is stuff outside the bitmap, get it */
21264 if (arg != ANYOF_ONLY_HAS_BITMAP) {
21265 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21266 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21268 ANYOFRbase(o) + ANYOFRdelta(o));
21271 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21272 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21274 &only_utf8_locale_invlist,
21275 &nonbitmap_invlist);
21277 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21279 &only_utf8_locale_invlist,
21280 &nonbitmap_invlist);
21284 /* The non-bitmap data may contain stuff that could fit in the
21285 * bitmap. This could come from a user-defined property being
21286 * finally resolved when this call was done; or much more likely
21287 * because there are matches that require UTF-8 to be valid, and so
21288 * aren't in the bitmap (or ANYOFR). This is teased apart later */
21289 _invlist_intersection(nonbitmap_invlist,
21291 &bitmap_range_not_in_bitmap);
21292 /* Leave just the things that don't fit into the bitmap */
21293 _invlist_subtract(nonbitmap_invlist,
21295 &nonbitmap_invlist);
21298 /* Obey this flag to add all above-the-bitmap code points */
21299 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21300 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21301 NUM_ANYOF_CODE_POINTS,
21305 /* Ready to start outputting. First, the initial left bracket */
21306 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21308 /* ANYOFH by definition doesn't have anything that will fit inside the
21309 * bitmap; ANYOFR may or may not. */
21310 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21311 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21312 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21314 /* Then all the things that could fit in the bitmap */
21315 do_sep = put_charclass_bitmap_innards(sv,
21317 bitmap_range_not_in_bitmap,
21318 only_utf8_locale_invlist,
21322 /* Can't try inverting for a
21323 * better display if there
21324 * are things that haven't
21327 || inRANGE(OP(o), ANYOFR, ANYOFRb));
21328 SvREFCNT_dec(bitmap_range_not_in_bitmap);
21330 /* If there are user-defined properties which haven't been defined
21331 * yet, output them. If the result is not to be inverted, it is
21332 * clearest to output them in a separate [] from the bitmap range
21333 * stuff. If the result is to be complemented, we have to show
21334 * everything in one [], as the inversion applies to the whole
21335 * thing. Use {braces} to separate them from anything in the
21336 * bitmap and anything above the bitmap. */
21339 if (! do_sep) { /* If didn't output anything in the bitmap
21341 sv_catpvs(sv, "^");
21343 sv_catpvs(sv, "{");
21346 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21349 sv_catsv(sv, unresolved);
21351 sv_catpvs(sv, "}");
21353 do_sep = ! inverted;
21357 /* And, finally, add the above-the-bitmap stuff */
21358 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21361 /* See if truncation size is overridden */
21362 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21363 ? PL_dump_re_max_len
21366 /* This is output in a separate [] */
21368 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21371 /* And, for easy of understanding, it is shown in the
21372 * uncomplemented form if possible. The one exception being if
21373 * there are unresolved items, where the inversion has to be
21374 * delayed until runtime */
21375 if (inverted && ! unresolved) {
21376 _invlist_invert(nonbitmap_invlist);
21377 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21380 contents = invlist_contents(nonbitmap_invlist,
21381 FALSE /* output suitable for catsv */
21384 /* If the output is shorter than the permissible maximum, just do it. */
21385 if (SvCUR(contents) <= dump_len) {
21386 sv_catsv(sv, contents);
21389 const char * contents_string = SvPVX(contents);
21390 STRLEN i = dump_len;
21392 /* Otherwise, start at the permissible max and work back to the
21393 * first break possibility */
21394 while (i > 0 && contents_string[i] != ' ') {
21397 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21398 find a legal break */
21402 sv_catpvn(sv, contents_string, i);
21403 sv_catpvs(sv, "...");
21406 SvREFCNT_dec_NN(contents);
21407 SvREFCNT_dec_NN(nonbitmap_invlist);
21410 /* And finally the matching, closing ']' */
21411 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21413 if (OP(o) == ANYOFHs) {
21414 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21416 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21417 U8 lowest = (OP(o) != ANYOFHr)
21419 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21420 U8 highest = (OP(o) == ANYOFHr)
21421 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21422 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21426 if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21429 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21430 if (lowest != highest) {
21431 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21433 Perl_sv_catpvf(aTHX_ sv, ")");
21437 SvREFCNT_dec(unresolved);
21439 else if (k == ANYOFM) {
21440 SV * cp_list = get_ANYOFM_contents(o);
21442 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21443 if (OP(o) == NANYOFM) {
21444 _invlist_invert(cp_list);
21447 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21448 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21450 SvREFCNT_dec(cp_list);
21452 else if (k == POSIXD || k == NPOSIXD) {
21453 U8 index = FLAGS(o) * 2;
21454 if (index < C_ARRAY_LENGTH(anyofs)) {
21455 if (*anyofs[index] != '[') {
21456 sv_catpvs(sv, "[");
21458 sv_catpv(sv, anyofs[index]);
21459 if (*anyofs[index] != '[') {
21460 sv_catpvs(sv, "]");
21464 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21467 else if (k == BOUND || k == NBOUND) {
21468 /* Must be synced with order of 'bound_type' in regcomp.h */
21469 const char * const bounds[] = {
21470 "", /* Traditional */
21476 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21477 sv_catpv(sv, bounds[FLAGS(o)]);
21479 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21480 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21482 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21484 Perl_sv_catpvf(aTHX_ sv, "]");
21486 else if (OP(o) == SBOL)
21487 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21489 /* add on the verb argument if there is one */
21490 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21492 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21493 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21495 sv_catpvs(sv, ":NULL");
21498 PERL_UNUSED_CONTEXT;
21499 PERL_UNUSED_ARG(sv);
21500 PERL_UNUSED_ARG(o);
21501 PERL_UNUSED_ARG(prog);
21502 PERL_UNUSED_ARG(reginfo);
21503 PERL_UNUSED_ARG(pRExC_state);
21504 #endif /* DEBUGGING */
21510 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21511 { /* Assume that RE_INTUIT is set */
21512 /* Returns an SV containing a string that must appear in the target for it
21513 * to match, or NULL if nothing is known that must match.
21515 * CAUTION: the SV can be freed during execution of the regex engine */
21517 struct regexp *const prog = ReANY(r);
21518 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21520 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21521 PERL_UNUSED_CONTEXT;
21525 if (prog->maxlen > 0) {
21526 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21527 ? prog->check_utf8 : prog->check_substr);
21529 if (!PL_colorset) reginitcolors();
21530 Perl_re_printf( aTHX_
21531 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21533 RX_UTF8(r) ? "utf8 " : "",
21534 PL_colors[5], PL_colors[0],
21537 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21541 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21542 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21548 handles refcounting and freeing the perl core regexp structure. When
21549 it is necessary to actually free the structure the first thing it
21550 does is call the 'free' method of the regexp_engine associated to
21551 the regexp, allowing the handling of the void *pprivate; member
21552 first. (This routine is not overridable by extensions, which is why
21553 the extensions free is called first.)
21555 See regdupe and regdupe_internal if you change anything here.
21557 #ifndef PERL_IN_XSUB_RE
21559 Perl_pregfree(pTHX_ REGEXP *r)
21565 Perl_pregfree2(pTHX_ REGEXP *rx)
21567 struct regexp *const r = ReANY(rx);
21568 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21570 PERL_ARGS_ASSERT_PREGFREE2;
21575 if (r->mother_re) {
21576 ReREFCNT_dec(r->mother_re);
21578 CALLREGFREE_PVT(rx); /* free the private data */
21579 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21583 for (i = 0; i < 2; i++) {
21584 SvREFCNT_dec(r->substrs->data[i].substr);
21585 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21587 Safefree(r->substrs);
21589 RX_MATCH_COPY_FREE(rx);
21590 #ifdef PERL_ANY_COW
21591 SvREFCNT_dec(r->saved_copy);
21594 SvREFCNT_dec(r->qr_anoncv);
21595 if (r->recurse_locinput)
21596 Safefree(r->recurse_locinput);
21602 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21603 except that dsv will be created if NULL.
21605 This function is used in two main ways. First to implement
21606 $r = qr/....; $s = $$r;
21608 Secondly, it is used as a hacky workaround to the structural issue of
21610 being stored in the regexp structure which is in turn stored in
21611 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21612 could be PL_curpm in multiple contexts, and could require multiple
21613 result sets being associated with the pattern simultaneously, such
21614 as when doing a recursive match with (??{$qr})
21616 The solution is to make a lightweight copy of the regexp structure
21617 when a qr// is returned from the code executed by (??{$qr}) this
21618 lightweight copy doesn't actually own any of its data except for
21619 the starp/end and the actual regexp structure itself.
21625 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21627 struct regexp *drx;
21628 struct regexp *const srx = ReANY(ssv);
21629 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21631 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21634 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21636 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21638 /* our only valid caller, sv_setsv_flags(), should have done
21639 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21640 assert(!SvOOK(dsv));
21641 assert(!SvIsCOW(dsv));
21642 assert(!SvROK(dsv));
21644 if (SvPVX_const(dsv)) {
21646 Safefree(SvPVX(dsv));
21651 SvOK_off((SV *)dsv);
21654 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21655 * the LV's xpvlenu_rx will point to a regexp body, which
21656 * we allocate here */
21657 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21658 assert(!SvPVX(dsv));
21659 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21660 temp->sv_any = NULL;
21661 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21662 SvREFCNT_dec_NN(temp);
21663 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21664 ing below will not set it. */
21665 SvCUR_set(dsv, SvCUR(ssv));
21668 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21669 sv_force_normal(sv) is called. */
21673 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21674 SvPV_set(dsv, RX_WRAPPED(ssv));
21675 /* We share the same string buffer as the original regexp, on which we
21676 hold a reference count, incremented when mother_re is set below.
21677 The string pointer is copied here, being part of the regexp struct.
21679 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21680 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21684 const I32 npar = srx->nparens+1;
21685 Newx(drx->offs, npar, regexp_paren_pair);
21686 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21688 if (srx->substrs) {
21690 Newx(drx->substrs, 1, struct reg_substr_data);
21691 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21693 for (i = 0; i < 2; i++) {
21694 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21695 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21698 /* check_substr and check_utf8, if non-NULL, point to either their
21699 anchored or float namesakes, and don't hold a second reference. */
21701 RX_MATCH_COPIED_off(dsv);
21702 #ifdef PERL_ANY_COW
21703 drx->saved_copy = NULL;
21705 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21706 SvREFCNT_inc_void(drx->qr_anoncv);
21707 if (srx->recurse_locinput)
21708 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21715 /* regfree_internal()
21717 Free the private data in a regexp. This is overloadable by
21718 extensions. Perl takes care of the regexp structure in pregfree(),
21719 this covers the *pprivate pointer which technically perl doesn't
21720 know about, however of course we have to handle the
21721 regexp_internal structure when no extension is in use.
21723 Note this is called before freeing anything in the regexp
21728 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21730 struct regexp *const r = ReANY(rx);
21731 RXi_GET_DECL(r, ri);
21732 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21734 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21744 SV *dsv= sv_newmortal();
21745 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21746 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21747 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21748 PL_colors[4], PL_colors[5], s);
21752 #ifdef RE_TRACK_PATTERN_OFFSETS
21754 Safefree(ri->u.offsets); /* 20010421 MJD */
21756 if (ri->code_blocks)
21757 S_free_codeblocks(aTHX_ ri->code_blocks);
21760 int n = ri->data->count;
21763 /* If you add a ->what type here, update the comment in regcomp.h */
21764 switch (ri->data->what[n]) {
21770 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21773 Safefree(ri->data->data[n]);
21779 { /* Aho Corasick add-on structure for a trie node.
21780 Used in stclass optimization only */
21782 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21783 #ifdef USE_ITHREADS
21786 refcount = --aho->refcount;
21789 PerlMemShared_free(aho->states);
21790 PerlMemShared_free(aho->fail);
21791 /* do this last!!!! */
21792 PerlMemShared_free(ri->data->data[n]);
21793 /* we should only ever get called once, so
21794 * assert as much, and also guard the free
21795 * which /might/ happen twice. At the least
21796 * it will make code anlyzers happy and it
21797 * doesn't cost much. - Yves */
21798 assert(ri->regstclass);
21799 if (ri->regstclass) {
21800 PerlMemShared_free(ri->regstclass);
21801 ri->regstclass = 0;
21808 /* trie structure. */
21810 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21811 #ifdef USE_ITHREADS
21814 refcount = --trie->refcount;
21817 PerlMemShared_free(trie->charmap);
21818 PerlMemShared_free(trie->states);
21819 PerlMemShared_free(trie->trans);
21821 PerlMemShared_free(trie->bitmap);
21823 PerlMemShared_free(trie->jump);
21824 PerlMemShared_free(trie->wordinfo);
21825 /* do this last!!!! */
21826 PerlMemShared_free(ri->data->data[n]);
21831 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21832 ri->data->what[n]);
21835 Safefree(ri->data->what);
21836 Safefree(ri->data);
21842 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21843 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21844 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21847 =for apidoc_section REGEXP Functions
21848 =for apidoc re_dup_guts
21849 Duplicate a regexp.
21851 This routine is expected to clone a given regexp structure. It is only
21852 compiled under USE_ITHREADS.
21854 After all of the core data stored in struct regexp is duplicated
21855 the regexp_engine.dupe method is used to copy any private data
21856 stored in the *pprivate pointer. This allows extensions to handle
21857 any duplication they need to do.
21861 See pregfree() and regfree_internal() if you change anything here.
21863 #if defined(USE_ITHREADS)
21864 #ifndef PERL_IN_XSUB_RE
21866 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21869 const struct regexp *r = ReANY(sstr);
21870 struct regexp *ret = ReANY(dstr);
21872 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21874 npar = r->nparens+1;
21875 Newx(ret->offs, npar, regexp_paren_pair);
21876 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21878 if (ret->substrs) {
21879 /* Do it this way to avoid reading from *r after the StructCopy().
21880 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21881 cache, it doesn't matter. */
21883 const bool anchored = r->check_substr
21884 ? r->check_substr == r->substrs->data[0].substr
21885 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21886 Newx(ret->substrs, 1, struct reg_substr_data);
21887 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21889 for (i = 0; i < 2; i++) {
21890 ret->substrs->data[i].substr =
21891 sv_dup_inc(ret->substrs->data[i].substr, param);
21892 ret->substrs->data[i].utf8_substr =
21893 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21896 /* check_substr and check_utf8, if non-NULL, point to either their
21897 anchored or float namesakes, and don't hold a second reference. */
21899 if (ret->check_substr) {
21901 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21903 ret->check_substr = ret->substrs->data[0].substr;
21904 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21906 assert(r->check_substr == r->substrs->data[1].substr);
21907 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21909 ret->check_substr = ret->substrs->data[1].substr;
21910 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21912 } else if (ret->check_utf8) {
21914 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21916 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21921 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21922 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21923 if (r->recurse_locinput)
21924 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21927 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21929 if (RX_MATCH_COPIED(dstr))
21930 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21932 ret->subbeg = NULL;
21933 #ifdef PERL_ANY_COW
21934 ret->saved_copy = NULL;
21937 /* Whether mother_re be set or no, we need to copy the string. We
21938 cannot refrain from copying it when the storage points directly to
21939 our mother regexp, because that's
21940 1: a buffer in a different thread
21941 2: something we no longer hold a reference on
21942 so we need to copy it locally. */
21943 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21944 /* set malloced length to a non-zero value so it will be freed
21945 * (otherwise in combination with SVf_FAKE it looks like an alien
21946 * buffer). It doesn't have to be the actual malloced size, since it
21947 * should never be grown */
21948 SvLEN_set(dstr, SvCUR(sstr)+1);
21949 ret->mother_re = NULL;
21951 #endif /* PERL_IN_XSUB_RE */
21956 This is the internal complement to regdupe() which is used to copy
21957 the structure pointed to by the *pprivate pointer in the regexp.
21958 This is the core version of the extension overridable cloning hook.
21959 The regexp structure being duplicated will be copied by perl prior
21960 to this and will be provided as the regexp *r argument, however
21961 with the /old/ structures pprivate pointer value. Thus this routine
21962 may override any copying normally done by perl.
21964 It returns a pointer to the new regexp_internal structure.
21968 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21970 struct regexp *const r = ReANY(rx);
21971 regexp_internal *reti;
21973 RXi_GET_DECL(r, ri);
21975 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21979 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21980 char, regexp_internal);
21981 Copy(ri->program, reti->program, len+1, regnode);
21984 if (ri->code_blocks) {
21986 Newx(reti->code_blocks, 1, struct reg_code_blocks);
21987 Newx(reti->code_blocks->cb, ri->code_blocks->count,
21988 struct reg_code_block);
21989 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21990 ri->code_blocks->count, struct reg_code_block);
21991 for (n = 0; n < ri->code_blocks->count; n++)
21992 reti->code_blocks->cb[n].src_regex = (REGEXP*)
21993 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21994 reti->code_blocks->count = ri->code_blocks->count;
21995 reti->code_blocks->refcnt = 1;
21998 reti->code_blocks = NULL;
22000 reti->regstclass = NULL;
22003 struct reg_data *d;
22004 const int count = ri->data->count;
22007 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22008 char, struct reg_data);
22009 Newx(d->what, count, U8);
22012 for (i = 0; i < count; i++) {
22013 d->what[i] = ri->data->what[i];
22014 switch (d->what[i]) {
22015 /* see also regcomp.h and regfree_internal() */
22016 case 'a': /* actually an AV, but the dup function is identical.
22017 values seem to be "plain sv's" generally. */
22018 case 'r': /* a compiled regex (but still just another SV) */
22019 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22020 this use case should go away, the code could have used
22021 'a' instead - see S_set_ANYOF_arg() for array contents. */
22022 case 'S': /* actually an SV, but the dup function is identical. */
22023 case 'u': /* actually an HV, but the dup function is identical.
22024 values are "plain sv's" */
22025 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22028 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22029 * patterns which could start with several different things. Pre-TRIE
22030 * this was more important than it is now, however this still helps
22031 * in some places, for instance /x?a+/ might produce a SSC equivalent
22032 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22035 /* This is cheating. */
22036 Newx(d->data[i], 1, regnode_ssc);
22037 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22038 reti->regstclass = (regnode*)d->data[i];
22041 /* AHO-CORASICK fail table */
22042 /* Trie stclasses are readonly and can thus be shared
22043 * without duplication. We free the stclass in pregfree
22044 * when the corresponding reg_ac_data struct is freed.
22046 reti->regstclass= ri->regstclass;
22049 /* TRIE transition table */
22051 ((reg_trie_data*)ri->data->data[i])->refcount++;
22054 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22055 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22056 is not from another regexp */
22057 d->data[i] = ri->data->data[i];
22060 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22061 ri->data->what[i]);
22070 reti->name_list_idx = ri->name_list_idx;
22072 #ifdef RE_TRACK_PATTERN_OFFSETS
22073 if (ri->u.offsets) {
22074 Newx(reti->u.offsets, 2*len+1, U32);
22075 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22078 SetProgLen(reti, len);
22081 return (void*)reti;
22084 #endif /* USE_ITHREADS */
22086 #ifndef PERL_IN_XSUB_RE
22089 - regnext - dig the "next" pointer out of a node
22092 Perl_regnext(pTHX_ regnode *p)
22099 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
22100 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22101 (int)OP(p), (int)REGNODE_MAX);
22104 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22114 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22117 STRLEN len = strlen(pat);
22120 const char *message;
22122 PERL_ARGS_ASSERT_RE_CROAK;
22126 Copy(pat, buf, len , char);
22128 buf[len + 1] = '\0';
22129 va_start(args, pat);
22130 msv = vmess(buf, &args);
22132 message = SvPV_const(msv, len);
22135 Copy(message, buf, len , char);
22136 /* len-1 to avoid \n */
22137 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22140 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22142 #ifndef PERL_IN_XSUB_RE
22144 Perl_save_re_context(pTHX)
22149 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22152 const REGEXP * const rx = PM_GETRE(PL_curpm);
22154 nparens = RX_NPARENS(rx);
22157 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22158 * that PL_curpm will be null, but that utf8.pm and the modules it
22159 * loads will only use $1..$3.
22160 * The t/porting/re_context.t test file checks this assumption.
22165 for (i = 1; i <= nparens; i++) {
22166 char digits[TYPE_CHARS(long)];
22167 const STRLEN len = my_snprintf(digits, sizeof(digits),
22169 GV *const *const gvp
22170 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22173 GV * const gv = *gvp;
22174 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22184 S_put_code_point(pTHX_ SV *sv, UV c)
22186 PERL_ARGS_ASSERT_PUT_CODE_POINT;
22189 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22191 else if (isPRINT(c)) {
22192 const char string = (char) c;
22194 /* We use {phrase} as metanotation in the class, so also escape literal
22196 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22197 sv_catpvs(sv, "\\");
22198 sv_catpvn(sv, &string, 1);
22200 else if (isMNEMONIC_CNTRL(c)) {
22201 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22204 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22208 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22211 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22213 /* Appends to 'sv' a displayable version of the range of code points from
22214 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
22215 * that have them, when they occur at the beginning or end of the range.
22216 * It uses hex to output the remaining code points, unless 'allow_literals'
22217 * is true, in which case the printable ASCII ones are output as-is (though
22218 * some of these will be escaped by put_code_point()).
22220 * NOTE: This is designed only for printing ranges of code points that fit
22221 * inside an ANYOF bitmap. Higher code points are simply suppressed
22224 const unsigned int min_range_count = 3;
22226 assert(start <= end);
22228 PERL_ARGS_ASSERT_PUT_RANGE;
22230 while (start <= end) {
22232 const char * format;
22234 if ( end - start < min_range_count
22235 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22237 /* Output a range of 1 or 2 chars individually, or longer ranges
22238 * when printable */
22239 for (; start <= end; start++) {
22240 put_code_point(sv, start);
22245 /* If permitted by the input options, and there is a possibility that
22246 * this range contains a printable literal, look to see if there is
22248 if (allow_literals && start <= MAX_PRINT_A) {
22250 /* If the character at the beginning of the range isn't an ASCII
22251 * printable, effectively split the range into two parts:
22252 * 1) the portion before the first such printable,
22254 * and output them separately. */
22255 if (! isPRINT_A(start)) {
22256 UV temp_end = start + 1;
22258 /* There is no point looking beyond the final possible
22259 * printable, in MAX_PRINT_A */
22260 UV max = MIN(end, MAX_PRINT_A);
22262 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22266 /* Here, temp_end points to one beyond the first printable if
22267 * found, or to one beyond 'max' if not. If none found, make
22268 * sure that we use the entire range */
22269 if (temp_end > MAX_PRINT_A) {
22270 temp_end = end + 1;
22273 /* Output the first part of the split range: the part that
22274 * doesn't have printables, with the parameter set to not look
22275 * for literals (otherwise we would infinitely recurse) */
22276 put_range(sv, start, temp_end - 1, FALSE);
22278 /* The 2nd part of the range (if any) starts here. */
22281 /* We do a continue, instead of dropping down, because even if
22282 * the 2nd part is non-empty, it could be so short that we want
22283 * to output it as individual characters, as tested for at the
22284 * top of this loop. */
22288 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
22289 * output a sub-range of just the digits or letters, then process
22290 * the remaining portion as usual. */
22291 if (isALPHANUMERIC_A(start)) {
22292 UV mask = (isDIGIT_A(start))
22297 UV temp_end = start + 1;
22299 /* Find the end of the sub-range that includes just the
22300 * characters in the same class as the first character in it */
22301 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22306 /* For short ranges, don't duplicate the code above to output
22307 * them; just call recursively */
22308 if (temp_end - start < min_range_count) {
22309 put_range(sv, start, temp_end, FALSE);
22311 else { /* Output as a range */
22312 put_code_point(sv, start);
22313 sv_catpvs(sv, "-");
22314 put_code_point(sv, temp_end);
22316 start = temp_end + 1;
22320 /* We output any other printables as individual characters */
22321 if (isPUNCT_A(start) || isSPACE_A(start)) {
22322 while (start <= end && (isPUNCT_A(start)
22323 || isSPACE_A(start)))
22325 put_code_point(sv, start);
22330 } /* End of looking for literals */
22332 /* Here is not to output as a literal. Some control characters have
22333 * mnemonic names. Split off any of those at the beginning and end of
22334 * the range to print mnemonically. It isn't possible for many of
22335 * these to be in a row, so this won't overwhelm with output */
22337 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22339 while (isMNEMONIC_CNTRL(start) && start <= end) {
22340 put_code_point(sv, start);
22344 /* If this didn't take care of the whole range ... */
22345 if (start <= end) {
22347 /* Look backwards from the end to find the final non-mnemonic
22350 while (isMNEMONIC_CNTRL(temp_end)) {
22354 /* And separately output the interior range that doesn't start
22355 * or end with mnemonics */
22356 put_range(sv, start, temp_end, FALSE);
22358 /* Then output the mnemonic trailing controls */
22359 start = temp_end + 1;
22360 while (start <= end) {
22361 put_code_point(sv, start);
22368 /* As a final resort, output the range or subrange as hex. */
22370 if (start >= NUM_ANYOF_CODE_POINTS) {
22373 else { /* Have to split range at the bitmap boundary */
22374 this_end = (end < NUM_ANYOF_CODE_POINTS)
22376 : NUM_ANYOF_CODE_POINTS - 1;
22378 #if NUM_ANYOF_CODE_POINTS > 256
22379 format = (this_end < 256)
22380 ? "\\x%02" UVXf "-\\x%02" UVXf
22381 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22383 format = "\\x%02" UVXf "-\\x%02" UVXf;
22385 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22386 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22387 GCC_DIAG_RESTORE_STMT;
22393 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22395 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22399 bool allow_literals = TRUE;
22401 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22403 /* Generally, it is more readable if printable characters are output as
22404 * literals, but if a range (nearly) spans all of them, it's best to output
22405 * it as a single range. This code will use a single range if all but 2
22406 * ASCII printables are in it */
22407 invlist_iterinit(invlist);
22408 while (invlist_iternext(invlist, &start, &end)) {
22410 /* If the range starts beyond the final printable, it doesn't have any
22412 if (start > MAX_PRINT_A) {
22416 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22417 * all but two, the range must start and end no later than 2 from
22419 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22420 if (end > MAX_PRINT_A) {
22426 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22427 allow_literals = FALSE;
22432 invlist_iterfinish(invlist);
22434 /* Here we have figured things out. Output each range */
22435 invlist_iterinit(invlist);
22436 while (invlist_iternext(invlist, &start, &end)) {
22437 if (start >= NUM_ANYOF_CODE_POINTS) {
22440 put_range(sv, start, end, allow_literals);
22442 invlist_iterfinish(invlist);
22448 S_put_charclass_bitmap_innards_common(pTHX_
22449 SV* invlist, /* The bitmap */
22450 SV* posixes, /* Under /l, things like [:word:], \S */
22451 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22452 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22453 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22454 const bool invert /* Is the result to be inverted? */
22457 /* Create and return an SV containing a displayable version of the bitmap
22458 * and associated information determined by the input parameters. If the
22459 * output would have been only the inversion indicator '^', NULL is instead
22464 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22467 output = newSVpvs("^");
22470 output = newSVpvs("");
22473 /* First, the code points in the bitmap that are unconditionally there */
22474 put_charclass_bitmap_innards_invlist(output, invlist);
22476 /* Traditionally, these have been placed after the main code points */
22478 sv_catsv(output, posixes);
22481 if (only_utf8 && _invlist_len(only_utf8)) {
22482 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22483 put_charclass_bitmap_innards_invlist(output, only_utf8);
22486 if (not_utf8 && _invlist_len(not_utf8)) {
22487 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22488 put_charclass_bitmap_innards_invlist(output, not_utf8);
22491 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22492 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22493 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22495 /* This is the only list in this routine that can legally contain code
22496 * points outside the bitmap range. The call just above to
22497 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22498 * output them here. There's about a half-dozen possible, and none in
22499 * contiguous ranges longer than 2 */
22500 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22502 SV* above_bitmap = NULL;
22504 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22506 invlist_iterinit(above_bitmap);
22507 while (invlist_iternext(above_bitmap, &start, &end)) {
22510 for (i = start; i <= end; i++) {
22511 put_code_point(output, i);
22514 invlist_iterfinish(above_bitmap);
22515 SvREFCNT_dec_NN(above_bitmap);
22519 if (invert && SvCUR(output) == 1) {
22527 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22529 SV *nonbitmap_invlist,
22530 SV *only_utf8_locale_invlist,
22531 const regnode * const node,
22533 const bool force_as_is_display)
22535 /* Appends to 'sv' a displayable version of the innards of the bracketed
22536 * character class defined by the other arguments:
22537 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22538 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22539 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22540 * none. The reasons for this could be that they require some
22541 * condition such as the target string being or not being in UTF-8
22542 * (under /d), or because they came from a user-defined property that
22543 * was not resolved at the time of the regex compilation (under /u)
22544 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22545 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22546 * 'node' is the regex pattern ANYOF node. It is needed only when the
22547 * above two parameters are not null, and is passed so that this
22548 * routine can tease apart the various reasons for them.
22549 * 'flags' is the flags field of 'node'
22550 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22551 * to invert things to see if that leads to a cleaner display. If
22552 * FALSE, this routine is free to use its judgment about doing this.
22554 * It returns TRUE if there was actually something output. (It may be that
22555 * the bitmap, etc is empty.)
22557 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22558 * bitmap, with the succeeding parameters set to NULL, and the final one to
22562 /* In general, it tries to display the 'cleanest' representation of the
22563 * innards, choosing whether to display them inverted or not, regardless of
22564 * whether the class itself is to be inverted. However, there are some
22565 * cases where it can't try inverting, as what actually matches isn't known
22566 * until runtime, and hence the inversion isn't either. */
22568 bool inverting_allowed = ! force_as_is_display;
22571 STRLEN orig_sv_cur = SvCUR(sv);
22573 SV* invlist; /* Inversion list we accumulate of code points that
22574 are unconditionally matched */
22575 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22577 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22579 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22580 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22583 SV* as_is_display; /* The output string when we take the inputs
22585 SV* inverted_display; /* The output string when we invert the inputs */
22587 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22589 /* We are biased in favor of displaying things without them being inverted,
22590 * as that is generally easier to understand */
22591 const int bias = 5;
22593 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22595 /* Start off with whatever code points are passed in. (We clone, so we
22596 * don't change the caller's list) */
22597 if (nonbitmap_invlist) {
22598 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22599 invlist = invlist_clone(nonbitmap_invlist, NULL);
22601 else { /* Worst case size is every other code point is matched */
22602 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22606 if (OP(node) == ANYOFD) {
22608 /* This flag indicates that the code points below 0x100 in the
22609 * nonbitmap list are precisely the ones that match only when the
22610 * target is UTF-8 (they should all be non-ASCII). */
22611 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22613 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22614 _invlist_subtract(invlist, only_utf8, &invlist);
22617 /* And this flag for matching all non-ASCII 0xFF and below */
22618 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22620 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22623 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22625 /* If either of these flags are set, what matches isn't
22626 * determinable except during execution, so don't know enough here
22628 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22629 inverting_allowed = FALSE;
22632 /* What the posix classes match also varies at runtime, so these
22633 * will be output symbolically. */
22634 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22637 posixes = newSVpvs("");
22638 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22639 if (ANYOF_POSIXL_TEST(node, i)) {
22640 sv_catpv(posixes, anyofs[i]);
22647 /* Accumulate the bit map into the unconditional match list */
22649 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22650 if (BITMAP_TEST(bitmap, i)) {
22653 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22656 invlist = _add_range_to_invlist(invlist, start, i-1);
22661 /* Make sure that the conditional match lists don't have anything in them
22662 * that match unconditionally; otherwise the output is quite confusing.
22663 * This could happen if the code that populates these misses some
22666 _invlist_subtract(only_utf8, invlist, &only_utf8);
22669 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22672 if (only_utf8_locale_invlist) {
22674 /* Since this list is passed in, we have to make a copy before
22676 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22678 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22680 /* And, it can get really weird for us to try outputting an inverted
22681 * form of this list when it has things above the bitmap, so don't even
22683 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22684 inverting_allowed = FALSE;
22688 /* Calculate what the output would be if we take the input as-is */
22689 as_is_display = put_charclass_bitmap_innards_common(invlist,
22696 /* If have to take the output as-is, just do that */
22697 if (! inverting_allowed) {
22698 if (as_is_display) {
22699 sv_catsv(sv, as_is_display);
22700 SvREFCNT_dec_NN(as_is_display);
22703 else { /* But otherwise, create the output again on the inverted input, and
22704 use whichever version is shorter */
22706 int inverted_bias, as_is_bias;
22708 /* We will apply our bias to whichever of the results doesn't have
22718 inverted_bias = bias;
22721 /* Now invert each of the lists that contribute to the output,
22722 * excluding from the result things outside the possible range */
22724 /* For the unconditional inversion list, we have to add in all the
22725 * conditional code points, so that when inverted, they will be gone
22727 _invlist_union(only_utf8, invlist, &invlist);
22728 _invlist_union(not_utf8, invlist, &invlist);
22729 _invlist_union(only_utf8_locale, invlist, &invlist);
22730 _invlist_invert(invlist);
22731 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22734 _invlist_invert(only_utf8);
22735 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22737 else if (not_utf8) {
22739 /* If a code point matches iff the target string is not in UTF-8,
22740 * then complementing the result has it not match iff not in UTF-8,
22741 * which is the same thing as matching iff it is UTF-8. */
22742 only_utf8 = not_utf8;
22746 if (only_utf8_locale) {
22747 _invlist_invert(only_utf8_locale);
22748 _invlist_intersection(only_utf8_locale,
22750 &only_utf8_locale);
22753 inverted_display = put_charclass_bitmap_innards_common(
22758 only_utf8_locale, invert);
22760 /* Use the shortest representation, taking into account our bias
22761 * against showing it inverted */
22762 if ( inverted_display
22763 && ( ! as_is_display
22764 || ( SvCUR(inverted_display) + inverted_bias
22765 < SvCUR(as_is_display) + as_is_bias)))
22767 sv_catsv(sv, inverted_display);
22769 else if (as_is_display) {
22770 sv_catsv(sv, as_is_display);
22773 SvREFCNT_dec(as_is_display);
22774 SvREFCNT_dec(inverted_display);
22777 SvREFCNT_dec_NN(invlist);
22778 SvREFCNT_dec(only_utf8);
22779 SvREFCNT_dec(not_utf8);
22780 SvREFCNT_dec(posixes);
22781 SvREFCNT_dec(only_utf8_locale);
22783 return SvCUR(sv) > orig_sv_cur;
22786 #define CLEAR_OPTSTART \
22787 if (optstart) STMT_START { \
22788 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22789 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22793 #define DUMPUNTIL(b,e) \
22795 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22797 STATIC const regnode *
22798 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22799 const regnode *last, const regnode *plast,
22800 SV* sv, I32 indent, U32 depth)
22802 U8 op = PSEUDO; /* Arbitrary non-END op. */
22803 const regnode *next;
22804 const regnode *optstart= NULL;
22806 RXi_GET_DECL(r, ri);
22807 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22809 PERL_ARGS_ASSERT_DUMPUNTIL;
22811 #ifdef DEBUG_DUMPUNTIL
22812 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22813 last ? last-start : 0, plast ? plast-start : 0);
22816 if (plast && plast < last)
22819 while (PL_regkind[op] != END && (!last || node < last)) {
22821 /* While that wasn't END last time... */
22824 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22826 next = regnext((regnode *)node);
22829 if (OP(node) == OPTIMIZED) {
22830 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22837 regprop(r, sv, node, NULL, NULL);
22838 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22839 (int)(2*indent + 1), "", SvPVX_const(sv));
22841 if (OP(node) != OPTIMIZED) {
22842 if (next == NULL) /* Next ptr. */
22843 Perl_re_printf( aTHX_ " (0)");
22844 else if (PL_regkind[(U8)op] == BRANCH
22845 && PL_regkind[OP(next)] != BRANCH )
22846 Perl_re_printf( aTHX_ " (FAIL)");
22848 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22849 Perl_re_printf( aTHX_ "\n");
22853 if (PL_regkind[(U8)op] == BRANCHJ) {
22856 const regnode *nnode = (OP(next) == LONGJMP
22857 ? regnext((regnode *)next)
22859 if (last && nnode > last)
22861 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22864 else if (PL_regkind[(U8)op] == BRANCH) {
22866 DUMPUNTIL(NEXTOPER(node), next);
22868 else if ( PL_regkind[(U8)op] == TRIE ) {
22869 const regnode *this_trie = node;
22870 const char op = OP(node);
22871 const U32 n = ARG(node);
22872 const reg_ac_data * const ac = op>=AHOCORASICK ?
22873 (reg_ac_data *)ri->data->data[n] :
22875 const reg_trie_data * const trie =
22876 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22878 AV *const trie_words
22879 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22881 const regnode *nextbranch= NULL;
22884 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22885 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22887 Perl_re_indentf( aTHX_ "%s ",
22890 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22891 SvCUR(*elem_ptr), PL_dump_re_max_len,
22892 PL_colors[0], PL_colors[1],
22894 ? PERL_PV_ESCAPE_UNI
22896 | PERL_PV_PRETTY_ELLIPSES
22897 | PERL_PV_PRETTY_LTGT
22902 U16 dist= trie->jump[word_idx+1];
22903 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22904 (UV)((dist ? this_trie + dist : next) - start));
22907 nextbranch= this_trie + trie->jump[0];
22908 DUMPUNTIL(this_trie + dist, nextbranch);
22910 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22911 nextbranch= regnext((regnode *)nextbranch);
22913 Perl_re_printf( aTHX_ "\n");
22916 if (last && next > last)
22921 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22922 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22923 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22925 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22927 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22929 else if ( op == PLUS || op == STAR) {
22930 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22932 else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22933 /* Literal string, where present. */
22934 node += NODE_SZ_STR(node) - 1;
22935 node = NEXTOPER(node);
22938 node = NEXTOPER(node);
22939 node += regarglen[(U8)op];
22941 if (op == CURLYX || op == OPEN || op == SROPEN)
22945 #ifdef DEBUG_DUMPUNTIL
22946 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22951 #endif /* DEBUGGING */
22953 #ifndef PERL_IN_XSUB_RE
22955 # include "uni_keywords.h"
22958 Perl_init_uniprops(pTHX)
22962 char * dump_len_string;
22964 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22965 if ( ! dump_len_string
22966 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22968 PL_dump_re_max_len = 60; /* A reasonable default */
22972 PL_user_def_props = newHV();
22974 # ifdef USE_ITHREADS
22976 HvSHAREKEYS_off(PL_user_def_props);
22977 PL_user_def_props_aTHX = aTHX;
22981 /* Set up the inversion list interpreter-level variables */
22983 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22984 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22985 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22986 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22987 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22988 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22989 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22990 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22991 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22992 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22993 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22994 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22995 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22996 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22997 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22998 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23000 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23001 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23002 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23003 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23004 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23005 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23006 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23007 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23008 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23009 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23010 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23011 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23012 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23013 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23014 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23015 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23017 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23018 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23019 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23020 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23021 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23023 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23024 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23025 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23026 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23028 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23030 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23031 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23033 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23034 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23036 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23037 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23038 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23039 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23040 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23041 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23042 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23043 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23044 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23045 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23046 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23047 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23048 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23049 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23052 /* The below are used only by deprecated functions. They could be removed */
23053 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23054 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23055 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23059 /* These four functions are compiled only in regcomp.c, where they have access
23060 * to the data they return. They are a way for re_comp.c to get access to that
23061 * data without having to compile the whole data structures. */
23064 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23066 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23068 return match_uniprop((U8 *) key, key_len);
23072 Perl_get_prop_definition(pTHX_ const int table_index)
23074 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23076 /* Create and return the inversion list */
23077 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23080 const char * const *
23081 Perl_get_prop_values(const int table_index)
23083 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23085 return UNI_prop_value_ptrs[table_index];
23089 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23091 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23093 return deprecated_property_msgs[warning_offset];
23098 This code was mainly added for backcompat to give a warning for non-portable
23099 code points in user-defined properties. But experiments showed that the
23100 warning in earlier perls were only omitted on overflow, which should be an
23101 error, so there really isnt a backcompat issue, and actually adding the
23102 warning when none was present before might cause breakage, for little gain. So
23103 khw left this code in, but not enabled. Tests were never added.
23106 Ei |const char *|get_extended_utf8_msg|const UV cp
23108 PERL_STATIC_INLINE const char *
23109 S_get_extended_utf8_msg(pTHX_ const UV cp)
23111 U8 dummy[UTF8_MAXBYTES + 1];
23115 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23118 msg = hv_fetchs(msgs, "text", 0);
23121 (void) sv_2mortal((SV *) msgs);
23123 return SvPVX(*msg);
23127 #endif /* end of ! PERL_IN_XSUB_RE */
23130 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23131 const bool ignore_case)
23133 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23134 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23135 * because nothing outside of ASCII will match. Use /m because the input
23136 * string may be a bunch of lines strung together.
23138 * Also sets up the debugging info */
23140 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23142 SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23143 REGEXP * subpattern_re;
23144 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23146 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23151 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23153 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23154 rx_flags = flags & RXf_PMf_COMPILETIME;
23156 #ifndef PERL_IN_XSUB_RE
23157 /* Use the core engine if this file is regcomp.c. That means no
23158 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23159 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23160 &PL_core_reg_engine,
23164 if (isDEBUG_WILDCARD) {
23165 /* Use the special debugging engine if this file is re_comp.c and wants
23166 * to output the wildcard matching. This uses whatever
23167 * 'use re "Debug ..." is in effect */
23168 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23174 /* Use the special wildcard engine if this file is re_comp.c and
23175 * doesn't want to output the wildcard matching. This uses whatever
23176 * 'use re "Debug ..." is in effect for compilation, but this engine
23177 * structure has been set up so that it uses the core engine for
23178 * execution, so no execution debugging as a result of re.pm will be
23180 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23184 /* XXX The above has the effect that any user-supplied regex engine
23185 * won't be called for matching wildcards. That might be good, or bad.
23186 * It could be changed in several ways. The reason it is done the
23187 * current way is to avoid having to save and restore
23188 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
23189 * could be used. Another suggestion is to keep the authoritative
23190 * value of the debug flags in a thread-local variable and add set/get
23191 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23192 * Still another is to pass a flag, say in the engine's intflags that
23193 * would be checked each time before doing the debug output */
23197 assert(subpattern_re); /* Should have died if didn't compile successfully */
23198 return subpattern_re;
23202 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23203 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23206 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23208 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23212 /* The compilation has set things up so that if the program doesn't want to
23213 * see the wildcard matching procedure, it will get the core execution
23214 * engine, which is subject only to -Dr. So we have to turn that off
23215 * around this procedure */
23216 if (! isDEBUG_WILDCARD) {
23217 /* Note! Casts away 'volatile' */
23219 PL_debug &= ~ DEBUG_r_FLAG;
23222 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23230 S_handle_user_defined_property(pTHX_
23232 /* Parses the contents of a user-defined property definition; returning the
23233 * expanded definition if possible. If so, the return is an inversion
23236 * If there are subroutines that are part of the expansion and which aren't
23237 * known at the time of the call to this function, this returns what
23238 * parse_uniprop_string() returned for the first one encountered.
23240 * If an error was found, NULL is returned, and 'msg' gets a suitable
23241 * message appended to it. (Appending allows the back trace of how we got
23242 * to the faulty definition to be displayed through nested calls of
23243 * user-defined subs.)
23245 * The caller IS responsible for freeing any returned SV.
23247 * The syntax of the contents is pretty much described in perlunicode.pod,
23248 * but we also allow comments on each line */
23250 const char * name, /* Name of property */
23251 const STRLEN name_len, /* The name's length in bytes */
23252 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23253 const bool to_fold, /* ? Is this under /i */
23254 const bool runtime, /* ? Are we in compile- or run-time */
23255 const bool deferrable, /* Is it ok for this property's full definition
23256 to be deferred until later? */
23257 SV* contents, /* The property's definition */
23258 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
23259 getting called unless this is thought to be
23260 a user-defined property */
23261 SV * msg, /* Any error or warning msg(s) are appended to
23263 const STRLEN level) /* Recursion level of this call */
23266 const char * string = SvPV_const(contents, len);
23267 const char * const e = string + len;
23268 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23269 const STRLEN msgs_length_on_entry = SvCUR(msg);
23271 const char * s0 = string; /* Points to first byte in the current line
23272 being parsed in 'string' */
23273 const char overflow_msg[] = "Code point too large in \"";
23274 SV* running_definition = NULL;
23276 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23278 *user_defined_ptr = TRUE;
23280 /* Look at each line */
23282 const char * s; /* Current byte */
23283 char op = '+'; /* Default operation is 'union' */
23284 IV min = 0; /* range begin code point */
23285 IV max = -1; /* and range end */
23286 SV* this_definition;
23288 /* Skip comment lines */
23290 s0 = strchr(s0, '\n');
23298 /* For backcompat, allow an empty first line */
23304 /* First character in the line may optionally be the operation */
23313 /* If the line is one or two hex digits separated by blank space, its
23314 * a range; otherwise it is either another user-defined property or an
23319 if (! isXDIGIT(*s)) {
23320 goto check_if_property;
23323 do { /* Each new hex digit will add 4 bits. */
23324 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23325 s = strchr(s, '\n');
23329 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23330 sv_catpv(msg, overflow_msg);
23331 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23332 UTF8fARG(is_contents_utf8, s - s0, s0));
23333 sv_catpvs(msg, "\"");
23334 goto return_failure;
23337 /* Accumulate this digit into the value */
23338 min = (min << 4) + READ_XDIGIT(s);
23339 } while (isXDIGIT(*s));
23341 while (isBLANK(*s)) { s++; }
23343 /* We allow comments at the end of the line */
23345 s = strchr(s, '\n');
23351 else if (s < e && *s != '\n') {
23352 if (! isXDIGIT(*s)) {
23353 goto check_if_property;
23356 /* Look for the high point of the range */
23359 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23360 s = strchr(s, '\n');
23364 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23365 sv_catpv(msg, overflow_msg);
23366 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23367 UTF8fARG(is_contents_utf8, s - s0, s0));
23368 sv_catpvs(msg, "\"");
23369 goto return_failure;
23372 max = (max << 4) + READ_XDIGIT(s);
23373 } while (isXDIGIT(*s));
23375 while (isBLANK(*s)) { s++; }
23378 s = strchr(s, '\n');
23383 else if (s < e && *s != '\n') {
23384 goto check_if_property;
23388 if (max == -1) { /* The line only had one entry */
23391 else if (max < min) {
23392 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23393 sv_catpvs(msg, "Illegal range in \"");
23394 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23395 UTF8fARG(is_contents_utf8, s - s0, s0));
23396 sv_catpvs(msg, "\"");
23397 goto return_failure;
23400 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
23402 if ( UNICODE_IS_PERL_EXTENDED(min)
23403 || UNICODE_IS_PERL_EXTENDED(max))
23405 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23407 /* If both code points are non-portable, warn only on the lower
23409 sv_catpv(msg, get_extended_utf8_msg(
23410 (UNICODE_IS_PERL_EXTENDED(min))
23412 sv_catpvs(msg, " in \"");
23413 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23414 UTF8fARG(is_contents_utf8, s - s0, s0));
23415 sv_catpvs(msg, "\"");
23420 /* Here, this line contains a legal range */
23421 this_definition = sv_2mortal(_new_invlist(2));
23422 this_definition = _add_range_to_invlist(this_definition, min, max);
23427 /* Here it isn't a legal range line. See if it is a legal property
23428 * line. First find the end of the meat of the line */
23429 s = strpbrk(s, "#\n");
23434 /* Ignore trailing blanks in keeping with the requirements of
23435 * parse_uniprop_string() */
23437 while (s > s0 && isBLANK_A(*s)) {
23442 this_definition = parse_uniprop_string(s0, s - s0,
23443 is_utf8, to_fold, runtime,
23446 user_defined_ptr, msg,
23448 ? level /* Don't increase level
23449 if input is empty */
23452 if (this_definition == NULL) {
23453 goto return_failure; /* 'msg' should have had the reason
23454 appended to it by the above call */
23457 if (! is_invlist(this_definition)) { /* Unknown at this time */
23458 return newSVsv(this_definition);
23462 s = strchr(s, '\n');
23472 _invlist_union(running_definition, this_definition,
23473 &running_definition);
23476 _invlist_subtract(running_definition, this_definition,
23477 &running_definition);
23480 _invlist_intersection(running_definition, this_definition,
23481 &running_definition);
23484 _invlist_union_complement_2nd(running_definition,
23485 this_definition, &running_definition);
23488 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23489 __FILE__, __LINE__, op);
23493 /* Position past the '\n' */
23495 } /* End of loop through the lines of 'contents' */
23497 /* Here, we processed all the lines in 'contents' without error. If we
23498 * didn't add any warnings, simply return success */
23499 if (msgs_length_on_entry == SvCUR(msg)) {
23501 /* If the expansion was empty, the answer isn't nothing: its an empty
23502 * inversion list */
23503 if (running_definition == NULL) {
23504 running_definition = _new_invlist(1);
23507 return running_definition;
23510 /* Otherwise, add some explanatory text, but we will return success */
23514 running_definition = NULL;
23518 if (name_len > 0) {
23519 sv_catpvs(msg, " in expansion of ");
23520 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23523 return running_definition;
23526 /* As explained below, certain operations need to take place in the first
23527 * thread created. These macros switch contexts */
23528 # ifdef USE_ITHREADS
23529 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23530 PerlInterpreter * save_aTHX = aTHX;
23531 # define SWITCH_TO_GLOBAL_CONTEXT \
23532 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23533 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23534 # define CUR_CONTEXT aTHX
23535 # define ORIGINAL_CONTEXT save_aTHX
23537 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
23538 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23539 # define RESTORE_CONTEXT NOOP
23540 # define CUR_CONTEXT NULL
23541 # define ORIGINAL_CONTEXT NULL
23545 S_delete_recursion_entry(pTHX_ void *key)
23547 /* Deletes the entry used to detect recursion when expanding user-defined
23548 * properties. This is a function so it can be set up to be called even if
23549 * the program unexpectedly quits */
23551 SV ** current_entry;
23552 const STRLEN key_len = strlen((const char *) key);
23553 DECLARATION_FOR_GLOBAL_CONTEXT;
23555 SWITCH_TO_GLOBAL_CONTEXT;
23557 /* If the entry is one of these types, it is a permanent entry, and not the
23558 * one used to detect recursions. This function should delete only the
23559 * recursion entry */
23560 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23562 && ! is_invlist(*current_entry)
23563 && ! SvPOK(*current_entry))
23565 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23573 S_get_fq_name(pTHX_
23574 const char * const name, /* The first non-blank in the \p{}, \P{} */
23575 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23576 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23577 const bool has_colon_colon
23580 /* Returns a mortal SV containing the fully qualified version of the input
23585 fq_name = newSVpvs_flags("", SVs_TEMP);
23587 /* Use the current package if it wasn't included in our input */
23588 if (! has_colon_colon) {
23589 const HV * pkg = (IN_PERL_COMPILETIME)
23591 : CopSTASH(PL_curcop);
23592 const char* pkgname = HvNAME(pkg);
23594 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23595 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23596 sv_catpvs(fq_name, "::");
23599 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23600 UTF8fARG(is_utf8, name_len, name));
23605 S_parse_uniprop_string(pTHX_
23607 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23608 * now. If so, the return is an inversion list.
23610 * If the property is user-defined, it is a subroutine, which in turn
23611 * may call other subroutines. This function will call the whole nest of
23612 * them to get the definition they return; if some aren't known at the time
23613 * of the call to this function, the fully qualified name of the highest
23614 * level sub is returned. It is an error to call this function at runtime
23615 * without every sub defined.
23617 * If an error was found, NULL is returned, and 'msg' gets a suitable
23618 * message appended to it. (Appending allows the back trace of how we got
23619 * to the faulty definition to be displayed through nested calls of
23620 * user-defined subs.)
23622 * The caller should NOT try to free any returned inversion list.
23624 * Other parameters will be set on return as described below */
23626 const char * const name, /* The first non-blank in the \p{}, \P{} */
23627 Size_t name_len, /* Its length in bytes, not including any
23629 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23630 const bool to_fold, /* ? Is this under /i */
23631 const bool runtime, /* TRUE if this is being called at run time */
23632 const bool deferrable, /* TRUE if it's ok for the definition to not be
23633 known at this call */
23634 AV ** strings, /* To return string property values, like named
23636 bool *user_defined_ptr, /* Upon return from this function it will be
23637 set to TRUE if any component is a
23638 user-defined property */
23639 SV * msg, /* Any error or warning msg(s) are appended to
23641 const STRLEN level) /* Recursion level of this call */
23643 char* lookup_name; /* normalized name for lookup in our tables */
23644 unsigned lookup_len; /* Its length */
23645 enum { Not_Strict = 0, /* Some properties have stricter name */
23646 Strict, /* normalization rules, which we decide */
23647 As_Is /* upon based on parsing */
23648 } stricter = Not_Strict;
23650 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23651 * (though it requires extra effort to download them from Unicode and
23652 * compile perl to know about them) */
23653 bool is_nv_type = FALSE;
23655 unsigned int i, j = 0;
23656 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23657 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23658 int table_index = 0; /* The entry number for this property in the table
23659 of all Unicode property names */
23660 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23661 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23662 the normalized name in certain situations */
23663 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23664 part of a package name */
23665 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
23666 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23667 property rather than a Unicode
23669 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23670 if an error. If it is an inversion list,
23671 it is the definition. Otherwise it is a
23672 string containing the fully qualified sub
23674 SV * fq_name = NULL; /* For user-defined properties, the fully
23676 bool invert_return = FALSE; /* ? Do we need to complement the result before
23678 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23679 explicit utf8:: package that we strip
23681 /* The expansion of properties that could be either user-defined or
23682 * official unicode ones is deferred until runtime, including a marker for
23683 * those that might be in the latter category. This boolean indicates if
23684 * we've seen that marker. If not, what we're parsing can't be such an
23685 * official Unicode property whose expansion was deferred */
23686 bool could_be_deferred_official = FALSE;
23688 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23690 /* The input will be normalized into 'lookup_name' */
23691 Newx(lookup_name, name_len, char);
23692 SAVEFREEPV(lookup_name);
23694 /* Parse the input. */
23695 for (i = 0; i < name_len; i++) {
23696 char cur = name[i];
23698 /* Most of the characters in the input will be of this ilk, being parts
23700 if (isIDCONT_A(cur)) {
23702 /* Case differences are ignored. Our lookup routine assumes
23703 * everything is lowercase, so normalize to that */
23704 if (isUPPER_A(cur)) {
23705 lookup_name[j++] = toLOWER_A(cur);
23709 if (cur == '_') { /* Don't include these in the normalized name */
23713 lookup_name[j++] = cur;
23715 /* The first character in a user-defined name must be of this type.
23717 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23718 could_be_user_defined = FALSE;
23724 /* Here, the character is not something typically in a name, But these
23725 * two types of characters (and the '_' above) can be freely ignored in
23726 * most situations. Later it may turn out we shouldn't have ignored
23727 * them, and we have to reparse, but we don't have enough information
23728 * yet to make that decision */
23729 if (cur == '-' || isSPACE_A(cur)) {
23730 could_be_user_defined = FALSE;
23734 /* An equals sign or single colon mark the end of the first part of
23735 * the property name */
23737 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23739 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23740 equals_pos = j; /* Note where it occurred in the input */
23741 could_be_user_defined = FALSE;
23745 /* If this looks like it is a marker we inserted at compile time,
23746 * set a flag and otherwise ignore it. If it isn't in the final
23747 * position, keep it as it would have been user input. */
23748 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23750 && could_be_user_defined
23751 && i == name_len - 1)
23754 could_be_deferred_official = TRUE;
23758 /* Otherwise, this character is part of the name. */
23759 lookup_name[j++] = cur;
23761 /* Here it isn't a single colon, so if it is a colon, it must be a
23765 /* A double colon should be a package qualifier. We note its
23766 * position and continue. Note that one could have
23767 * pkg1::pkg2::...::foo
23768 * so that the position at the end of the loop will be just after
23769 * the final qualifier */
23772 non_pkg_begin = i + 1;
23773 lookup_name[j++] = ':';
23774 lun_non_pkg_begin = j;
23776 else { /* Only word chars (and '::') can be in a user-defined name */
23777 could_be_user_defined = FALSE;
23779 } /* End of parsing through the lhs of the property name (or all of it if
23782 # define STRLENs(s) (sizeof("" s "") - 1)
23784 /* If there is a single package name 'utf8::', it is ambiguous. It could
23785 * be for a user-defined property, or it could be a Unicode property, as
23786 * all of them are considered to be for that package. For the purposes of
23787 * parsing the rest of the property, strip it off */
23788 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23789 lookup_name += STRLENs("utf8::");
23790 j -= STRLENs("utf8::");
23791 equals_pos -= STRLENs("utf8::");
23792 stripped_utf8_pkg = TRUE;
23795 /* Here, we are either done with the whole property name, if it was simple;
23796 * or are positioned just after the '=' if it is compound. */
23798 if (equals_pos >= 0) {
23799 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23801 /* Space immediately after the '=' is ignored */
23803 for (; i < name_len; i++) {
23804 if (! isSPACE_A(name[i])) {
23809 /* Most punctuation after the equals indicates a subpattern, like
23811 if ( isPUNCT_A(name[i])
23816 /* A backslash means the real delimitter is the next character,
23817 * but it must be punctuation */
23818 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23820 bool special_property = memEQs(lookup_name, j - 1, "name")
23821 || memEQs(lookup_name, j - 1, "na");
23822 if (! special_property) {
23823 /* Find the property. The table includes the equals sign, so
23824 * we use 'j' as-is */
23825 table_index = do_uniprop_match(lookup_name, j);
23827 if (special_property || table_index) {
23828 REGEXP * subpattern_re;
23829 char open = name[i++];
23831 const char * pos_in_brackets;
23832 const char * const * prop_values;
23835 /* Backslash => delimitter is the character following. We
23836 * already checked that it is punctuation */
23837 if (open == '\\') {
23842 /* This data structure is constructed so that the matching
23843 * closing bracket is 3 past its matching opening. The second
23844 * set of closing is so that if the opening is something like
23845 * ']', the closing will be that as well. Something similar is
23846 * done in toke.c */
23847 pos_in_brackets = memCHRs("([<)]>)]>", open);
23848 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23851 || name[name_len-1] != close
23852 || (escaped && name[name_len-2] != '\\')
23853 /* Also make sure that there are enough characters.
23854 * e.g., '\\\' would show up incorrectly as legal even
23855 * though it is too short */
23856 || (SSize_t) (name_len - i - 1 - escaped) < 0)
23858 sv_catpvs(msg, "Unicode property wildcard not terminated");
23859 goto append_name_to_msg;
23862 Perl_ck_warner_d(aTHX_
23863 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23864 "The Unicode property wildcards feature is experimental");
23866 if (special_property) {
23867 const char * error_msg;
23868 const char * revised_name = name + i;
23869 Size_t revised_name_len = name_len - (i + 1 + escaped);
23871 /* Currently, the only 'special_property' is name, which we
23872 * lookup in _charnames.pm */
23874 if (! load_charnames(newSVpvs("placeholder"),
23875 revised_name, revised_name_len,
23878 sv_catpv(msg, error_msg);
23879 goto append_name_to_msg;
23882 /* Farm this out to a function just to make the current
23883 * function less unwieldy */
23884 if (handle_names_wildcard(revised_name, revised_name_len,
23888 return prop_definition;
23894 prop_values = get_prop_values(table_index);
23896 /* Now create and compile the wildcard subpattern. Use /i
23897 * because the property values are supposed to match with case
23899 subpattern_re = compile_wildcard(name + i,
23900 name_len - i - 1 - escaped,
23904 /* For each legal property value, see if the supplied pattern
23906 while (*prop_values) {
23907 const char * const entry = *prop_values;
23908 const Size_t len = strlen(entry);
23909 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23911 if (execute_wildcard(subpattern_re,
23913 (char *) entry + len,
23917 { /* Here, matched. Add to the returned list */
23918 Size_t total_len = j + len;
23919 SV * sub_invlist = NULL;
23920 char * this_string;
23922 /* We know this is a legal \p{property=value}. Call
23923 * the function to return the list of code points that
23925 Newxz(this_string, total_len + 1, char);
23926 Copy(lookup_name, this_string, j, char);
23927 my_strlcat(this_string, entry, total_len + 1);
23928 SAVEFREEPV(this_string);
23929 sub_invlist = parse_uniprop_string(this_string,
23939 _invlist_union(prop_definition, sub_invlist,
23943 prop_values++; /* Next iteration, look at next propvalue */
23944 } /* End of looking through property values; (the data
23945 structure is terminated by a NULL ptr) */
23947 SvREFCNT_dec_NN(subpattern_re);
23949 if (prop_definition) {
23950 return prop_definition;
23953 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23954 goto append_name_to_msg;
23957 /* Here's how khw thinks we should proceed to handle the properties
23958 * not yet done: Bidi Mirroring Glyph can map to ""
23959 Bidi Paired Bracket can map to ""
23960 Case Folding (both full and simple)
23961 Shouldn't /i be good enough for Full
23962 Decomposition Mapping
23963 Equivalent Unified Ideograph can map to ""
23964 Lowercase Mapping (both full and simple)
23965 NFKC Case Fold can map to ""
23966 Titlecase Mapping (both full and simple)
23967 Uppercase Mapping (both full and simple)
23968 * Handle these the same way Name is done, using say, _wild.pm, but
23969 * having both loose and full, like in charclass_invlists.h.
23970 * Perhaps move block and script to that as they are somewhat large
23971 * in charclass_invlists.h.
23972 * For properties where the default is the code point itself, such
23973 * as any of the case changing mappings, the string would otherwise
23974 * consist of all Unicode code points in UTF-8 strung together.
23975 * This would be impractical. So instead, examine their compiled
23976 * pattern, looking at the ssc. If none, reject the pattern as an
23977 * error. Otherwise run the pattern against every code point in
23978 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23979 * And it might be good to create an API to return the ssc.
23980 * Or handle them like the algorithmic names are done
23982 } /* End of is a wildcard subppattern */
23984 /* \p{name=...} is handled specially. Instead of using the normal
23985 * mechanism involving charclass_invlists.h, it uses _charnames.pm
23986 * which has the necessary (huge) data accessible to it, and which
23987 * doesn't get loaded unless necessary. The legal syntax for names is
23988 * somewhat different than other properties due both to the vagaries of
23989 * a few outlier official names, and the fact that only a few ASCII
23990 * characters are permitted in them */
23991 if ( memEQs(lookup_name, j - 1, "name")
23992 || memEQs(lookup_name, j - 1, "na"))
23997 const char * error_msg;
23999 SV * character_name;
24000 STRLEN character_len;
24005 /* Since the RHS (after skipping initial space) is passed unchanged
24006 * to charnames, and there are different criteria for what are
24007 * legal characters in the name, just parse it here. A character
24008 * name must begin with an ASCII alphabetic */
24009 if (! isALPHA(name[i])) {
24012 lookup_name[j++] = name[i];
24014 for (++i; i < name_len; i++) {
24015 /* Official names can only be in the ASCII range, and only
24016 * certain characters */
24017 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24020 lookup_name[j++] = name[i];
24023 /* Finished parsing, save the name into an SV */
24024 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24026 /* Make sure _charnames is loaded. (The parameters give context
24027 * for any errors generated */
24028 table = load_charnames(character_name, name, name_len, &error_msg);
24029 if (table == NULL) {
24030 sv_catpv(msg, error_msg);
24031 goto append_name_to_msg;
24034 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24035 if (! lookup_loose) {
24037 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24040 PUSHSTACKi(PERLSI_REGCOMP);
24046 XPUSHs(character_name);
24048 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24053 SvREFCNT_inc_simple_void_NN(character);
24060 if (! SvOK(character)) {
24064 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24065 if (character_len == SvCUR(character)) {
24066 prop_definition = add_cp_to_invlist(NULL, cp);
24071 /* First of the remaining characters in the string. */
24072 char * remaining = SvPVX(character) + character_len;
24074 if (strings == NULL) {
24075 goto failed; /* XXX Perhaps a specific msg instead, like
24076 'not available here' */
24079 if (*strings == NULL) {
24080 *strings = newAV();
24083 this_string = newAV();
24084 av_push(this_string, newSVuv(cp));
24087 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24088 av_push(this_string, newSVuv(cp));
24089 remaining += character_len;
24090 } while (remaining < SvEND(character));
24092 av_push(*strings, (SV *) this_string);
24095 return prop_definition;
24098 /* Certain properties whose values are numeric need special handling.
24099 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24100 * purposes of checking if this is one of those properties */
24101 if (memBEGINPs(lookup_name, j, "is")) {
24105 /* Then check if it is one of these specially-handled properties. The
24106 * possibilities are hard-coded because easier this way, and the list
24107 * is unlikely to change.
24109 * All numeric value type properties are of this ilk, and are also
24110 * special in a different way later on. So find those first. There
24111 * are several numeric value type properties in the Unihan DB (which is
24112 * unlikely to be compiled with perl, but we handle it here in case it
24113 * does get compiled). They all end with 'numeric'. The interiors
24114 * aren't checked for the precise property. This would stop working if
24115 * a cjk property were to be created that ended with 'numeric' and
24116 * wasn't a numeric type */
24117 is_nv_type = memEQs(lookup_name + lookup_offset,
24118 j - 1 - lookup_offset, "numericvalue")
24119 || memEQs(lookup_name + lookup_offset,
24120 j - 1 - lookup_offset, "nv")
24121 || ( memENDPs(lookup_name + lookup_offset,
24122 j - 1 - lookup_offset, "numeric")
24123 && ( memBEGINPs(lookup_name + lookup_offset,
24124 j - 1 - lookup_offset, "cjk")
24125 || memBEGINPs(lookup_name + lookup_offset,
24126 j - 1 - lookup_offset, "k")));
24128 || memEQs(lookup_name + lookup_offset,
24129 j - 1 - lookup_offset, "canonicalcombiningclass")
24130 || memEQs(lookup_name + lookup_offset,
24131 j - 1 - lookup_offset, "ccc")
24132 || memEQs(lookup_name + lookup_offset,
24133 j - 1 - lookup_offset, "age")
24134 || memEQs(lookup_name + lookup_offset,
24135 j - 1 - lookup_offset, "in")
24136 || memEQs(lookup_name + lookup_offset,
24137 j - 1 - lookup_offset, "presentin"))
24141 /* Since the stuff after the '=' is a number, we can't throw away
24142 * '-' willy-nilly, as those could be a minus sign. Other stricter
24143 * rules also apply. However, these properties all can have the
24144 * rhs not be a number, in which case they contain at least one
24145 * alphabetic. In those cases, the stricter rules don't apply.
24146 * But the numeric type properties can have the alphas [Ee] to
24147 * signify an exponent, and it is still a number with stricter
24148 * rules. So look for an alpha that signifies not-strict */
24150 for (k = i; k < name_len; k++) {
24151 if ( isALPHA_A(name[k])
24152 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24154 stricter = Not_Strict;
24162 /* A number may have a leading '+' or '-'. The latter is retained
24164 if (name[i] == '+') {
24167 else if (name[i] == '-') {
24168 lookup_name[j++] = '-';
24172 /* Skip leading zeros including single underscores separating the
24173 * zeros, or between the final leading zero and the first other
24175 for (; i < name_len - 1; i++) {
24176 if ( name[i] != '0'
24177 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24184 else { /* No '=' */
24186 /* Only a few properties without an '=' should be parsed with stricter
24187 * rules. The list is unlikely to change. */
24188 if ( memBEGINPs(lookup_name, j, "perl")
24189 && memNEs(lookup_name + 4, j - 4, "space")
24190 && memNEs(lookup_name + 4, j - 4, "word"))
24194 /* We set the inputs back to 0 and the code below will reparse,
24200 /* Here, we have either finished the property, or are positioned to parse
24201 * the remainder, and we know if stricter rules apply. Finish out, if not
24203 for (; i < name_len; i++) {
24204 char cur = name[i];
24206 /* In all instances, case differences are ignored, and we normalize to
24208 if (isUPPER_A(cur)) {
24209 lookup_name[j++] = toLOWER(cur);
24213 /* An underscore is skipped, but not under strict rules unless it
24214 * separates two digits */
24217 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
24218 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24220 lookup_name[j++] = '_';
24225 /* Hyphens are skipped except under strict */
24226 if (cur == '-' && ! stricter) {
24230 /* XXX Bug in documentation. It says white space skipped adjacent to
24231 * non-word char. Maybe we should, but shouldn't skip it next to a dot
24233 if (isSPACE_A(cur) && ! stricter) {
24237 lookup_name[j++] = cur;
24239 /* Unless this is a non-trailing slash, we are done with it */
24240 if (i >= name_len - 1 || cur != '/') {
24246 /* A slash in the 'numeric value' property indicates that what follows
24247 * is a denominator. It can have a leading '+' and '0's that should be
24248 * skipped. But we have never allowed a negative denominator, so treat
24249 * a minus like every other character. (No need to rule out a second
24250 * '/', as that won't match anything anyway */
24253 if (i < name_len && name[i] == '+') {
24257 /* Skip leading zeros including underscores separating digits */
24258 for (; i < name_len - 1; i++) {
24259 if ( name[i] != '0'
24260 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24266 /* Store the first real character in the denominator */
24267 if (i < name_len) {
24268 lookup_name[j++] = name[i];
24273 /* Here are completely done parsing the input 'name', and 'lookup_name'
24274 * contains a copy, normalized.
24276 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24277 * different from without the underscores. */
24278 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
24279 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24280 && UNLIKELY(name[name_len-1] == '_'))
24282 lookup_name[j++] = '&';
24285 /* If the original input began with 'In' or 'Is', it could be a subroutine
24286 * call to a user-defined property instead of a Unicode property name. */
24287 if ( name_len - non_pkg_begin > 2
24288 && name[non_pkg_begin+0] == 'I'
24289 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24291 /* Names that start with In have different characterstics than those
24292 * that start with Is */
24293 if (name[non_pkg_begin+1] == 's') {
24294 starts_with_Is = TRUE;
24298 could_be_user_defined = FALSE;
24301 if (could_be_user_defined) {
24304 /* If the user defined property returns the empty string, it could
24305 * easily be because the pattern is being compiled before the data it
24306 * actually needs to compile is available. This could be argued to be
24307 * a bug in the perl code, but this is a change of behavior for Perl,
24308 * so we handle it. This means that intentionally returning nothing
24309 * will not be resolved until runtime */
24310 bool empty_return = FALSE;
24312 /* Here, the name could be for a user defined property, which are
24313 * implemented as subs. */
24314 user_sub = get_cvn_flags(name, name_len, 0);
24317 /* Here, the property name could be a user-defined one, but there
24318 * is no subroutine to handle it (as of now). Defer handling it
24319 * until runtime. Otherwise, a block defined by Unicode in a later
24320 * release would get the synonym InFoo added for it, and existing
24321 * code that used that name would suddenly break if it referred to
24322 * the property before the sub was declared. See [perl #134146] */
24324 goto definition_deferred;
24327 /* Here, we are at runtime, and didn't find the user property. It
24328 * could be an official property, but only if no package was
24329 * specified, or just the utf8:: package. */
24330 if (could_be_deferred_official) {
24331 lookup_name += lun_non_pkg_begin;
24332 j -= lun_non_pkg_begin;
24334 else if (! stripped_utf8_pkg) {
24335 goto unknown_user_defined;
24338 /* Drop down to look up in the official properties */
24341 const char insecure[] = "Insecure user-defined property";
24343 /* Here, there is a sub by the correct name. Normally we call it
24344 * to get the property definition */
24346 SV * user_sub_sv = MUTABLE_SV(user_sub);
24347 SV * error; /* Any error returned by calling 'user_sub' */
24348 SV * key; /* The key into the hash of user defined sub names
24351 SV ** saved_user_prop_ptr; /* Hash entry for this property */
24353 /* How many times to retry when another thread is in the middle of
24354 * expanding the same definition we want */
24355 PERL_INT_FAST8_T retry_countdown = 10;
24357 DECLARATION_FOR_GLOBAL_CONTEXT;
24359 /* If we get here, we know this property is user-defined */
24360 *user_defined_ptr = TRUE;
24362 /* We refuse to call a potentially tainted subroutine; returning an
24365 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24366 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24367 goto append_name_to_msg;
24370 /* In principal, we only call each subroutine property definition
24371 * once during the life of the program. This guarantees that the
24372 * property definition never changes. The results of the single
24373 * sub call are stored in a hash, which is used instead for future
24374 * references to this property. The property definition is thus
24375 * immutable. But, to allow the user to have a /i-dependent
24376 * definition, we call the sub once for non-/i, and once for /i,
24377 * should the need arise, passing the /i status as a parameter.
24379 * We start by constructing the hash key name, consisting of the
24380 * fully qualified subroutine name, preceded by the /i status, so
24381 * that there is a key for /i and a different key for non-/i */
24382 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24383 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24384 non_pkg_begin != 0);
24385 sv_catsv(key, fq_name);
24388 /* We only call the sub once throughout the life of the program
24389 * (with the /i, non-/i exception noted above). That means the
24390 * hash must be global and accessible to all threads. It is
24391 * created at program start-up, before any threads are created, so
24392 * is accessible to all children. But this creates some
24395 * 1) The keys can't be shared, or else problems arise; sharing is
24396 * turned off at hash creation time
24397 * 2) All SVs in it are there for the remainder of the life of the
24398 * program, and must be created in the same interpreter context
24399 * as the hash, or else they will be freed from the wrong pool
24400 * at global destruction time. This is handled by switching to
24401 * the hash's context to create each SV going into it, and then
24402 * immediately switching back
24403 * 3) All accesses to the hash must be controlled by a mutex, to
24404 * prevent two threads from getting an unstable state should
24405 * they simultaneously be accessing it. The code below is
24406 * crafted so that the mutex is locked whenever there is an
24407 * access and unlocked only when the next stable state is
24410 * The hash stores either the definition of the property if it was
24411 * valid, or, if invalid, the error message that was raised. We
24412 * use the type of SV to distinguish.
24414 * There's also the need to guard against the definition expansion
24415 * from infinitely recursing. This is handled by storing the aTHX
24416 * of the expanding thread during the expansion. Again the SV type
24417 * is used to distinguish this from the other two cases. If we
24418 * come to here and the hash entry for this property is our aTHX,
24419 * it means we have recursed, and the code assumes that we would
24420 * infinitely recurse, so instead stops and raises an error.
24421 * (Any recursion has always been treated as infinite recursion in
24424 * If instead, the entry is for a different aTHX, it means that
24425 * that thread has gotten here first, and hasn't finished expanding
24426 * the definition yet. We just have to wait until it is done. We
24427 * sleep and retry a few times, returning an error if the other
24428 * thread doesn't complete. */
24431 USER_PROP_MUTEX_LOCK;
24433 /* If we have an entry for this key, the subroutine has already
24434 * been called once with this /i status. */
24435 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24436 SvPVX(key), SvCUR(key), 0);
24437 if (saved_user_prop_ptr) {
24439 /* If the saved result is an inversion list, it is the valid
24440 * definition of this property */
24441 if (is_invlist(*saved_user_prop_ptr)) {
24442 prop_definition = *saved_user_prop_ptr;
24444 /* The SV in the hash won't be removed until global
24445 * destruction, so it is stable and we can unlock */
24446 USER_PROP_MUTEX_UNLOCK;
24448 /* The caller shouldn't try to free this SV */
24449 return prop_definition;
24452 /* Otherwise, if it is a string, it is the error message
24453 * that was returned when we first tried to evaluate this
24454 * property. Fail, and append the message */
24455 if (SvPOK(*saved_user_prop_ptr)) {
24456 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24457 sv_catsv(msg, *saved_user_prop_ptr);
24459 /* The SV in the hash won't be removed until global
24460 * destruction, so it is stable and we can unlock */
24461 USER_PROP_MUTEX_UNLOCK;
24466 assert(SvIOK(*saved_user_prop_ptr));
24468 /* Here, we have an unstable entry in the hash. Either another
24469 * thread is in the middle of expanding the property's
24470 * definition, or we are ourselves recursing. We use the aTHX
24471 * in it to distinguish */
24472 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24474 /* Here, it's another thread doing the expanding. We've
24475 * looked as much as we are going to at the contents of the
24476 * hash entry. It's safe to unlock. */
24477 USER_PROP_MUTEX_UNLOCK;
24479 /* Retry a few times */
24480 if (retry_countdown-- > 0) {
24485 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24486 sv_catpvs(msg, "Timeout waiting for another thread to "
24488 goto append_name_to_msg;
24491 /* Here, we are recursing; don't dig any deeper */
24492 USER_PROP_MUTEX_UNLOCK;
24494 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24496 "Infinite recursion in user-defined property");
24497 goto append_name_to_msg;
24500 /* Here, this thread has exclusive control, and there is no entry
24501 * for this property in the hash. So we have the go ahead to
24502 * expand the definition ourselves. */
24504 PUSHSTACKi(PERLSI_REGCOMP);
24507 /* Create a temporary placeholder in the hash to detect recursion
24509 SWITCH_TO_GLOBAL_CONTEXT;
24510 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24511 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24514 /* Now that we have a placeholder, we can let other threads
24516 USER_PROP_MUTEX_UNLOCK;
24518 /* Make sure the placeholder always gets destroyed */
24519 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24524 /* Call the user's function, with the /i status as a parameter.
24525 * Note that we have gone to a lot of trouble to keep this call
24526 * from being within the locked mutex region. */
24527 XPUSHs(boolSV(to_fold));
24530 /* The following block was taken from swash_init(). Presumably
24531 * they apply to here as well, though we no longer use a swash --
24535 /* We might get here via a subroutine signature which uses a utf8
24536 * parameter name, at which point PL_subname will have been set
24537 * but not yet used. */
24538 save_item(PL_subname);
24540 /* G_SCALAR guarantees a single return value */
24541 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24546 if (TAINT_get || SvTRUE(error)) {
24547 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24548 if (SvTRUE(error)) {
24549 sv_catpvs(msg, "Error \"");
24550 sv_catsv(msg, error);
24551 sv_catpvs(msg, "\"");
24554 if (SvTRUE(error)) sv_catpvs(msg, "; ");
24555 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24558 if (name_len > 0) {
24559 sv_catpvs(msg, " in expansion of ");
24560 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24566 prop_definition = NULL;
24569 SV * contents = POPs;
24571 /* The contents is supposed to be the expansion of the property
24572 * definition. If the definition is deferrable, and we got an
24573 * empty string back, set a flag to later defer it (after clean
24576 && (! SvPOK(contents) || SvCUR(contents) == 0))
24578 empty_return = TRUE;
24580 else { /* Otherwise, call a function to check for valid syntax,
24583 prop_definition = handle_user_defined_property(
24585 is_utf8, to_fold, runtime,
24587 contents, user_defined_ptr,
24593 /* Here, we have the results of the expansion. Delete the
24594 * placeholder, and if the definition is now known, replace it with
24595 * that definition. We need exclusive access to the hash, and we
24596 * can't let anyone else in, between when we delete the placeholder
24597 * and add the permanent entry */
24598 USER_PROP_MUTEX_LOCK;
24600 S_delete_recursion_entry(aTHX_ SvPVX(key));
24602 if ( ! empty_return
24603 && (! prop_definition || is_invlist(prop_definition)))
24605 /* If we got success we use the inversion list defining the
24606 * property; otherwise use the error message */
24607 SWITCH_TO_GLOBAL_CONTEXT;
24608 (void) hv_store_ent(PL_user_def_props,
24611 ? newSVsv(prop_definition)
24617 /* All done, and the hash now has a permanent entry for this
24618 * property. Give up exclusive control */
24619 USER_PROP_MUTEX_UNLOCK;
24625 if (empty_return) {
24626 goto definition_deferred;
24629 if (prop_definition) {
24631 /* If the definition is for something not known at this time,
24632 * we toss it, and go return the main property name, as that's
24633 * the one the user will be aware of */
24634 if (! is_invlist(prop_definition)) {
24635 SvREFCNT_dec_NN(prop_definition);
24636 goto definition_deferred;
24639 sv_2mortal(prop_definition);
24643 return prop_definition;
24645 } /* End of calling the subroutine for the user-defined property */
24646 } /* End of it could be a user-defined property */
24648 /* Here it wasn't a user-defined property that is known at this time. See
24649 * if it is a Unicode property */
24651 lookup_len = j; /* This is a more mnemonic name than 'j' */
24653 /* Get the index into our pointer table of the inversion list corresponding
24654 * to the property */
24655 table_index = do_uniprop_match(lookup_name, lookup_len);
24657 /* If it didn't find the property ... */
24658 if (table_index == 0) {
24660 /* Try again stripping off any initial 'Is'. This is because we
24661 * promise that an initial Is is optional. The same isn't true of
24662 * names that start with 'In'. Those can match only blocks, and the
24663 * lookup table already has those accounted for. The lookup table also
24664 * has already accounted for Perl extensions (without and = sign)
24665 * starting with 'i's'. */
24666 if (starts_with_Is && equals_pos >= 0) {
24672 table_index = do_uniprop_match(lookup_name, lookup_len);
24675 if (table_index == 0) {
24678 /* Here, we didn't find it. If not a numeric type property, and
24679 * can't be a user-defined one, it isn't a legal property */
24680 if (! is_nv_type) {
24681 if (! could_be_user_defined) {
24685 /* Here, the property name is legal as a user-defined one. At
24686 * compile time, it might just be that the subroutine for that
24687 * property hasn't been encountered yet, but at runtime, it's
24688 * an error to try to use an undefined one */
24689 if (! deferrable) {
24690 goto unknown_user_defined;;
24693 goto definition_deferred;
24694 } /* End of isn't a numeric type property */
24696 /* The numeric type properties need more work to decide. What we
24697 * do is make sure we have the number in canonical form and look
24700 if (slash_pos < 0) { /* No slash */
24702 /* When it isn't a rational, take the input, convert it to a
24703 * NV, then create a canonical string representation of that
24707 SSize_t value_len = lookup_len - equals_pos;
24709 /* Get the value */
24710 if ( value_len <= 0
24711 || my_atof3(lookup_name + equals_pos, &value,
24713 != lookup_name + lookup_len)
24718 /* If the value is an integer, the canonical value is integral
24720 if (Perl_ceil(value) == value) {
24721 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24722 equals_pos, lookup_name, value);
24724 else { /* Otherwise, it is %e with a known precision */
24727 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24728 equals_pos, lookup_name,
24729 PL_E_FORMAT_PRECISION, value);
24731 /* The exponent generated is expecting two digits, whereas
24732 * %e on some systems will generate three. Remove leading
24733 * zeros in excess of 2 from the exponent. We start
24734 * looking for them after the '=' */
24735 exp_ptr = strchr(canonical + equals_pos, 'e');
24737 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24738 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24740 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24742 if (excess_exponent_len > 0) {
24743 SSize_t leading_zeros = strspn(cur_ptr, "0");
24744 SSize_t excess_leading_zeros
24745 = MIN(leading_zeros, excess_exponent_len);
24746 if (excess_leading_zeros > 0) {
24747 Move(cur_ptr + excess_leading_zeros,
24749 strlen(cur_ptr) - excess_leading_zeros
24750 + 1, /* Copy the NUL as well */
24757 else { /* Has a slash. Create a rational in canonical form */
24758 UV numerator, denominator, gcd, trial;
24759 const char * end_ptr;
24760 const char * sign = "";
24762 /* We can't just find the numerator, denominator, and do the
24763 * division, then use the method above, because that is
24764 * inexact. And the input could be a rational that is within
24765 * epsilon (given our precision) of a valid rational, and would
24766 * then incorrectly compare valid.
24768 * We're only interested in the part after the '=' */
24769 const char * this_lookup_name = lookup_name + equals_pos;
24770 lookup_len -= equals_pos;
24771 slash_pos -= equals_pos;
24773 /* Handle any leading minus */
24774 if (this_lookup_name[0] == '-') {
24776 this_lookup_name++;
24781 /* Convert the numerator to numeric */
24782 end_ptr = this_lookup_name + slash_pos;
24783 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24787 /* It better have included all characters before the slash */
24788 if (*end_ptr != '/') {
24792 /* Set to look at just the denominator */
24793 this_lookup_name += slash_pos;
24794 lookup_len -= slash_pos;
24795 end_ptr = this_lookup_name + lookup_len;
24797 /* Convert the denominator to numeric */
24798 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24802 /* It better be the rest of the characters, and don't divide by
24804 if ( end_ptr != this_lookup_name + lookup_len
24805 || denominator == 0)
24810 /* Get the greatest common denominator using
24811 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24813 trial = denominator;
24814 while (trial != 0) {
24816 trial = gcd % trial;
24820 /* If already in lowest possible terms, we have already tried
24821 * looking this up */
24826 /* Reduce the rational, which should put it in canonical form
24829 denominator /= gcd;
24831 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24832 equals_pos, lookup_name, sign, numerator, denominator);
24835 /* Here, we have the number in canonical form. Try that */
24836 table_index = do_uniprop_match(canonical, strlen(canonical));
24837 if (table_index == 0) {
24840 } /* End of still didn't find the property in our table */
24841 } /* End of didn't find the property in our table */
24843 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24844 * A negative return signifies that the real index is the absolute value,
24845 * but the result needs to be inverted */
24846 if (table_index < 0) {
24847 invert_return = TRUE;
24848 table_index = -table_index;
24851 /* Out-of band indices indicate a deprecated property. The proper index is
24852 * modulo it with the table size. And dividing by the table size yields
24853 * an offset into a table constructed by regen/mk_invlists.pl to contain
24854 * the corresponding warning message */
24855 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24856 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24857 table_index %= MAX_UNI_KEYWORD_INDEX;
24858 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24859 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24860 (int) name_len, name,
24861 get_deprecated_property_msg(warning_offset));
24864 /* In a few properties, a different property is used under /i. These are
24865 * unlikely to change, so are hard-coded here. */
24867 if ( table_index == UNI_XPOSIXUPPER
24868 || table_index == UNI_XPOSIXLOWER
24869 || table_index == UNI_TITLE)
24871 table_index = UNI_CASED;
24873 else if ( table_index == UNI_UPPERCASELETTER
24874 || table_index == UNI_LOWERCASELETTER
24875 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24876 || table_index == UNI_TITLECASELETTER
24879 table_index = UNI_CASEDLETTER;
24881 else if ( table_index == UNI_POSIXUPPER
24882 || table_index == UNI_POSIXLOWER)
24884 table_index = UNI_POSIXALPHA;
24888 /* Create and return the inversion list */
24889 prop_definition = get_prop_definition(table_index);
24890 sv_2mortal(prop_definition);
24892 /* See if there is a private use override to add to this definition */
24894 COPHH * hinthash = (IN_PERL_COMPILETIME)
24895 ? CopHINTHASH_get(&PL_compiling)
24896 : CopHINTHASH_get(PL_curcop);
24897 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24899 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24901 /* See if there is an element in the hints hash for this table */
24902 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24903 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24907 SV * pu_definition;
24909 SV * expanded_prop_definition =
24910 sv_2mortal(invlist_clone(prop_definition, NULL));
24912 /* If so, it's definition is the string from here to the next
24913 * \a character. And its format is the same as a user-defined
24915 pos += SvCUR(pu_lookup);
24916 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24917 pu_invlist = handle_user_defined_property(lookup_name,
24920 0, /* Not folded */
24928 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24929 sv_catpvs(msg, "Insecure private-use override");
24930 goto append_name_to_msg;
24933 /* For now, as a safety measure, make sure that it doesn't
24934 * override non-private use code points */
24935 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24937 /* Add it to the list to be returned */
24938 _invlist_union(prop_definition, pu_invlist,
24939 &expanded_prop_definition);
24940 prop_definition = expanded_prop_definition;
24941 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24946 if (invert_return) {
24947 _invlist_invert(prop_definition);
24949 return prop_definition;
24951 unknown_user_defined:
24952 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24953 sv_catpvs(msg, "Unknown user-defined property name");
24954 goto append_name_to_msg;
24957 if (non_pkg_begin != 0) {
24958 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24959 sv_catpvs(msg, "Illegal user-defined property name");
24962 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24963 sv_catpvs(msg, "Can't find Unicode property definition");
24967 append_name_to_msg:
24969 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24970 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24972 sv_catpv(msg, prefix);
24973 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24974 sv_catpv(msg, suffix);
24979 definition_deferred:
24982 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
24984 /* Here it could yet to be defined, so defer evaluation of this until
24985 * its needed at runtime. We need the fully qualified property name to
24986 * avoid ambiguity */
24988 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24992 /* If it didn't come with a package, or the package is utf8::, this
24993 * actually could be an official Unicode property whose inclusion we
24994 * are deferring until runtime to make sure that it isn't overridden by
24995 * a user-defined property of the same name (which we haven't
24996 * encountered yet). Add a marker to indicate this possibility, for
24997 * use at such time when we first need the definition during pattern
24998 * matching execution */
24999 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25000 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25003 /* We also need a trailing newline */
25004 sv_catpvs(fq_name, "\n");
25006 *user_defined_ptr = TRUE;
25012 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25013 const STRLEN wname_len, /* Its length */
25014 SV ** prop_definition,
25017 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25018 * any matches, adding them to prop_definition */
25022 CV * get_names_info; /* entry to charnames.pm to get info we need */
25023 SV * names_string; /* Contains all character names, except algo */
25024 SV * algorithmic_names; /* Contains info about algorithmically
25025 generated character names */
25026 REGEXP * subpattern_re; /* The user's pattern to match with */
25027 struct regexp * prog; /* The compiled pattern */
25028 char * all_names_start; /* lib/unicore/Name.pl string of every
25029 (non-algorithmic) character name */
25030 char * cur_pos; /* We match, effectively using /gc; this is
25031 where we are now */
25032 bool found_matches = FALSE; /* Did any name match so far? */
25033 SV * empty; /* For matching zero length names */
25034 SV * must_sv; /* Contains the substring, if any, that must be
25035 in a name for the subpattern to match */
25036 const char * must; /* The PV of 'must' */
25037 STRLEN must_len; /* And its length */
25038 SV * syllable_name = NULL; /* For Hangul syllables */
25039 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25040 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25042 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25043 * syllable name, and these are immutable and guaranteed by the Unicode
25044 * standard to never be extended */
25045 const STRLEN syl_max_len = hangul_prefix_len + 7;
25049 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25051 /* Make sure _charnames is loaded. (The parameters give context
25052 * for any errors generated */
25053 get_names_info = get_cv("_charnames::_get_names_info", 0);
25054 if (! get_names_info) {
25055 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25058 /* Get the charnames data */
25059 PUSHSTACKi(PERLSI_REGCOMP);
25067 /* Special _charnames entry point that returns the info this routine
25069 call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25073 /* Data structure for names which end in their very own code points */
25074 algorithmic_names = POPs;
25075 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25077 /* The lib/unicore/Name.pl string */
25078 names_string = POPs;
25079 SvREFCNT_inc_simple_void_NN(names_string);
25086 if ( ! SvROK(names_string)
25087 || ! SvROK(algorithmic_names))
25088 { /* Perhaps should panic instead XXX */
25089 SvREFCNT_dec(names_string);
25090 SvREFCNT_dec(algorithmic_names);
25094 names_string = sv_2mortal(SvRV(names_string));
25095 all_names_start = SvPVX(names_string);
25096 cur_pos = all_names_start;
25098 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25100 /* Compile the subpattern consisting of the name being looked for */
25101 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25103 must_sv = re_intuit_string(subpattern_re);
25105 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25106 must_sv = sv_2mortal(newSVsv(must_sv));
25107 must = SvPV(must_sv, must_len);
25114 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25115 * This works because the NUL causes the function to return early, thus
25116 * showing that there are characters in it other than the acceptable ones,
25117 * which is our desired result.) */
25119 prog = ReANY(subpattern_re);
25121 /* If only nothing is matched, skip to where empty names are looked for */
25122 if (prog->maxlen == 0) {
25126 /* And match against the string of all names /gc. Don't even try if it
25127 * must match a character not found in any name. */
25128 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25130 while (execute_wildcard(subpattern_re,
25132 SvEND(names_string),
25133 all_names_start, 0,
25136 { /* Here, matched. */
25138 /* Note the string entries look like
25139 * 00001\nSTART OF HEADING\n\n
25140 * so we could match anywhere in that string. We have to rule out
25141 * matching a code point line */
25142 char * this_name_start = all_names_start
25143 + RX_OFFS(subpattern_re)->start;
25144 char * this_name_end = all_names_start
25145 + RX_OFFS(subpattern_re)->end;
25148 UV cp = 0; /* Silences some compilers */
25149 AV * this_string = NULL;
25150 bool is_multi = FALSE;
25152 /* If matched nothing, advance to next possible match */
25153 if (this_name_start == this_name_end) {
25154 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25155 SvEND(names_string) - this_name_end);
25156 if (cur_pos == NULL) {
25161 /* Position the next match to start beyond the current returned
25163 cur_pos = (char *) memchr(this_name_end, '\n',
25164 SvEND(names_string) - this_name_end);
25167 /* Back up to the \n just before the beginning of the character. */
25168 cp_end = (char *) my_memrchr(all_names_start,
25170 this_name_start - all_names_start);
25172 /* If we didn't find a \n, it means it matched somewhere in the
25173 * initial '00000' in the string, so isn't a real match */
25174 if (cp_end == NULL) {
25178 this_name_start = cp_end + 1; /* The name starts just after */
25179 cp_end--; /* the \n, and the code point */
25180 /* ends just before it */
25182 /* All code points are 5 digits long */
25183 cp_start = cp_end - 4;
25185 /* This shouldn't happen, as we found a \n, and the first \n is
25186 * further along than what we subtracted */
25187 assert(cp_start >= all_names_start);
25189 if (cp_start == all_names_start) {
25190 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25194 /* If the character is a blank, we either have a named sequence, or
25195 * something is wrong */
25196 if (*(cp_start - 1) == ' ') {
25197 cp_start = (char *) my_memrchr(all_names_start,
25199 cp_start - all_names_start);
25203 assert(cp_start != NULL && cp_start >= all_names_start + 2);
25205 /* Except for the first line in the string, the sequence before the
25206 * code point is \n\n. If that isn't the case here, we didn't
25207 * match the name of a character. (We could have matched a named
25208 * sequence, not currently handled */
25209 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25213 /* We matched! Add this to the list */
25214 found_matches = TRUE;
25216 /* Loop through all the code points in the sequence */
25217 while (cp_start < cp_end) {
25219 /* Calculate this code point from its 5 digits */
25220 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25221 + (XDIGIT_VALUE(cp_start[1]) << 12)
25222 + (XDIGIT_VALUE(cp_start[2]) << 8)
25223 + (XDIGIT_VALUE(cp_start[3]) << 4)
25224 + XDIGIT_VALUE(cp_start[4]);
25226 cp_start += 6; /* Go past any blank */
25228 if (cp_start < cp_end || is_multi) {
25229 if (this_string == NULL) {
25230 this_string = newAV();
25234 av_push(this_string, newSVuv(cp));
25238 if (is_multi) { /* Was more than one code point */
25239 if (*strings == NULL) {
25240 *strings = newAV();
25243 av_push(*strings, (SV *) this_string);
25245 else { /* Only a single code point */
25246 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25248 } /* End of loop through the non-algorithmic names string */
25251 /* There are also character names not in 'names_string'. These are
25252 * algorithmically generatable. Try this pattern on each possible one.
25253 * (khw originally planned to leave this out given the large number of
25254 * matches attempted; but the speed turned out to be quite acceptable
25256 * There are plenty of opportunities to optimize to skip many of the tests.
25257 * beyond the rudimentary ones already here */
25259 /* First see if the subpattern matches any of the algorithmic generatable
25260 * Hangul syllable names.
25262 * We know none of these syllable names will match if the input pattern
25263 * requires more bytes than any syllable has, or if the input pattern only
25264 * matches an empty name, or if the pattern has something it must match and
25265 * one of the characters in that isn't in any Hangul syllable. */
25266 if ( prog->minlen <= (SSize_t) syl_max_len
25267 && prog->maxlen > 0
25268 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25270 /* These constants, names, values, and algorithm are adapted from the
25271 * Unicode standard, version 5.1, section 3.12, and should never
25273 const char * JamoL[] = {
25274 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25275 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25277 const int LCount = C_ARRAY_LENGTH(JamoL);
25279 const char * JamoV[] = {
25280 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25281 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25284 const int VCount = C_ARRAY_LENGTH(JamoV);
25286 const char * JamoT[] = {
25287 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25288 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25289 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25291 const int TCount = C_ARRAY_LENGTH(JamoT);
25295 /* This is the initial Hangul syllable code point; each time through the
25296 * inner loop, it maps to the next higher code point. For more info,
25297 * see the Hangul syllable section of the Unicode standard. */
25300 syllable_name = sv_2mortal(newSV(syl_max_len));
25301 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25303 for (L = 0; L < LCount; L++) {
25304 for (V = 0; V < VCount; V++) {
25305 for (T = 0; T < TCount; T++) {
25307 /* Truncate back to the prefix, which is unvarying */
25308 SvCUR_set(syllable_name, hangul_prefix_len);
25310 sv_catpv(syllable_name, JamoL[L]);
25311 sv_catpv(syllable_name, JamoV[V]);
25312 sv_catpv(syllable_name, JamoT[T]);
25314 if (execute_wildcard(subpattern_re,
25315 SvPVX(syllable_name),
25316 SvEND(syllable_name),
25317 SvPVX(syllable_name), 0,
25321 *prop_definition = add_cp_to_invlist(*prop_definition,
25323 found_matches = TRUE;
25332 /* The rest of the algorithmically generatable names are of the form
25333 * "PREFIX-code_point". The prefixes and the code point limits of each
25334 * were returned to us in the array 'algorithmic_names' from data in
25335 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
25336 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25339 /* Each element of the array is a hash, giving the details for the
25340 * series of names it covers. There is the base name of the characters
25341 * in the series, and the low and high code points in the series. And,
25342 * for optimization purposes a string containing all the legal
25343 * characters that could possibly be in a name in this series. */
25344 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25345 SV * prefix = * hv_fetchs(this_series, "name", 0);
25346 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25347 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25348 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25350 /* Pre-allocate an SV with enough space */
25351 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25353 if (high >= 0x10000) {
25354 sv_catpvs(algo_name, "0");
25357 /* This series can be skipped entirely if the pattern requires
25358 * something longer than any name in the series, or can only match an
25359 * empty name, or contains a character not found in any name in the
25361 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
25362 && prog->maxlen > 0
25363 && (strspn(must, legal) == must_len))
25365 for (j = low; j <= high; j++) { /* For each code point in the series */
25367 /* Get its name, and see if it matches the subpattern */
25368 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25371 if (execute_wildcard(subpattern_re,
25374 SvPVX(algo_name), 0,
25378 *prop_definition = add_cp_to_invlist(*prop_definition, j);
25379 found_matches = TRUE;
25386 /* Finally, see if the subpattern matches an empty string */
25387 empty = newSVpvs("");
25388 if (execute_wildcard(subpattern_re,
25395 /* Many code points have empty names. Currently these are the \p{GC=C}
25396 * ones, minus CC and CF */
25398 SV * empty_names_ref = get_prop_definition(UNI_C);
25399 SV * empty_names = invlist_clone(empty_names_ref, NULL);
25401 SV * subtract = get_prop_definition(UNI_CC);
25403 _invlist_subtract(empty_names, subtract, &empty_names);
25404 SvREFCNT_dec_NN(empty_names_ref);
25405 SvREFCNT_dec_NN(subtract);
25407 subtract = get_prop_definition(UNI_CF);
25408 _invlist_subtract(empty_names, subtract, &empty_names);
25409 SvREFCNT_dec_NN(subtract);
25411 _invlist_union(*prop_definition, empty_names, prop_definition);
25412 found_matches = TRUE;
25413 SvREFCNT_dec_NN(empty_names);
25415 SvREFCNT_dec_NN(empty);
25418 /* If we ever were to accept aliases for, say private use names, we would
25419 * need to do something fancier to find empty names. The code below works
25420 * (at the time it was written), and is slower than the above */
25421 const char empties_pat[] = "^.";
25422 if (strNE(name, empties_pat)) {
25423 SV * empty = newSVpvs("");
25424 if (execute_wildcard(subpattern_re,
25431 SV * empties = NULL;
25433 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25435 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25436 SvREFCNT_dec_NN(empties);
25438 found_matches = TRUE;
25440 SvREFCNT_dec_NN(empty);
25444 SvREFCNT_dec_NN(subpattern_re);
25445 return found_matches;
25449 * ex: set ts=8 sts=4 sw=4 et: