5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
154 #define STATIC static
157 /* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
161 typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
166 bool in_gosub; /* this or an outer frame is for GOSUB */
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
173 /* Certain characters are output as a sequence with the first being a
175 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
178 struct RExC_state_t {
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
181 char *precomp; /* uncompiled string. */
182 char *precomp_end; /* pointer to end of uncompiled string. */
183 REGEXP *rx_sv; /* The SV that is the regexp. */
184 regexp *rx; /* perl core regexp structure */
185 regexp_internal *rxi; /* internal data for regexp object
187 char *start; /* Start of input for compile */
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
196 SSize_t whilem_seen; /* number of WHILEM in this expr */
197 regnode *emit_start; /* Start of emitted-code area */
198 regnode_offset emit; /* Code-emit pointer */
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
201 SSize_t size; /* Number of regnode equivalents in
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
228 I32 nestroot; /* root parens we are in - used by
231 regnode *end_op; /* END node in program */
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
240 I32 recurse_count; /* Number of recurse regops we have generated */
241 regnode **recurse; /* Recurse regops */
242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
248 I32 override_recoding;
249 I32 recode_x_to_native;
250 I32 in_multi_char_class;
251 int code_index; /* next code_blocks[] slot */
252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
254 SSize_t maxlen; /* mininum possible number of chars in string to match */
255 scan_frame *frame_head;
256 scan_frame *frame_last;
260 SV *runtime_code_qr; /* qr with the runtime code blocks */
262 const char *lastparse;
264 U32 study_chunk_recursed_count;
265 AV *paren_name_list; /* idx -> name */
269 #define RExC_lastparse (pRExC_state->lastparse)
270 #define RExC_lastnum (pRExC_state->lastnum)
271 #define RExC_paren_name_list (pRExC_state->paren_name_list)
272 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
273 #define RExC_mysv (pRExC_state->mysv1)
274 #define RExC_mysv1 (pRExC_state->mysv1)
275 #define RExC_mysv2 (pRExC_state->mysv2)
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
287 #define RExC_flags (pRExC_state->flags)
288 #define RExC_pm_flags (pRExC_state->pm_flags)
289 #define RExC_precomp (pRExC_state->precomp)
290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
292 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
293 #define RExC_precomp_end (pRExC_state->precomp_end)
294 #define RExC_rx_sv (pRExC_state->rx_sv)
295 #define RExC_rx (pRExC_state->rx)
296 #define RExC_rxi (pRExC_state->rxi)
297 #define RExC_start (pRExC_state->start)
298 #define RExC_end (pRExC_state->end)
299 #define RExC_parse (pRExC_state->parse)
300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
301 #define RExC_whilem_seen (pRExC_state->whilem_seen)
302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
305 #ifdef RE_TRACK_PATTERN_OFFSETS
306 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
309 #define RExC_emit (pRExC_state->emit)
310 #define RExC_emit_start (pRExC_state->emit_start)
311 #define RExC_sawback (pRExC_state->sawback)
312 #define RExC_seen (pRExC_state->seen)
313 #define RExC_size (pRExC_state->size)
314 #define RExC_maxlen (pRExC_state->maxlen)
315 #define RExC_npar (pRExC_state->npar)
316 #define RExC_total_parens (pRExC_state->total_par)
317 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
318 #define RExC_nestroot (pRExC_state->nestroot)
319 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
320 #define RExC_utf8 (pRExC_state->utf8)
321 #define RExC_uni_semantics (pRExC_state->uni_semantics)
322 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
323 #define RExC_open_parens (pRExC_state->open_parens)
324 #define RExC_close_parens (pRExC_state->close_parens)
325 #define RExC_end_op (pRExC_state->end_op)
326 #define RExC_paren_names (pRExC_state->paren_names)
327 #define RExC_recurse (pRExC_state->recurse)
328 #define RExC_recurse_count (pRExC_state->recurse_count)
329 #define RExC_sets_depth (pRExC_state->sets_depth)
330 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
331 #define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
333 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
334 #define RExC_in_lookahead (pRExC_state->in_lookahead)
335 #define RExC_contains_locale (pRExC_state->contains_locale)
336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
339 # define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
342 # define SET_recode_x_to_native(x) NOOP
345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
346 #define RExC_frame_head (pRExC_state->frame_head)
347 #define RExC_frame_last (pRExC_state->frame_last)
348 #define RExC_frame_count (pRExC_state->frame_count)
349 #define RExC_strict (pRExC_state->strict)
350 #define RExC_study_started (pRExC_state->study_started)
351 #define RExC_warn_text (pRExC_state->warn_text)
352 #define RExC_in_script_run (pRExC_state->in_script_run)
353 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
363 #define RExC_naughty (pRExC_state->naughty)
364 #define TOO_NAUGHTY (10)
365 #define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368 #define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
372 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373 #define ISMULT2(s) (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
376 * Flags to be passed up and down.
378 #define HASWIDTH 0x01 /* Known to not match null strings, could match
380 #define SIMPLE 0x02 /* Exactly one character wide */
381 /* (or LNBREAK as a special case) */
382 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
383 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
384 #define RESTART_PARSE 0x20 /* Need to redo the parse */
385 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
386 calcuate sizes as UTF-8 */
388 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
390 /* whether trie related optimizations are enabled */
391 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
392 #define TRIE_STUDY_OPT
393 #define FULL_TRIE_STUDY
399 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
400 #define PBITVAL(paren) (1 << ((paren) & 7))
401 #define PAREN_OFFSET(depth) \
402 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
403 #define PAREN_TEST(depth, paren) \
404 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
405 #define PAREN_SET(depth, paren) \
406 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
407 #define PAREN_UNSET(depth, paren) \
408 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
410 #define REQUIRE_UTF8(flagp) STMT_START { \
412 *flagp = RESTART_PARSE|NEED_UTF8; \
417 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
418 * a flag that indicates we need to override /d with /u as a result of
419 * something in the pattern. It should only be used in regards to calling
420 * set_regex_charset() or get_regex_charset() */
421 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
423 if (DEPENDS_SEMANTICS) { \
424 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
425 RExC_uni_semantics = 1; \
426 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
427 /* No need to restart the parse if we haven't seen \
428 * anything that differs between /u and /d, and no need \
429 * to restart immediately if we're going to reparse \
430 * anyway to count parens */ \
431 *flagp |= RESTART_PARSE; \
432 return restart_retval; \
437 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
439 RExC_use_BRANCHJ = 1; \
440 *flagp |= RESTART_PARSE; \
441 return restart_retval; \
444 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
445 * less. After that, it must always be positive, because the whole re is
446 * considered to be surrounded by virtual parens. Setting it to negative
447 * indicates there is some construct that needs to know the actual number of
448 * parens to be properly handled. And that means an extra pass will be
449 * required after we've counted them all */
450 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
451 #define REQUIRE_PARENS_PASS \
452 STMT_START { /* No-op if have completed a pass */ \
453 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
455 #define IN_PARENS_PASS (RExC_total_parens < 0)
458 /* This is used to return failure (zero) early from the calling function if
459 * various flags in 'flags' are set. Two flags always cause a return:
460 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
461 * additional flags that should cause a return; 0 if none. If the return will
462 * be done, '*flagp' is first set to be all of the flags that caused the
464 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
466 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
467 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
472 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
474 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
475 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
476 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
477 if (MUST_RESTART(*(flagp))) return 0
479 /* This converts the named class defined in regcomp.h to its equivalent class
480 * number defined in handy.h. */
481 #define namedclass_to_classnum(class) ((int) ((class) / 2))
482 #define classnum_to_namedclass(classnum) ((classnum) * 2)
484 #define _invlist_union_complement_2nd(a, b, output) \
485 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
486 #define _invlist_intersection_complement_2nd(a, b, output) \
487 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
489 /* We add a marker if we are deferring expansion of a property that is both
490 * 1) potentiallly user-defined; and
491 * 2) could also be an official Unicode property.
493 * Without this marker, any deferred expansion can only be for a user-defined
494 * one. This marker shouldn't conflict with any that could be in a legal name,
495 * and is appended to its name to indicate this. There is a string and
497 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
498 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
500 /* What is infinity for optimization purposes */
501 #define OPTIMIZE_INFTY SSize_t_MAX
503 /* About scan_data_t.
505 During optimisation we recurse through the regexp program performing
506 various inplace (keyhole style) optimisations. In addition study_chunk
507 and scan_commit populate this data structure with information about
508 what strings MUST appear in the pattern. We look for the longest
509 string that must appear at a fixed location, and we look for the
510 longest string that may appear at a floating location. So for instance
515 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
516 strings (because they follow a .* construct). study_chunk will identify
517 both FOO and BAR as being the longest fixed and floating strings respectively.
519 The strings can be composites, for instance
523 will result in a composite fixed substring 'foo'.
525 For each string some basic information is maintained:
528 This is the position the string must appear at, or not before.
529 It also implicitly (when combined with minlenp) tells us how many
530 characters must match before the string we are searching for.
531 Likewise when combined with minlenp and the length of the string it
532 tells us how many characters must appear after the string we have
536 Only used for floating strings. This is the rightmost point that
537 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
538 string can occur infinitely far to the right.
539 For fixed strings, it is equal to min_offset.
542 A pointer to the minimum number of characters of the pattern that the
543 string was found inside. This is important as in the case of positive
544 lookahead or positive lookbehind we can have multiple patterns
549 The minimum length of the pattern overall is 3, the minimum length
550 of the lookahead part is 3, but the minimum length of the part that
551 will actually match is 1. So 'FOO's minimum length is 3, but the
552 minimum length for the F is 1. This is important as the minimum length
553 is used to determine offsets in front of and behind the string being
554 looked for. Since strings can be composites this is the length of the
555 pattern at the time it was committed with a scan_commit. Note that
556 the length is calculated by study_chunk, so that the minimum lengths
557 are not known until the full pattern has been compiled, thus the
558 pointer to the value.
562 In the case of lookbehind the string being searched for can be
563 offset past the start point of the final matching string.
564 If this value was just blithely removed from the min_offset it would
565 invalidate some of the calculations for how many chars must match
566 before or after (as they are derived from min_offset and minlen and
567 the length of the string being searched for).
568 When the final pattern is compiled and the data is moved from the
569 scan_data_t structure into the regexp structure the information
570 about lookbehind is factored in, with the information that would
571 have been lost precalculated in the end_shift field for the
574 The fields pos_min and pos_delta are used to store the minimum offset
575 and the delta to the maximum offset at the current point in the pattern.
579 struct scan_data_substrs {
580 SV *str; /* longest substring found in pattern */
581 SSize_t min_offset; /* earliest point in string it can appear */
582 SSize_t max_offset; /* latest point in string it can appear */
583 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
584 SSize_t lookbehind; /* is the pos of the string modified by LB */
585 I32 flags; /* per substring SF_* and SCF_* flags */
588 typedef struct scan_data_t {
589 /*I32 len_min; unused */
590 /*I32 len_delta; unused */
594 SSize_t last_end; /* min value, <0 unless valid. */
595 SSize_t last_start_min;
596 SSize_t last_start_max;
597 U8 cur_is_floating; /* whether the last_* values should be set as
598 * the next fixed (0) or floating (1)
601 /* [0] is longest fixed substring so far, [1] is longest float so far */
602 struct scan_data_substrs substrs[2];
604 I32 flags; /* common SF_* and SCF_* flags */
606 SSize_t *last_closep;
607 regnode_ssc *start_class;
611 * Forward declarations for pregcomp()'s friends.
614 static const scan_data_t zero_scan_data = {
615 0, 0, NULL, 0, 0, 0, 0,
617 { NULL, 0, 0, 0, 0, 0 },
618 { NULL, 0, 0, 0, 0, 0 },
625 #define SF_BEFORE_SEOL 0x0001
626 #define SF_BEFORE_MEOL 0x0002
627 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
629 #define SF_IS_INF 0x0040
630 #define SF_HAS_PAR 0x0080
631 #define SF_IN_PAR 0x0100
632 #define SF_HAS_EVAL 0x0200
635 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
636 * longest substring in the pattern. When it is not set the optimiser keeps
637 * track of position, but does not keep track of the actual strings seen,
639 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
642 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
643 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
644 * turned off because of the alternation (BRANCH). */
645 #define SCF_DO_SUBSTR 0x0400
647 #define SCF_DO_STCLASS_AND 0x0800
648 #define SCF_DO_STCLASS_OR 0x1000
649 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
650 #define SCF_WHILEM_VISITED_POS 0x2000
652 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
653 #define SCF_SEEN_ACCEPT 0x8000
654 #define SCF_TRIE_DOING_RESTUDY 0x10000
655 #define SCF_IN_DEFINE 0x20000
660 #define UTF cBOOL(RExC_utf8)
662 /* The enums for all these are ordered so things work out correctly */
663 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
664 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
665 == REGEX_DEPENDS_CHARSET)
666 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
667 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
668 >= REGEX_UNICODE_CHARSET)
669 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
670 == REGEX_ASCII_RESTRICTED_CHARSET)
671 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
672 >= REGEX_ASCII_RESTRICTED_CHARSET)
673 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
674 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
676 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
678 /* For programs that want to be strictly Unicode compatible by dying if any
679 * attempt is made to match a non-Unicode code point against a Unicode
681 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
683 #define OOB_NAMEDCLASS -1
685 /* There is no code point that is out-of-bounds, so this is problematic. But
686 * its only current use is to initialize a variable that is always set before
688 #define OOB_UNICODE 0xDEADBEEF
690 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
693 /* length of regex to show in messages that don't mark a position within */
694 #define RegexLengthToShowInErrorMessages 127
697 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
698 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
699 * op/pragma/warn/regcomp.
701 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
702 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
704 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
705 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
707 /* The code in this file in places uses one level of recursion with parsing
708 * rebased to an alternate string constructed by us in memory. This can take
709 * the form of something that is completely different from the input, or
710 * something that uses the input as part of the alternate. In the first case,
711 * there should be no possibility of an error, as we are in complete control of
712 * the alternate string. But in the second case we don't completely control
713 * the input portion, so there may be errors in that. Here's an example:
715 * is handled specially because \x{df} folds to a sequence of more than one
716 * character: 'ss'. What is done is to create and parse an alternate string,
717 * which looks like this:
718 * /(?:\x{DF}|[abc\x{DF}def])/ui
719 * where it uses the input unchanged in the middle of something it constructs,
720 * which is a branch for the DF outside the character class, and clustering
721 * parens around the whole thing. (It knows enough to skip the DF inside the
722 * class while in this substitute parse.) 'abc' and 'def' may have errors that
723 * need to be reported. The general situation looks like this:
725 * |<------- identical ------>|
727 * Input: ---------------------------------------------------------------
728 * Constructed: ---------------------------------------------------
730 * |<------- identical ------>|
732 * sI..eI is the portion of the input pattern we are concerned with here.
733 * sC..EC is the constructed substitute parse string.
734 * sC..tC is constructed by us
735 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
736 * In the diagram, these are vertically aligned.
737 * eC..EC is also constructed by us.
738 * xC is the position in the substitute parse string where we found a
740 * xI is the position in the original pattern corresponding to xC.
742 * We want to display a message showing the real input string. Thus we need to
743 * translate from xC to xI. We know that xC >= tC, since the portion of the
744 * string sC..tC has been constructed by us, and so shouldn't have errors. We
746 * xI = tI + (xC - tC)
748 * When the substitute parse is constructed, the code needs to set:
751 * RExC_copy_start_in_input (tI)
752 * RExC_copy_start_in_constructed (tC)
753 * and restore them when done.
755 * During normal processing of the input pattern, both
756 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
757 * sI, so that xC equals xI.
760 #define sI RExC_precomp
761 #define eI RExC_precomp_end
762 #define sC RExC_start
764 #define tI RExC_copy_start_in_input
765 #define tC RExC_copy_start_in_constructed
766 #define xI(xC) (tI + (xC - tC))
767 #define xI_offset(xC) (xI(xC) - sI)
769 #define REPORT_LOCATION_ARGS(xC) \
771 (xI(xC) > eI) /* Don't run off end */ \
772 ? eI - sI /* Length before the <--HERE */ \
773 : ((xI_offset(xC) >= 0) \
775 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
776 IVdf " trying to output message for " \
778 __FILE__, __LINE__, (IV) xI_offset(xC), \
779 ((int) (eC - sC)), sC), 0)), \
780 sI), /* The input pattern printed up to the <--HERE */ \
782 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
783 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
785 /* Used to point after bad bytes for an error message, but avoid skipping
786 * past a nul byte. */
787 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
789 /* Set up to clean up after our imminent demise */
790 #define PREPARE_TO_DIE \
793 SAVEFREESV(RExC_rx_sv); \
794 if (RExC_open_parens) \
795 SAVEFREEPV(RExC_open_parens); \
796 if (RExC_close_parens) \
797 SAVEFREEPV(RExC_close_parens); \
801 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
802 * arg. Show regex, up to a maximum length. If it's too long, chop and add
805 #define _FAIL(code) STMT_START { \
806 const char *ellipses = ""; \
807 IV len = RExC_precomp_end - RExC_precomp; \
810 if (len > RegexLengthToShowInErrorMessages) { \
811 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
812 len = RegexLengthToShowInErrorMessages - 10; \
818 #define FAIL(msg) _FAIL( \
819 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
820 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
822 #define FAIL2(msg,arg) _FAIL( \
823 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
824 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
826 #define FAIL3(msg,arg1,arg2) _FAIL( \
827 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
828 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
831 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
833 #define Simple_vFAIL(m) STMT_START { \
834 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
835 m, REPORT_LOCATION_ARGS(RExC_parse)); \
839 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
841 #define vFAIL(m) STMT_START { \
847 * Like Simple_vFAIL(), but accepts two arguments.
849 #define Simple_vFAIL2(m,a1) STMT_START { \
850 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
851 REPORT_LOCATION_ARGS(RExC_parse)); \
855 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
857 #define vFAIL2(m,a1) STMT_START { \
859 Simple_vFAIL2(m, a1); \
864 * Like Simple_vFAIL(), but accepts three arguments.
866 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
867 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
868 REPORT_LOCATION_ARGS(RExC_parse)); \
872 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
874 #define vFAIL3(m,a1,a2) STMT_START { \
876 Simple_vFAIL3(m, a1, a2); \
880 * Like Simple_vFAIL(), but accepts four arguments.
882 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
883 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
884 REPORT_LOCATION_ARGS(RExC_parse)); \
887 #define vFAIL4(m,a1,a2,a3) STMT_START { \
889 Simple_vFAIL4(m, a1, a2, a3); \
892 /* A specialized version of vFAIL2 that works with UTF8f */
893 #define vFAIL2utf8f(m, a1) STMT_START { \
895 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
896 REPORT_LOCATION_ARGS(RExC_parse)); \
899 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
901 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
902 REPORT_LOCATION_ARGS(RExC_parse)); \
905 /* Setting this to NULL is a signal to not output warnings */
906 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
908 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
909 RExC_copy_start_in_constructed = NULL; \
911 #define RESTORE_WARNINGS \
912 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
914 /* Since a warning can be generated multiple times as the input is reparsed, we
915 * output it the first time we come to that point in the parse, but suppress it
916 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
917 * generate any warnings */
918 #define TO_OUTPUT_WARNINGS(loc) \
919 ( RExC_copy_start_in_constructed \
920 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
922 /* After we've emitted a warning, we save the position in the input so we don't
924 #define UPDATE_WARNINGS_LOC(loc) \
926 if (TO_OUTPUT_WARNINGS(loc)) { \
927 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
932 /* 'warns' is the output of the packWARNx macro used in 'code' */
933 #define _WARN_HELPER(loc, warns, code) \
935 if (! RExC_copy_start_in_constructed) { \
936 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
937 " expected at '%s'", \
938 __FILE__, __LINE__, loc); \
940 if (TO_OUTPUT_WARNINGS(loc)) { \
944 UPDATE_WARNINGS_LOC(loc); \
948 /* m is not necessarily a "literal string", in this macro */
949 #define warn_non_literal_string(loc, packed_warn, m) \
950 _WARN_HELPER(loc, packed_warn, \
951 Perl_warner(aTHX_ packed_warn, \
952 "%s" REPORT_LOCATION, \
953 m, REPORT_LOCATION_ARGS(loc)))
954 #define reg_warn_non_literal_string(loc, m) \
955 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
957 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
960 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
961 Newx(format, format_size, char); \
962 my_strlcpy(format, m, format_size); \
963 my_strlcat(format, REPORT_LOCATION, format_size); \
964 SAVEFREEPV(format); \
965 _WARN_HELPER(loc, packwarn, \
966 Perl_ck_warner(aTHX_ packwarn, \
968 a1, REPORT_LOCATION_ARGS(loc))); \
971 #define ckWARNreg(loc,m) \
972 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
973 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
975 REPORT_LOCATION_ARGS(loc)))
977 #define vWARN(loc, m) \
978 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
979 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
981 REPORT_LOCATION_ARGS(loc))) \
983 #define vWARN_dep(loc, m) \
984 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
985 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
987 REPORT_LOCATION_ARGS(loc)))
989 #define ckWARNdep(loc,m) \
990 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
991 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
993 REPORT_LOCATION_ARGS(loc)))
995 #define ckWARNregdep(loc,m) \
996 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
997 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1000 REPORT_LOCATION_ARGS(loc)))
1002 #define ckWARN2reg_d(loc,m, a1) \
1003 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1004 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1005 m REPORT_LOCATION, \
1006 a1, REPORT_LOCATION_ARGS(loc)))
1008 #define ckWARN2reg(loc, m, a1) \
1009 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1010 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1011 m REPORT_LOCATION, \
1012 a1, REPORT_LOCATION_ARGS(loc)))
1014 #define vWARN3(loc, m, a1, a2) \
1015 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1016 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1017 m REPORT_LOCATION, \
1018 a1, a2, REPORT_LOCATION_ARGS(loc)))
1020 #define ckWARN3reg(loc, m, a1, a2) \
1021 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1022 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1023 m REPORT_LOCATION, \
1025 REPORT_LOCATION_ARGS(loc)))
1027 #define vWARN4(loc, m, a1, a2, a3) \
1028 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1029 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1030 m REPORT_LOCATION, \
1032 REPORT_LOCATION_ARGS(loc)))
1034 #define ckWARN4reg(loc, m, a1, a2, a3) \
1035 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1036 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1037 m REPORT_LOCATION, \
1039 REPORT_LOCATION_ARGS(loc)))
1041 #define vWARN5(loc, m, a1, a2, a3, a4) \
1042 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1043 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1044 m REPORT_LOCATION, \
1046 REPORT_LOCATION_ARGS(loc)))
1048 #define ckWARNexperimental(loc, class, m) \
1050 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1051 RExC_warned_ ## class = 1; \
1052 _WARN_HELPER(loc, packWARN(class), \
1053 Perl_ck_warner_d(aTHX_ packWARN(class), \
1054 m REPORT_LOCATION, \
1055 REPORT_LOCATION_ARGS(loc)));\
1059 /* Convert between a pointer to a node and its offset from the beginning of the
1061 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1062 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1064 /* Macros for recording node offsets. 20001227 mjd@plover.com
1065 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1066 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1067 * Element 0 holds the number n.
1068 * Position is 1 indexed.
1070 #ifndef RE_TRACK_PATTERN_OFFSETS
1071 #define Set_Node_Offset_To_R(offset,byte)
1072 #define Set_Node_Offset(node,byte)
1073 #define Set_Cur_Node_Offset
1074 #define Set_Node_Length_To_R(node,len)
1075 #define Set_Node_Length(node,len)
1076 #define Set_Node_Cur_Length(node,start)
1077 #define Node_Offset(n)
1078 #define Node_Length(n)
1079 #define Set_Node_Offset_Length(node,offset,len)
1080 #define ProgLen(ri) ri->u.proglen
1081 #define SetProgLen(ri,x) ri->u.proglen = x
1082 #define Track_Code(code)
1084 #define ProgLen(ri) ri->u.offsets[0]
1085 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1086 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
1087 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
1088 __LINE__, (int)(offset), (int)(byte))); \
1089 if((offset) < 0) { \
1090 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
1093 RExC_offsets[2*(offset)-1] = (byte); \
1097 #define Set_Node_Offset(node,byte) \
1098 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1099 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1101 #define Set_Node_Length_To_R(node,len) STMT_START { \
1102 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1103 __LINE__, (int)(node), (int)(len))); \
1105 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1108 RExC_offsets[2*(node)] = (len); \
1112 #define Set_Node_Length(node,len) \
1113 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1114 #define Set_Node_Cur_Length(node, start) \
1115 Set_Node_Length(node, RExC_parse - start)
1117 /* Get offsets and lengths */
1118 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1119 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1121 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1122 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1123 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1126 #define Track_Code(code) STMT_START { code } STMT_END
1129 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1130 #define EXPERIMENTAL_INPLACESCAN
1131 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1135 Perl_re_printf(pTHX_ const char *fmt, ...)
1139 PerlIO *f= Perl_debug_log;
1140 PERL_ARGS_ASSERT_RE_PRINTF;
1142 result = PerlIO_vprintf(f, fmt, ap);
1148 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1152 PerlIO *f= Perl_debug_log;
1153 PERL_ARGS_ASSERT_RE_INDENTF;
1154 va_start(ap, depth);
1155 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1156 result = PerlIO_vprintf(f, fmt, ap);
1160 #endif /* DEBUGGING */
1162 #define DEBUG_RExC_seen() \
1163 DEBUG_OPTIMISE_MORE_r({ \
1164 Perl_re_printf( aTHX_ "RExC_seen: "); \
1166 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1167 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1169 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1170 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1172 if (RExC_seen & REG_GPOS_SEEN) \
1173 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1175 if (RExC_seen & REG_RECURSE_SEEN) \
1176 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1178 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1179 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1181 if (RExC_seen & REG_VERBARG_SEEN) \
1182 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1184 if (RExC_seen & REG_CUTGROUP_SEEN) \
1185 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1187 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1188 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1190 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1191 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1193 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1194 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1196 Perl_re_printf( aTHX_ "\n"); \
1199 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1200 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1205 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1206 const char *close_str)
1211 Perl_re_printf( aTHX_ "%s", open_str);
1212 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1213 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1214 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1215 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1216 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1217 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1218 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1219 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1221 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1227 Perl_re_printf( aTHX_ "%s", close_str);
1232 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1233 U32 depth, int is_inf)
1235 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1237 DEBUG_OPTIMISE_MORE_r({
1240 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1244 (IV)data->pos_delta,
1248 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1250 Perl_re_printf( aTHX_
1251 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1253 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1254 is_inf ? "INF " : ""
1257 if (data->last_found) {
1259 Perl_re_printf(aTHX_
1260 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1261 SvPVX_const(data->last_found),
1263 (IV)data->last_start_min,
1264 (IV)data->last_start_max
1267 for (i = 0; i < 2; i++) {
1268 Perl_re_printf(aTHX_
1269 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1270 data->cur_is_floating == i ? "*" : "",
1271 i ? "Float" : "Fixed",
1272 SvPVX_const(data->substrs[i].str),
1273 (IV)data->substrs[i].min_offset,
1274 (IV)data->substrs[i].max_offset
1276 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1280 Perl_re_printf( aTHX_ "\n");
1286 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1287 regnode *scan, U32 depth, U32 flags)
1289 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1296 Next = regnext(scan);
1297 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1298 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1301 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1302 Next ? (REG_NODE_NUM(Next)) : 0 );
1303 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1304 Perl_re_printf( aTHX_ "\n");
1309 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1310 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1312 # define DEBUG_PEEP(str, scan, depth, flags) \
1313 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1316 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1317 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1321 /* =========================================================
1322 * BEGIN edit_distance stuff.
1324 * This calculates how many single character changes of any type are needed to
1325 * transform a string into another one. It is taken from version 3.1 of
1327 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1330 /* Our unsorted dictionary linked list. */
1331 /* Note we use UVs, not chars. */
1336 struct dictionary* next;
1338 typedef struct dictionary item;
1341 PERL_STATIC_INLINE item*
1342 push(UV key, item* curr)
1345 Newx(head, 1, item);
1353 PERL_STATIC_INLINE item*
1354 find(item* head, UV key)
1356 item* iterator = head;
1358 if (iterator->key == key){
1361 iterator = iterator->next;
1367 PERL_STATIC_INLINE item*
1368 uniquePush(item* head, UV key)
1370 item* iterator = head;
1373 if (iterator->key == key) {
1376 iterator = iterator->next;
1379 return push(key, head);
1382 PERL_STATIC_INLINE void
1383 dict_free(item* head)
1385 item* iterator = head;
1388 item* temp = iterator;
1389 iterator = iterator->next;
1396 /* End of Dictionary Stuff */
1398 /* All calculations/work are done here */
1400 S_edit_distance(const UV* src,
1402 const STRLEN x, /* length of src[] */
1403 const STRLEN y, /* length of tgt[] */
1404 const SSize_t maxDistance
1408 UV swapCount, swapScore, targetCharCount, i, j;
1410 UV score_ceil = x + y;
1412 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1414 /* intialize matrix start values */
1415 Newx(scores, ( (x + 2) * (y + 2)), UV);
1416 scores[0] = score_ceil;
1417 scores[1 * (y + 2) + 0] = score_ceil;
1418 scores[0 * (y + 2) + 1] = score_ceil;
1419 scores[1 * (y + 2) + 1] = 0;
1420 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1425 for (i=1;i<=x;i++) {
1427 head = uniquePush(head, src[i]);
1428 scores[(i+1) * (y + 2) + 1] = i;
1429 scores[(i+1) * (y + 2) + 0] = score_ceil;
1432 for (j=1;j<=y;j++) {
1435 head = uniquePush(head, tgt[j]);
1436 scores[1 * (y + 2) + (j + 1)] = j;
1437 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1440 targetCharCount = find(head, tgt[j-1])->value;
1441 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1443 if (src[i-1] != tgt[j-1]){
1444 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1448 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1452 find(head, src[i-1])->value = i;
1456 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1459 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1463 /* END of edit_distance() stuff
1464 * ========================================================= */
1466 /* Mark that we cannot extend a found fixed substring at this point.
1467 Update the longest found anchored substring or the longest found
1468 floating substrings if needed. */
1471 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1472 SSize_t *minlenp, int is_inf)
1474 const STRLEN l = CHR_SVLEN(data->last_found);
1475 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1476 const STRLEN old_l = CHR_SVLEN(longest_sv);
1477 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1479 PERL_ARGS_ASSERT_SCAN_COMMIT;
1481 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1482 const U8 i = data->cur_is_floating;
1483 SvSetMagicSV(longest_sv, data->last_found);
1484 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1487 data->substrs[0].max_offset = data->substrs[0].min_offset;
1489 data->substrs[1].max_offset =
1493 ? data->last_start_max
1494 /* temporary underflow guard for 5.32 */
1495 : data->pos_delta < 0 ? OPTIMIZE_INFTY
1496 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1498 : data->pos_min + data->pos_delta));
1501 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1502 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1503 data->substrs[i].minlenp = minlenp;
1504 data->substrs[i].lookbehind = 0;
1507 SvCUR_set(data->last_found, 0);
1509 SV * const sv = data->last_found;
1510 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1511 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1516 data->last_end = -1;
1517 data->flags &= ~SF_BEFORE_EOL;
1518 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1521 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1522 * list that describes which code points it matches */
1525 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1527 /* Set the SSC 'ssc' to match an empty string or any code point */
1529 PERL_ARGS_ASSERT_SSC_ANYTHING;
1531 assert(is_ANYOF_SYNTHETIC(ssc));
1533 /* mortalize so won't leak */
1534 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1535 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1539 S_ssc_is_anything(const regnode_ssc *ssc)
1541 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1542 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1543 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1544 * in any way, so there's no point in using it */
1549 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1551 assert(is_ANYOF_SYNTHETIC(ssc));
1553 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1557 /* See if the list consists solely of the range 0 - Infinity */
1558 invlist_iterinit(ssc->invlist);
1559 ret = invlist_iternext(ssc->invlist, &start, &end)
1563 invlist_iterfinish(ssc->invlist);
1569 /* If e.g., both \w and \W are set, matches everything */
1570 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1572 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1573 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1583 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1585 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1586 * string, any code point, or any posix class under locale */
1588 PERL_ARGS_ASSERT_SSC_INIT;
1590 Zero(ssc, 1, regnode_ssc);
1591 set_ANYOF_SYNTHETIC(ssc);
1592 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1595 /* If any portion of the regex is to operate under locale rules that aren't
1596 * fully known at compile time, initialization includes it. The reason
1597 * this isn't done for all regexes is that the optimizer was written under
1598 * the assumption that locale was all-or-nothing. Given the complexity and
1599 * lack of documentation in the optimizer, and that there are inadequate
1600 * test cases for locale, many parts of it may not work properly, it is
1601 * safest to avoid locale unless necessary. */
1602 if (RExC_contains_locale) {
1603 ANYOF_POSIXL_SETALL(ssc);
1606 ANYOF_POSIXL_ZERO(ssc);
1611 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1612 const regnode_ssc *ssc)
1614 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1615 * to the list of code points matched, and locale posix classes; hence does
1616 * not check its flags) */
1621 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1623 assert(is_ANYOF_SYNTHETIC(ssc));
1625 invlist_iterinit(ssc->invlist);
1626 ret = invlist_iternext(ssc->invlist, &start, &end)
1630 invlist_iterfinish(ssc->invlist);
1636 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1643 #define INVLIST_INDEX 0
1644 #define ONLY_LOCALE_MATCHES_INDEX 1
1645 #define DEFERRED_USER_DEFINED_INDEX 2
1648 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1649 const regnode_charclass* const node)
1651 /* Returns a mortal inversion list defining which code points are matched
1652 * by 'node', which is of type ANYOF. Handles complementing the result if
1653 * appropriate. If some code points aren't knowable at this time, the
1654 * returned list must, and will, contain every code point that is a
1658 SV* only_utf8_locale_invlist = NULL;
1660 const U32 n = ARG(node);
1661 bool new_node_has_latin1 = FALSE;
1662 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1664 : ANYOF_FLAGS(node);
1666 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1668 /* Look at the data structure created by S_set_ANYOF_arg() */
1669 if (n != ANYOF_ONLY_HAS_BITMAP) {
1670 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1671 AV * const av = MUTABLE_AV(SvRV(rv));
1672 SV **const ary = AvARRAY(av);
1673 assert(RExC_rxi->data->what[n] == 's');
1675 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1677 /* Here there are things that won't be known until runtime -- we
1678 * have to assume it could be anything */
1679 invlist = sv_2mortal(_new_invlist(1));
1680 return _add_range_to_invlist(invlist, 0, UV_MAX);
1682 else if (ary[INVLIST_INDEX]) {
1684 /* Use the node's inversion list */
1685 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1688 /* Get the code points valid only under UTF-8 locales */
1689 if ( (flags & ANYOFL_FOLD)
1690 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1692 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1697 invlist = sv_2mortal(_new_invlist(0));
1700 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1701 * code points, and an inversion list for the others, but if there are code
1702 * points that should match only conditionally on the target string being
1703 * UTF-8, those are placed in the inversion list, and not the bitmap.
1704 * Since there are circumstances under which they could match, they are
1705 * included in the SSC. But if the ANYOF node is to be inverted, we have
1706 * to exclude them here, so that when we invert below, the end result
1707 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1708 * have to do this here before we add the unconditionally matched code
1710 if (flags & ANYOF_INVERT) {
1711 _invlist_intersection_complement_2nd(invlist,
1716 /* Add in the points from the bit map */
1717 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1718 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1719 if (ANYOF_BITMAP_TEST(node, i)) {
1720 unsigned int start = i++;
1722 for (; i < NUM_ANYOF_CODE_POINTS
1723 && ANYOF_BITMAP_TEST(node, i); ++i)
1727 invlist = _add_range_to_invlist(invlist, start, i-1);
1728 new_node_has_latin1 = TRUE;
1733 /* If this can match all upper Latin1 code points, have to add them
1734 * as well. But don't add them if inverting, as when that gets done below,
1735 * it would exclude all these characters, including the ones it shouldn't
1736 * that were added just above */
1737 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1738 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1740 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1743 /* Similarly for these */
1744 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1745 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1748 if (flags & ANYOF_INVERT) {
1749 _invlist_invert(invlist);
1751 else if (flags & ANYOFL_FOLD) {
1752 if (new_node_has_latin1) {
1754 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1755 * the locale. We can skip this if there are no 0-255 at all. */
1756 _invlist_union(invlist, PL_Latin1, &invlist);
1758 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1759 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1762 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1763 invlist = add_cp_to_invlist(invlist, 'I');
1765 if (_invlist_contains_cp(invlist,
1766 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1768 invlist = add_cp_to_invlist(invlist, 'i');
1773 /* Similarly add the UTF-8 locale possible matches. These have to be
1774 * deferred until after the non-UTF-8 locale ones are taken care of just
1775 * above, or it leads to wrong results under ANYOF_INVERT */
1776 if (only_utf8_locale_invlist) {
1777 _invlist_union_maybe_complement_2nd(invlist,
1778 only_utf8_locale_invlist,
1779 flags & ANYOF_INVERT,
1786 /* These two functions currently do the exact same thing */
1787 #define ssc_init_zero ssc_init
1789 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1790 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1792 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1793 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1794 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1797 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1798 const regnode_charclass *and_with)
1800 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1801 * another SSC or a regular ANYOF class. Can create false positives. */
1804 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1806 : ANYOF_FLAGS(and_with);
1809 PERL_ARGS_ASSERT_SSC_AND;
1811 assert(is_ANYOF_SYNTHETIC(ssc));
1813 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1814 * the code point inversion list and just the relevant flags */
1815 if (is_ANYOF_SYNTHETIC(and_with)) {
1816 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1817 anded_flags = and_with_flags;
1819 /* XXX This is a kludge around what appears to be deficiencies in the
1820 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1821 * there are paths through the optimizer where it doesn't get weeded
1822 * out when it should. And if we don't make some extra provision for
1823 * it like the code just below, it doesn't get added when it should.
1824 * This solution is to add it only when AND'ing, which is here, and
1825 * only when what is being AND'ed is the pristine, original node
1826 * matching anything. Thus it is like adding it to ssc_anything() but
1827 * only when the result is to be AND'ed. Probably the same solution
1828 * could be adopted for the same problem we have with /l matching,
1829 * which is solved differently in S_ssc_init(), and that would lead to
1830 * fewer false positives than that solution has. But if this solution
1831 * creates bugs, the consequences are only that a warning isn't raised
1832 * that should be; while the consequences for having /l bugs is
1833 * incorrect matches */
1834 if (ssc_is_anything((regnode_ssc *)and_with)) {
1835 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1839 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1840 if (OP(and_with) == ANYOFD) {
1841 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1844 anded_flags = and_with_flags
1845 &( ANYOF_COMMON_FLAGS
1846 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1847 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1848 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1850 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1855 ANYOF_FLAGS(ssc) &= anded_flags;
1857 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1858 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1859 * 'and_with' may be inverted. When not inverted, we have the situation of
1861 * (C1 | P1) & (C2 | P2)
1862 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1863 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1864 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1865 * <= ((C1 & C2) | P1 | P2)
1866 * Alternatively, the last few steps could be:
1867 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1868 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1869 * <= (C1 | C2 | (P1 & P2))
1870 * We favor the second approach if either P1 or P2 is non-empty. This is
1871 * because these components are a barrier to doing optimizations, as what
1872 * they match cannot be known until the moment of matching as they are
1873 * dependent on the current locale, 'AND"ing them likely will reduce or
1875 * But we can do better if we know that C1,P1 are in their initial state (a
1876 * frequent occurrence), each matching everything:
1877 * (<everything>) & (C2 | P2) = C2 | P2
1878 * Similarly, if C2,P2 are in their initial state (again a frequent
1879 * occurrence), the result is a no-op
1880 * (C1 | P1) & (<everything>) = C1 | P1
1883 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1884 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1885 * <= (C1 & ~C2) | (P1 & ~P2)
1888 if ((and_with_flags & ANYOF_INVERT)
1889 && ! is_ANYOF_SYNTHETIC(and_with))
1893 ssc_intersection(ssc,
1895 FALSE /* Has already been inverted */
1898 /* If either P1 or P2 is empty, the intersection will be also; can skip
1900 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1901 ANYOF_POSIXL_ZERO(ssc);
1903 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1905 /* Note that the Posix class component P from 'and_with' actually
1907 * P = Pa | Pb | ... | Pn
1908 * where each component is one posix class, such as in [\w\s].
1910 * ~P = ~(Pa | Pb | ... | Pn)
1911 * = ~Pa & ~Pb & ... & ~Pn
1912 * <= ~Pa | ~Pb | ... | ~Pn
1913 * The last is something we can easily calculate, but unfortunately
1914 * is likely to have many false positives. We could do better
1915 * in some (but certainly not all) instances if two classes in
1916 * P have known relationships. For example
1917 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1919 * :lower: & :print: = :lower:
1920 * And similarly for classes that must be disjoint. For example,
1921 * since \s and \w can have no elements in common based on rules in
1922 * the POSIX standard,
1923 * \w & ^\S = nothing
1924 * Unfortunately, some vendor locales do not meet the Posix
1925 * standard, in particular almost everything by Microsoft.
1926 * The loop below just changes e.g., \w into \W and vice versa */
1928 regnode_charclass_posixl temp;
1929 int add = 1; /* To calculate the index of the complement */
1931 Zero(&temp, 1, regnode_charclass_posixl);
1932 ANYOF_POSIXL_ZERO(&temp);
1933 for (i = 0; i < ANYOF_MAX; i++) {
1935 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1936 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1938 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1939 ANYOF_POSIXL_SET(&temp, i + add);
1941 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1943 ANYOF_POSIXL_AND(&temp, ssc);
1945 } /* else ssc already has no posixes */
1946 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1947 in its initial state */
1948 else if (! is_ANYOF_SYNTHETIC(and_with)
1949 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1951 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1952 * copy it over 'ssc' */
1953 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1954 if (is_ANYOF_SYNTHETIC(and_with)) {
1955 StructCopy(and_with, ssc, regnode_ssc);
1958 ssc->invlist = anded_cp_list;
1959 ANYOF_POSIXL_ZERO(ssc);
1960 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1961 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1965 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1966 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1968 /* One or the other of P1, P2 is non-empty. */
1969 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1970 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1972 ssc_union(ssc, anded_cp_list, FALSE);
1974 else { /* P1 = P2 = empty */
1975 ssc_intersection(ssc, anded_cp_list, FALSE);
1981 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1982 const regnode_charclass *or_with)
1984 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1985 * another SSC or a regular ANYOF class. Can create false positives if
1986 * 'or_with' is to be inverted. */
1990 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1992 : ANYOF_FLAGS(or_with);
1994 PERL_ARGS_ASSERT_SSC_OR;
1996 assert(is_ANYOF_SYNTHETIC(ssc));
1998 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1999 * the code point inversion list and just the relevant flags */
2000 if (is_ANYOF_SYNTHETIC(or_with)) {
2001 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2002 ored_flags = or_with_flags;
2005 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2006 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2007 if (OP(or_with) != ANYOFD) {
2010 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2011 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2012 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2014 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2019 ANYOF_FLAGS(ssc) |= ored_flags;
2021 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2022 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2023 * 'or_with' may be inverted. When not inverted, we have the simple
2024 * situation of computing:
2025 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2026 * If P1|P2 yields a situation with both a class and its complement are
2027 * set, like having both \w and \W, this matches all code points, and we
2028 * can delete these from the P component of the ssc going forward. XXX We
2029 * might be able to delete all the P components, but I (khw) am not certain
2030 * about this, and it is better to be safe.
2033 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2034 * <= (C1 | P1) | ~C2
2035 * <= (C1 | ~C2) | P1
2036 * (which results in actually simpler code than the non-inverted case)
2039 if ((or_with_flags & ANYOF_INVERT)
2040 && ! is_ANYOF_SYNTHETIC(or_with))
2042 /* We ignore P2, leaving P1 going forward */
2043 } /* else Not inverted */
2044 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2045 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2046 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2048 for (i = 0; i < ANYOF_MAX; i += 2) {
2049 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2051 ssc_match_all_cp(ssc);
2052 ANYOF_POSIXL_CLEAR(ssc, i);
2053 ANYOF_POSIXL_CLEAR(ssc, i+1);
2061 FALSE /* Already has been inverted */
2066 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2068 PERL_ARGS_ASSERT_SSC_UNION;
2070 assert(is_ANYOF_SYNTHETIC(ssc));
2072 _invlist_union_maybe_complement_2nd(ssc->invlist,
2079 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2081 const bool invert2nd)
2083 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2085 assert(is_ANYOF_SYNTHETIC(ssc));
2087 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2094 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2096 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2098 assert(is_ANYOF_SYNTHETIC(ssc));
2100 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2104 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2106 /* AND just the single code point 'cp' into the SSC 'ssc' */
2108 SV* cp_list = _new_invlist(2);
2110 PERL_ARGS_ASSERT_SSC_CP_AND;
2112 assert(is_ANYOF_SYNTHETIC(ssc));
2114 cp_list = add_cp_to_invlist(cp_list, cp);
2115 ssc_intersection(ssc, cp_list,
2116 FALSE /* Not inverted */
2118 SvREFCNT_dec_NN(cp_list);
2122 S_ssc_clear_locale(regnode_ssc *ssc)
2124 /* Set the SSC 'ssc' to not match any locale things */
2125 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2127 assert(is_ANYOF_SYNTHETIC(ssc));
2129 ANYOF_POSIXL_ZERO(ssc);
2130 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2133 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2136 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2138 /* The synthetic start class is used to hopefully quickly winnow down
2139 * places where a pattern could start a match in the target string. If it
2140 * doesn't really narrow things down that much, there isn't much point to
2141 * having the overhead of using it. This function uses some very crude
2142 * heuristics to decide if to use the ssc or not.
2144 * It returns TRUE if 'ssc' rules out more than half what it considers to
2145 * be the "likely" possible matches, but of course it doesn't know what the
2146 * actual things being matched are going to be; these are only guesses
2148 * For /l matches, it assumes that the only likely matches are going to be
2149 * in the 0-255 range, uniformly distributed, so half of that is 127
2150 * For /a and /d matches, it assumes that the likely matches will be just
2151 * the ASCII range, so half of that is 63
2152 * For /u and there isn't anything matching above the Latin1 range, it
2153 * assumes that that is the only range likely to be matched, and uses
2154 * half that as the cut-off: 127. If anything matches above Latin1,
2155 * it assumes that all of Unicode could match (uniformly), except for
2156 * non-Unicode code points and things in the General Category "Other"
2157 * (unassigned, private use, surrogates, controls and formats). This
2158 * is a much large number. */
2160 U32 count = 0; /* Running total of number of code points matched by
2162 UV start, end; /* Start and end points of current range in inversion
2163 XXX outdated. UTF-8 locales are common, what about invert? list */
2164 const U32 max_code_points = (LOC)
2166 : (( ! UNI_SEMANTICS
2167 || invlist_highest(ssc->invlist) < 256)
2170 const U32 max_match = max_code_points / 2;
2172 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2174 invlist_iterinit(ssc->invlist);
2175 while (invlist_iternext(ssc->invlist, &start, &end)) {
2176 if (start >= max_code_points) {
2179 end = MIN(end, max_code_points - 1);
2180 count += end - start + 1;
2181 if (count >= max_match) {
2182 invlist_iterfinish(ssc->invlist);
2192 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2194 /* The inversion list in the SSC is marked mortal; now we need a more
2195 * permanent copy, which is stored the same way that is done in a regular
2196 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2199 SV* invlist = invlist_clone(ssc->invlist, NULL);
2201 PERL_ARGS_ASSERT_SSC_FINALIZE;
2203 assert(is_ANYOF_SYNTHETIC(ssc));
2205 /* The code in this file assumes that all but these flags aren't relevant
2206 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2207 * by the time we reach here */
2208 assert(! (ANYOF_FLAGS(ssc)
2209 & ~( ANYOF_COMMON_FLAGS
2210 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2211 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2213 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2215 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2216 SvREFCNT_dec(invlist);
2218 /* Make sure is clone-safe */
2219 ssc->invlist = NULL;
2221 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2222 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2223 OP(ssc) = ANYOFPOSIXL;
2225 else if (RExC_contains_locale) {
2229 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2232 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2233 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2234 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2235 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2236 ? (TRIE_LIST_CUR( idx ) - 1) \
2242 dump_trie(trie,widecharmap,revcharmap)
2243 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2244 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2246 These routines dump out a trie in a somewhat readable format.
2247 The _interim_ variants are used for debugging the interim
2248 tables that are used to generate the final compressed
2249 representation which is what dump_trie expects.
2251 Part of the reason for their existence is to provide a form
2252 of documentation as to how the different representations function.
2257 Dumps the final compressed table form of the trie to Perl_debug_log.
2258 Used for debugging make_trie().
2262 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2263 AV *revcharmap, U32 depth)
2266 SV *sv=sv_newmortal();
2267 int colwidth= widecharmap ? 6 : 4;
2269 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2271 PERL_ARGS_ASSERT_DUMP_TRIE;
2273 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2274 depth+1, "Match","Base","Ofs" );
2276 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2277 SV ** const tmp = av_fetch( revcharmap, state, 0);
2279 Perl_re_printf( aTHX_ "%*s",
2281 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2282 PL_colors[0], PL_colors[1],
2283 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2284 PERL_PV_ESCAPE_FIRSTCHAR
2289 Perl_re_printf( aTHX_ "\n");
2290 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2292 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2293 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2294 Perl_re_printf( aTHX_ "\n");
2296 for( state = 1 ; state < trie->statecount ; state++ ) {
2297 const U32 base = trie->states[ state ].trans.base;
2299 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2301 if ( trie->states[ state ].wordnum ) {
2302 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2304 Perl_re_printf( aTHX_ "%6s", "" );
2307 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2312 while( ( base + ofs < trie->uniquecharcount ) ||
2313 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2314 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2318 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2320 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2321 if ( ( base + ofs >= trie->uniquecharcount )
2322 && ( base + ofs - trie->uniquecharcount
2324 && trie->trans[ base + ofs
2325 - trie->uniquecharcount ].check == state )
2327 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2328 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2331 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2335 Perl_re_printf( aTHX_ "]");
2338 Perl_re_printf( aTHX_ "\n" );
2340 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2342 for (word=1; word <= trie->wordcount; word++) {
2343 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2344 (int)word, (int)(trie->wordinfo[word].prev),
2345 (int)(trie->wordinfo[word].len));
2347 Perl_re_printf( aTHX_ "\n" );
2350 Dumps a fully constructed but uncompressed trie in list form.
2351 List tries normally only are used for construction when the number of
2352 possible chars (trie->uniquecharcount) is very high.
2353 Used for debugging make_trie().
2356 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2357 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2361 SV *sv=sv_newmortal();
2362 int colwidth= widecharmap ? 6 : 4;
2363 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2365 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2367 /* print out the table precompression. */
2368 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2370 Perl_re_indentf( aTHX_ "%s",
2371 depth+1, "------:-----+-----------------\n" );
2373 for( state=1 ; state < next_alloc ; state ++ ) {
2376 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2377 depth+1, (UV)state );
2378 if ( ! trie->states[ state ].wordnum ) {
2379 Perl_re_printf( aTHX_ "%5s| ","");
2381 Perl_re_printf( aTHX_ "W%4x| ",
2382 trie->states[ state ].wordnum
2385 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2386 SV ** const tmp = av_fetch( revcharmap,
2387 TRIE_LIST_ITEM(state, charid).forid, 0);
2389 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2391 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2393 PL_colors[0], PL_colors[1],
2394 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2395 | PERL_PV_ESCAPE_FIRSTCHAR
2397 TRIE_LIST_ITEM(state, charid).forid,
2398 (UV)TRIE_LIST_ITEM(state, charid).newstate
2401 Perl_re_printf( aTHX_ "\n%*s| ",
2402 (int)((depth * 2) + 14), "");
2405 Perl_re_printf( aTHX_ "\n");
2410 Dumps a fully constructed but uncompressed trie in table form.
2411 This is the normal DFA style state transition table, with a few
2412 twists to facilitate compression later.
2413 Used for debugging make_trie().
2416 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2417 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2422 SV *sv=sv_newmortal();
2423 int colwidth= widecharmap ? 6 : 4;
2424 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2426 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2429 print out the table precompression so that we can do a visual check
2430 that they are identical.
2433 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2435 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2436 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2438 Perl_re_printf( aTHX_ "%*s",
2440 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2441 PL_colors[0], PL_colors[1],
2442 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2443 PERL_PV_ESCAPE_FIRSTCHAR
2449 Perl_re_printf( aTHX_ "\n");
2450 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2452 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2453 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2456 Perl_re_printf( aTHX_ "\n" );
2458 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2460 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2462 (UV)TRIE_NODENUM( state ) );
2464 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2465 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2467 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2469 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2471 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2472 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2473 (UV)trie->trans[ state ].check );
2475 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2476 (UV)trie->trans[ state ].check,
2477 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2485 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2486 startbranch: the first branch in the whole branch sequence
2487 first : start branch of sequence of branch-exact nodes.
2488 May be the same as startbranch
2489 last : Thing following the last branch.
2490 May be the same as tail.
2491 tail : item following the branch sequence
2492 count : words in the sequence
2493 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2494 depth : indent depth
2496 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2498 A trie is an N'ary tree where the branches are determined by digital
2499 decomposition of the key. IE, at the root node you look up the 1st character and
2500 follow that branch repeat until you find the end of the branches. Nodes can be
2501 marked as "accepting" meaning they represent a complete word. Eg:
2505 would convert into the following structure. Numbers represent states, letters
2506 following numbers represent valid transitions on the letter from that state, if
2507 the number is in square brackets it represents an accepting state, otherwise it
2508 will be in parenthesis.
2510 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2514 (1) +-i->(6)-+-s->[7]
2516 +-s->(3)-+-h->(4)-+-e->[5]
2518 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2520 This shows that when matching against the string 'hers' we will begin at state 1
2521 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2522 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2523 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2524 single traverse. We store a mapping from accepting to state to which word was
2525 matched, and then when we have multiple possibilities we try to complete the
2526 rest of the regex in the order in which they occurred in the alternation.
2528 The only prior NFA like behaviour that would be changed by the TRIE support is
2529 the silent ignoring of duplicate alternations which are of the form:
2531 / (DUPE|DUPE) X? (?{ ... }) Y /x
2533 Thus EVAL blocks following a trie may be called a different number of times with
2534 and without the optimisation. With the optimisations dupes will be silently
2535 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2536 the following demonstrates:
2538 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2540 which prints out 'word' three times, but
2542 'words'=~/(word|word|word)(?{ print $1 })S/
2544 which doesnt print it out at all. This is due to other optimisations kicking in.
2546 Example of what happens on a structural level:
2548 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2550 1: CURLYM[1] {1,32767}(18)
2561 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2562 and should turn into:
2564 1: CURLYM[1] {1,32767}(18)
2566 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2574 Cases where tail != last would be like /(?foo|bar)baz/:
2584 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2585 and would end up looking like:
2588 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2595 d = uvchr_to_utf8_flags(d, uv, 0);
2597 is the recommended Unicode-aware way of saying
2602 #define TRIE_STORE_REVCHAR(val) \
2605 SV *zlopp = newSV(UTF8_MAXBYTES); \
2606 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2607 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2609 SvCUR_set(zlopp, kapow - flrbbbbb); \
2612 av_push(revcharmap, zlopp); \
2614 char ooooff = (char)val; \
2615 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2619 /* This gets the next character from the input, folding it if not already
2621 #define TRIE_READ_CHAR STMT_START { \
2624 /* if it is UTF then it is either already folded, or does not need \
2626 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2628 else if (folder == PL_fold_latin1) { \
2629 /* This folder implies Unicode rules, which in the range expressible \
2630 * by not UTF is the lower case, with the two exceptions, one of \
2631 * which should have been taken care of before calling this */ \
2632 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2633 uvc = toLOWER_L1(*uc); \
2634 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2637 /* raw data, will be folded later if needed */ \
2645 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2646 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2647 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2648 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2649 TRIE_LIST_LEN( state ) = ging; \
2651 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2652 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2653 TRIE_LIST_CUR( state )++; \
2656 #define TRIE_LIST_NEW(state) STMT_START { \
2657 Newx( trie->states[ state ].trans.list, \
2658 4, reg_trie_trans_le ); \
2659 TRIE_LIST_CUR( state ) = 1; \
2660 TRIE_LIST_LEN( state ) = 4; \
2663 #define TRIE_HANDLE_WORD(state) STMT_START { \
2664 U16 dupe= trie->states[ state ].wordnum; \
2665 regnode * const noper_next = regnext( noper ); \
2668 /* store the word for dumping */ \
2670 if (OP(noper) != NOTHING) \
2671 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2673 tmp = newSVpvn_utf8( "", 0, UTF ); \
2674 av_push( trie_words, tmp ); \
2678 trie->wordinfo[curword].prev = 0; \
2679 trie->wordinfo[curword].len = wordlen; \
2680 trie->wordinfo[curword].accept = state; \
2682 if ( noper_next < tail ) { \
2684 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2686 trie->jump[curword] = (U16)(noper_next - convert); \
2688 jumper = noper_next; \
2690 nextbranch= regnext(cur); \
2694 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2695 /* chain, so that when the bits of chain are later */\
2696 /* linked together, the dups appear in the chain */\
2697 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2698 trie->wordinfo[dupe].prev = curword; \
2700 /* we haven't inserted this word yet. */ \
2701 trie->states[ state ].wordnum = curword; \
2706 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2707 ( ( base + charid >= ucharcount \
2708 && base + charid < ubound \
2709 && state == trie->trans[ base - ucharcount + charid ].check \
2710 && trie->trans[ base - ucharcount + charid ].next ) \
2711 ? trie->trans[ base - ucharcount + charid ].next \
2712 : ( state==1 ? special : 0 ) \
2715 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2717 TRIE_BITMAP_SET(trie, uvc); \
2718 /* store the folded codepoint */ \
2720 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2723 /* store first byte of utf8 representation of */ \
2724 /* variant codepoints */ \
2725 if (! UVCHR_IS_INVARIANT(uvc)) { \
2726 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2731 #define MADE_JUMP_TRIE 2
2732 #define MADE_EXACT_TRIE 4
2735 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2736 regnode *first, regnode *last, regnode *tail,
2737 U32 word_count, U32 flags, U32 depth)
2739 /* first pass, loop through and scan words */
2740 reg_trie_data *trie;
2741 HV *widecharmap = NULL;
2742 AV *revcharmap = newAV();
2748 regnode *jumper = NULL;
2749 regnode *nextbranch = NULL;
2750 regnode *convert = NULL;
2751 U32 *prev_states; /* temp array mapping each state to previous one */
2752 /* we just use folder as a flag in utf8 */
2753 const U8 * folder = NULL;
2755 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2756 * which stands for one trie structure, one hash, optionally followed
2759 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2760 AV *trie_words = NULL;
2761 /* along with revcharmap, this only used during construction but both are
2762 * useful during debugging so we store them in the struct when debugging.
2765 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2766 STRLEN trie_charcount=0;
2768 SV *re_trie_maxbuff;
2769 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2771 PERL_ARGS_ASSERT_MAKE_TRIE;
2773 PERL_UNUSED_ARG(depth);
2777 case EXACT: case EXACT_REQ8: case EXACTL: break;
2781 case EXACTFLU8: folder = PL_fold_latin1; break;
2782 case EXACTF: folder = PL_fold; break;
2783 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2786 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2788 trie->startstate = 1;
2789 trie->wordcount = word_count;
2790 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2791 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2792 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2793 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2794 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2795 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2798 trie_words = newAV();
2801 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2802 assert(re_trie_maxbuff);
2803 if (!SvIOK(re_trie_maxbuff)) {
2804 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2806 DEBUG_TRIE_COMPILE_r({
2807 Perl_re_indentf( aTHX_
2808 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2810 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2811 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2814 /* Find the node we are going to overwrite */
2815 if ( first == startbranch && OP( last ) != BRANCH ) {
2816 /* whole branch chain */
2819 /* branch sub-chain */
2820 convert = NEXTOPER( first );
2823 /* -- First loop and Setup --
2825 We first traverse the branches and scan each word to determine if it
2826 contains widechars, and how many unique chars there are, this is
2827 important as we have to build a table with at least as many columns as we
2830 We use an array of integers to represent the character codes 0..255
2831 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2832 the native representation of the character value as the key and IV's for
2835 *TODO* If we keep track of how many times each character is used we can
2836 remap the columns so that the table compression later on is more
2837 efficient in terms of memory by ensuring the most common value is in the
2838 middle and the least common are on the outside. IMO this would be better
2839 than a most to least common mapping as theres a decent chance the most
2840 common letter will share a node with the least common, meaning the node
2841 will not be compressible. With a middle is most common approach the worst
2842 case is when we have the least common nodes twice.
2846 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2847 regnode *noper = NEXTOPER( cur );
2851 U32 wordlen = 0; /* required init */
2852 STRLEN minchars = 0;
2853 STRLEN maxchars = 0;
2854 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2857 if (OP(noper) == NOTHING) {
2858 /* skip past a NOTHING at the start of an alternation
2859 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2861 * If the next node is not something we are supposed to process
2862 * we will just ignore it due to the condition guarding the
2866 regnode *noper_next= regnext(noper);
2867 if (noper_next < tail)
2872 && ( OP(noper) == flags
2873 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2874 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2875 || OP(noper) == EXACTFUP))))
2877 uc= (U8*)STRING(noper);
2878 e= uc + STR_LEN(noper);
2885 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2886 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2887 regardless of encoding */
2888 if (OP( noper ) == EXACTFUP) {
2889 /* false positives are ok, so just set this */
2890 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2894 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2896 TRIE_CHARCOUNT(trie)++;
2899 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2900 * is in effect. Under /i, this character can match itself, or
2901 * anything that folds to it. If not under /i, it can match just
2902 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2903 * all fold to k, and all are single characters. But some folds
2904 * expand to more than one character, so for example LATIN SMALL
2905 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2906 * the string beginning at 'uc' is 'ffi', it could be matched by
2907 * three characters, or just by the one ligature character. (It
2908 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2909 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2910 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2911 * match.) The trie needs to know the minimum and maximum number
2912 * of characters that could match so that it can use size alone to
2913 * quickly reject many match attempts. The max is simple: it is
2914 * the number of folded characters in this branch (since a fold is
2915 * never shorter than what folds to it. */
2919 /* And the min is equal to the max if not under /i (indicated by
2920 * 'folder' being NULL), or there are no multi-character folds. If
2921 * there is a multi-character fold, the min is incremented just
2922 * once, for the character that folds to the sequence. Each
2923 * character in the sequence needs to be added to the list below of
2924 * characters in the trie, but we count only the first towards the
2925 * min number of characters needed. This is done through the
2926 * variable 'foldlen', which is returned by the macros that look
2927 * for these sequences as the number of bytes the sequence
2928 * occupies. Each time through the loop, we decrement 'foldlen' by
2929 * how many bytes the current char occupies. Only when it reaches
2930 * 0 do we increment 'minchars' or look for another multi-character
2932 if (folder == NULL) {
2935 else if (foldlen > 0) {
2936 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2941 /* See if *uc is the beginning of a multi-character fold. If
2942 * so, we decrement the length remaining to look at, to account
2943 * for the current character this iteration. (We can use 'uc'
2944 * instead of the fold returned by TRIE_READ_CHAR because for
2945 * non-UTF, the latin1_safe macro is smart enough to account
2946 * for all the unfolded characters, and because for UTF, the
2947 * string will already have been folded earlier in the
2948 * compilation process */
2950 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2951 foldlen -= UTF8SKIP(uc);
2954 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2959 /* The current character (and any potential folds) should be added
2960 * to the possible matching characters for this position in this
2964 U8 folded= folder[ (U8) uvc ];
2965 if ( !trie->charmap[ folded ] ) {
2966 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2967 TRIE_STORE_REVCHAR( folded );
2970 if ( !trie->charmap[ uvc ] ) {
2971 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2972 TRIE_STORE_REVCHAR( uvc );
2975 /* store the codepoint in the bitmap, and its folded
2977 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2978 set_bit = 0; /* We've done our bit :-) */
2982 /* XXX We could come up with the list of code points that fold
2983 * to this using PL_utf8_foldclosures, except not for
2984 * multi-char folds, as there may be multiple combinations
2985 * there that could work, which needs to wait until runtime to
2986 * resolve (The comment about LIGATURE FFI above is such an
2991 widecharmap = newHV();
2993 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2996 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2998 if ( !SvTRUE( *svpp ) ) {
2999 sv_setiv( *svpp, ++trie->uniquecharcount );
3000 TRIE_STORE_REVCHAR(uvc);
3003 } /* end loop through characters in this branch of the trie */
3005 /* We take the min and max for this branch and combine to find the min
3006 * and max for all branches processed so far */
3007 if( cur == first ) {
3008 trie->minlen = minchars;
3009 trie->maxlen = maxchars;
3010 } else if (minchars < trie->minlen) {
3011 trie->minlen = minchars;
3012 } else if (maxchars > trie->maxlen) {
3013 trie->maxlen = maxchars;
3015 } /* end first pass */
3016 DEBUG_TRIE_COMPILE_r(
3017 Perl_re_indentf( aTHX_
3018 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3020 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3021 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3022 (int)trie->minlen, (int)trie->maxlen )
3026 We now know what we are dealing with in terms of unique chars and
3027 string sizes so we can calculate how much memory a naive
3028 representation using a flat table will take. If it's over a reasonable
3029 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3030 conservative but potentially much slower representation using an array
3033 At the end we convert both representations into the same compressed
3034 form that will be used in regexec.c for matching with. The latter
3035 is a form that cannot be used to construct with but has memory
3036 properties similar to the list form and access properties similar
3037 to the table form making it both suitable for fast searches and
3038 small enough that its feasable to store for the duration of a program.
3040 See the comment in the code where the compressed table is produced
3041 inplace from the flat tabe representation for an explanation of how
3042 the compression works.
3047 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3050 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3051 > SvIV(re_trie_maxbuff) )
3054 Second Pass -- Array Of Lists Representation
3056 Each state will be represented by a list of charid:state records
3057 (reg_trie_trans_le) the first such element holds the CUR and LEN
3058 points of the allocated array. (See defines above).
3060 We build the initial structure using the lists, and then convert
3061 it into the compressed table form which allows faster lookups
3062 (but cant be modified once converted).
3065 STRLEN transcount = 1;
3067 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3070 trie->states = (reg_trie_state *)
3071 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3072 sizeof(reg_trie_state) );
3076 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3078 regnode *noper = NEXTOPER( cur );
3079 U32 state = 1; /* required init */
3080 U16 charid = 0; /* sanity init */
3081 U32 wordlen = 0; /* required init */
3083 if (OP(noper) == NOTHING) {
3084 regnode *noper_next= regnext(noper);
3085 if (noper_next < tail)
3087 /* we will undo this assignment if noper does not
3088 * point at a trieable type in the else clause of
3089 * the following statement. */
3093 && ( OP(noper) == flags
3094 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3095 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3096 || OP(noper) == EXACTFUP))))
3098 const U8 *uc= (U8*)STRING(noper);
3099 const U8 *e= uc + STR_LEN(noper);
3101 for ( ; uc < e ; uc += len ) {
3106 charid = trie->charmap[ uvc ];
3108 SV** const svpp = hv_fetch( widecharmap,
3115 charid=(U16)SvIV( *svpp );
3118 /* charid is now 0 if we dont know the char read, or
3119 * nonzero if we do */
3126 if ( !trie->states[ state ].trans.list ) {
3127 TRIE_LIST_NEW( state );
3130 check <= TRIE_LIST_USED( state );
3133 if ( TRIE_LIST_ITEM( state, check ).forid
3136 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3141 newstate = next_alloc++;
3142 prev_states[newstate] = state;
3143 TRIE_LIST_PUSH( state, charid, newstate );
3148 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3152 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3153 * on a trieable type. So we need to reset noper back to point at the first regop
3154 * in the branch before we call TRIE_HANDLE_WORD()
3156 noper= NEXTOPER(cur);
3158 TRIE_HANDLE_WORD(state);
3160 } /* end second pass */
3162 /* next alloc is the NEXT state to be allocated */
3163 trie->statecount = next_alloc;
3164 trie->states = (reg_trie_state *)
3165 PerlMemShared_realloc( trie->states,
3167 * sizeof(reg_trie_state) );
3169 /* and now dump it out before we compress it */
3170 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3171 revcharmap, next_alloc,
3175 trie->trans = (reg_trie_trans *)
3176 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3183 for( state=1 ; state < next_alloc ; state ++ ) {
3187 DEBUG_TRIE_COMPILE_MORE_r(
3188 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3192 if (trie->states[state].trans.list) {
3193 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3197 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3198 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3199 if ( forid < minid ) {
3201 } else if ( forid > maxid ) {
3205 if ( transcount < tp + maxid - minid + 1) {
3207 trie->trans = (reg_trie_trans *)
3208 PerlMemShared_realloc( trie->trans,
3210 * sizeof(reg_trie_trans) );
3211 Zero( trie->trans + (transcount / 2),
3215 base = trie->uniquecharcount + tp - minid;
3216 if ( maxid == minid ) {
3218 for ( ; zp < tp ; zp++ ) {
3219 if ( ! trie->trans[ zp ].next ) {
3220 base = trie->uniquecharcount + zp - minid;
3221 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3223 trie->trans[ zp ].check = state;
3229 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3231 trie->trans[ tp ].check = state;
3236 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3237 const U32 tid = base
3238 - trie->uniquecharcount
3239 + TRIE_LIST_ITEM( state, idx ).forid;
3240 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3242 trie->trans[ tid ].check = state;
3244 tp += ( maxid - minid + 1 );
3246 Safefree(trie->states[ state ].trans.list);
3249 DEBUG_TRIE_COMPILE_MORE_r(
3250 Perl_re_printf( aTHX_ " base: %d\n",base);
3253 trie->states[ state ].trans.base=base;
3255 trie->lasttrans = tp + 1;
3259 Second Pass -- Flat Table Representation.
3261 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3262 each. We know that we will need Charcount+1 trans at most to store
3263 the data (one row per char at worst case) So we preallocate both
3264 structures assuming worst case.
3266 We then construct the trie using only the .next slots of the entry
3269 We use the .check field of the first entry of the node temporarily
3270 to make compression both faster and easier by keeping track of how
3271 many non zero fields are in the node.
3273 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3276 There are two terms at use here: state as a TRIE_NODEIDX() which is
3277 a number representing the first entry of the node, and state as a
3278 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3279 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3280 if there are 2 entrys per node. eg:
3288 The table is internally in the right hand, idx form. However as we
3289 also have to deal with the states array which is indexed by nodenum
3290 we have to use TRIE_NODENUM() to convert.
3293 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3296 trie->trans = (reg_trie_trans *)
3297 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3298 * trie->uniquecharcount + 1,
3299 sizeof(reg_trie_trans) );
3300 trie->states = (reg_trie_state *)
3301 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3302 sizeof(reg_trie_state) );
3303 next_alloc = trie->uniquecharcount + 1;
3306 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3308 regnode *noper = NEXTOPER( cur );
3310 U32 state = 1; /* required init */
3312 U16 charid = 0; /* sanity init */
3313 U32 accept_state = 0; /* sanity init */
3315 U32 wordlen = 0; /* required init */
3317 if (OP(noper) == NOTHING) {
3318 regnode *noper_next= regnext(noper);
3319 if (noper_next < tail)
3321 /* we will undo this assignment if noper does not
3322 * point at a trieable type in the else clause of
3323 * the following statement. */
3327 && ( OP(noper) == flags
3328 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3329 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3330 || OP(noper) == EXACTFUP))))
3332 const U8 *uc= (U8*)STRING(noper);
3333 const U8 *e= uc + STR_LEN(noper);
3335 for ( ; uc < e ; uc += len ) {
3340 charid = trie->charmap[ uvc ];
3342 SV* const * const svpp = hv_fetch( widecharmap,
3346 charid = svpp ? (U16)SvIV(*svpp) : 0;
3350 if ( !trie->trans[ state + charid ].next ) {
3351 trie->trans[ state + charid ].next = next_alloc;
3352 trie->trans[ state ].check++;
3353 prev_states[TRIE_NODENUM(next_alloc)]
3354 = TRIE_NODENUM(state);
3355 next_alloc += trie->uniquecharcount;
3357 state = trie->trans[ state + charid ].next;
3359 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3361 /* charid is now 0 if we dont know the char read, or
3362 * nonzero if we do */
3365 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3366 * on a trieable type. So we need to reset noper back to point at the first regop
3367 * in the branch before we call TRIE_HANDLE_WORD().
3369 noper= NEXTOPER(cur);
3371 accept_state = TRIE_NODENUM( state );
3372 TRIE_HANDLE_WORD(accept_state);
3374 } /* end second pass */
3376 /* and now dump it out before we compress it */
3377 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3379 next_alloc, depth+1));
3383 * Inplace compress the table.*
3385 For sparse data sets the table constructed by the trie algorithm will
3386 be mostly 0/FAIL transitions or to put it another way mostly empty.
3387 (Note that leaf nodes will not contain any transitions.)
3389 This algorithm compresses the tables by eliminating most such
3390 transitions, at the cost of a modest bit of extra work during lookup:
3392 - Each states[] entry contains a .base field which indicates the
3393 index in the state[] array wheres its transition data is stored.
3395 - If .base is 0 there are no valid transitions from that node.
3397 - If .base is nonzero then charid is added to it to find an entry in
3400 -If trans[states[state].base+charid].check!=state then the
3401 transition is taken to be a 0/Fail transition. Thus if there are fail
3402 transitions at the front of the node then the .base offset will point
3403 somewhere inside the previous nodes data (or maybe even into a node
3404 even earlier), but the .check field determines if the transition is
3408 The following process inplace converts the table to the compressed
3409 table: We first do not compress the root node 1,and mark all its
3410 .check pointers as 1 and set its .base pointer as 1 as well. This
3411 allows us to do a DFA construction from the compressed table later,
3412 and ensures that any .base pointers we calculate later are greater
3415 - We set 'pos' to indicate the first entry of the second node.
3417 - We then iterate over the columns of the node, finding the first and
3418 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3419 and set the .check pointers accordingly, and advance pos
3420 appropriately and repreat for the next node. Note that when we copy
3421 the next pointers we have to convert them from the original
3422 NODEIDX form to NODENUM form as the former is not valid post
3425 - If a node has no transitions used we mark its base as 0 and do not
3426 advance the pos pointer.
3428 - If a node only has one transition we use a second pointer into the
3429 structure to fill in allocated fail transitions from other states.
3430 This pointer is independent of the main pointer and scans forward
3431 looking for null transitions that are allocated to a state. When it
3432 finds one it writes the single transition into the "hole". If the
3433 pointer doesnt find one the single transition is appended as normal.
3435 - Once compressed we can Renew/realloc the structures to release the
3438 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3439 specifically Fig 3.47 and the associated pseudocode.
3443 const U32 laststate = TRIE_NODENUM( next_alloc );
3446 trie->statecount = laststate;
3448 for ( state = 1 ; state < laststate ; state++ ) {
3450 const U32 stateidx = TRIE_NODEIDX( state );
3451 const U32 o_used = trie->trans[ stateidx ].check;
3452 U32 used = trie->trans[ stateidx ].check;
3453 trie->trans[ stateidx ].check = 0;
3456 used && charid < trie->uniquecharcount;
3459 if ( flag || trie->trans[ stateidx + charid ].next ) {
3460 if ( trie->trans[ stateidx + charid ].next ) {
3462 for ( ; zp < pos ; zp++ ) {
3463 if ( ! trie->trans[ zp ].next ) {
3467 trie->states[ state ].trans.base
3469 + trie->uniquecharcount
3471 trie->trans[ zp ].next
3472 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3474 trie->trans[ zp ].check = state;
3475 if ( ++zp > pos ) pos = zp;
3482 trie->states[ state ].trans.base
3483 = pos + trie->uniquecharcount - charid ;
3485 trie->trans[ pos ].next
3486 = SAFE_TRIE_NODENUM(
3487 trie->trans[ stateidx + charid ].next );
3488 trie->trans[ pos ].check = state;
3493 trie->lasttrans = pos + 1;
3494 trie->states = (reg_trie_state *)
3495 PerlMemShared_realloc( trie->states, laststate
3496 * sizeof(reg_trie_state) );
3497 DEBUG_TRIE_COMPILE_MORE_r(
3498 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3500 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3504 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3507 } /* end table compress */
3509 DEBUG_TRIE_COMPILE_MORE_r(
3510 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3512 (UV)trie->statecount,
3513 (UV)trie->lasttrans)
3515 /* resize the trans array to remove unused space */
3516 trie->trans = (reg_trie_trans *)
3517 PerlMemShared_realloc( trie->trans, trie->lasttrans
3518 * sizeof(reg_trie_trans) );
3520 { /* Modify the program and insert the new TRIE node */
3521 U8 nodetype =(U8)(flags & 0xFF);
3525 regnode *optimize = NULL;
3526 #ifdef RE_TRACK_PATTERN_OFFSETS
3529 U32 mjd_nodelen = 0;
3530 #endif /* RE_TRACK_PATTERN_OFFSETS */
3531 #endif /* DEBUGGING */
3533 This means we convert either the first branch or the first Exact,
3534 depending on whether the thing following (in 'last') is a branch
3535 or not and whther first is the startbranch (ie is it a sub part of
3536 the alternation or is it the whole thing.)
3537 Assuming its a sub part we convert the EXACT otherwise we convert
3538 the whole branch sequence, including the first.
3540 /* Find the node we are going to overwrite */
3541 if ( first != startbranch || OP( last ) == BRANCH ) {
3542 /* branch sub-chain */
3543 NEXT_OFF( first ) = (U16)(last - first);
3544 #ifdef RE_TRACK_PATTERN_OFFSETS
3546 mjd_offset= Node_Offset((convert));
3547 mjd_nodelen= Node_Length((convert));
3550 /* whole branch chain */
3552 #ifdef RE_TRACK_PATTERN_OFFSETS
3555 const regnode *nop = NEXTOPER( convert );
3556 mjd_offset= Node_Offset((nop));
3557 mjd_nodelen= Node_Length((nop));
3561 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3563 (UV)mjd_offset, (UV)mjd_nodelen)
3566 /* But first we check to see if there is a common prefix we can
3567 split out as an EXACT and put in front of the TRIE node. */
3568 trie->startstate= 1;
3569 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3570 /* we want to find the first state that has more than
3571 * one transition, if that state is not the first state
3572 * then we have a common prefix which we can remove.
3575 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3577 I32 first_ofs = -1; /* keeps track of the ofs of the first
3578 transition, -1 means none */
3580 const U32 base = trie->states[ state ].trans.base;
3582 /* does this state terminate an alternation? */
3583 if ( trie->states[state].wordnum )
3586 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3587 if ( ( base + ofs >= trie->uniquecharcount ) &&
3588 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3589 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3591 if ( ++count > 1 ) {
3592 /* we have more than one transition */
3595 /* if this is the first state there is no common prefix
3596 * to extract, so we can exit */
3597 if ( state == 1 ) break;
3598 tmp = av_fetch( revcharmap, ofs, 0);
3599 ch = (U8*)SvPV_nolen_const( *tmp );
3601 /* if we are on count 2 then we need to initialize the
3602 * bitmap, and store the previous char if there was one
3605 /* clear the bitmap */
3606 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3608 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3611 if (first_ofs >= 0) {
3612 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3613 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3615 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3617 Perl_re_printf( aTHX_ "%s", (char*)ch)
3621 /* store the current firstchar in the bitmap */
3622 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3623 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3629 /* This state has only one transition, its transition is part
3630 * of a common prefix - we need to concatenate the char it
3631 * represents to what we have so far. */
3632 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3634 char *ch = SvPV( *tmp, len );
3636 SV *sv=sv_newmortal();
3637 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3639 (UV)state, (UV)first_ofs,
3640 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3641 PL_colors[0], PL_colors[1],
3642 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3643 PERL_PV_ESCAPE_FIRSTCHAR
3648 OP( convert ) = nodetype;
3649 str=STRING(convert);
3650 setSTR_LEN(convert, 0);
3652 assert( ( STR_LEN(convert) + len ) < 256 );
3653 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3659 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3664 trie->prefixlen = (state-1);
3666 regnode *n = convert+NODE_SZ_STR(convert);
3667 assert( NODE_SZ_STR(convert) <= U16_MAX );
3668 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3669 trie->startstate = state;
3670 trie->minlen -= (state - 1);
3671 trie->maxlen -= (state - 1);
3673 /* At least the UNICOS C compiler choked on this
3674 * being argument to DEBUG_r(), so let's just have
3677 #ifdef PERL_EXT_RE_BUILD
3683 regnode *fix = convert;
3684 U32 word = trie->wordcount;
3685 #ifdef RE_TRACK_PATTERN_OFFSETS
3688 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3689 while( ++fix < n ) {
3690 Set_Node_Offset_Length(fix, 0, 0);
3693 SV ** const tmp = av_fetch( trie_words, word, 0 );
3695 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3696 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3698 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3706 NEXT_OFF(convert) = (U16)(tail - convert);
3707 DEBUG_r(optimize= n);
3713 if ( trie->maxlen ) {
3714 NEXT_OFF( convert ) = (U16)(tail - convert);
3715 ARG_SET( convert, data_slot );
3716 /* Store the offset to the first unabsorbed branch in
3717 jump[0], which is otherwise unused by the jump logic.
3718 We use this when dumping a trie and during optimisation. */
3720 trie->jump[0] = (U16)(nextbranch - convert);
3722 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3723 * and there is a bitmap
3724 * and the first "jump target" node we found leaves enough room
3725 * then convert the TRIE node into a TRIEC node, with the bitmap
3726 * embedded inline in the opcode - this is hypothetically faster.
3728 if ( !trie->states[trie->startstate].wordnum
3730 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3732 OP( convert ) = TRIEC;
3733 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3734 PerlMemShared_free(trie->bitmap);
3737 OP( convert ) = TRIE;
3739 /* store the type in the flags */
3740 convert->flags = nodetype;
3744 + regarglen[ OP( convert ) ];
3746 /* XXX We really should free up the resource in trie now,
3747 as we won't use them - (which resources?) dmq */
3749 /* needed for dumping*/
3750 DEBUG_r(if (optimize) {
3751 regnode *opt = convert;
3753 while ( ++opt < optimize) {
3754 Set_Node_Offset_Length(opt, 0, 0);
3757 Try to clean up some of the debris left after the
3760 while( optimize < jumper ) {
3761 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3762 OP( optimize ) = OPTIMIZED;
3763 Set_Node_Offset_Length(optimize, 0, 0);
3766 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3768 } /* end node insert */
3770 /* Finish populating the prev field of the wordinfo array. Walk back
3771 * from each accept state until we find another accept state, and if
3772 * so, point the first word's .prev field at the second word. If the
3773 * second already has a .prev field set, stop now. This will be the
3774 * case either if we've already processed that word's accept state,
3775 * or that state had multiple words, and the overspill words were
3776 * already linked up earlier.
3783 for (word=1; word <= trie->wordcount; word++) {
3785 if (trie->wordinfo[word].prev)
3787 state = trie->wordinfo[word].accept;
3789 state = prev_states[state];
3792 prev = trie->states[state].wordnum;
3796 trie->wordinfo[word].prev = prev;
3798 Safefree(prev_states);
3802 /* and now dump out the compressed format */
3803 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3805 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3807 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3808 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3810 SvREFCNT_dec_NN(revcharmap);
3814 : trie->startstate>1
3820 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3822 /* The Trie is constructed and compressed now so we can build a fail array if
3825 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3827 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3831 We find the fail state for each state in the trie, this state is the longest
3832 proper suffix of the current state's 'word' that is also a proper prefix of
3833 another word in our trie. State 1 represents the word '' and is thus the
3834 default fail state. This allows the DFA not to have to restart after its
3835 tried and failed a word at a given point, it simply continues as though it
3836 had been matching the other word in the first place.
3838 'abcdgu'=~/abcdefg|cdgu/
3839 When we get to 'd' we are still matching the first word, we would encounter
3840 'g' which would fail, which would bring us to the state representing 'd' in
3841 the second word where we would try 'g' and succeed, proceeding to match
3844 /* add a fail transition */
3845 const U32 trie_offset = ARG(source);
3846 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3848 const U32 ucharcount = trie->uniquecharcount;
3849 const U32 numstates = trie->statecount;
3850 const U32 ubound = trie->lasttrans + ucharcount;
3854 U32 base = trie->states[ 1 ].trans.base;
3857 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3859 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3861 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3862 PERL_UNUSED_CONTEXT;
3864 PERL_UNUSED_ARG(depth);
3867 if ( OP(source) == TRIE ) {
3868 struct regnode_1 *op = (struct regnode_1 *)
3869 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3870 StructCopy(source, op, struct regnode_1);
3871 stclass = (regnode *)op;
3873 struct regnode_charclass *op = (struct regnode_charclass *)
3874 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3875 StructCopy(source, op, struct regnode_charclass);
3876 stclass = (regnode *)op;
3878 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3880 ARG_SET( stclass, data_slot );
3881 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3882 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3883 aho->trie=trie_offset;
3884 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3885 Copy( trie->states, aho->states, numstates, reg_trie_state );
3886 Newx( q, numstates, U32);
3887 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3890 /* initialize fail[0..1] to be 1 so that we always have
3891 a valid final fail state */
3892 fail[ 0 ] = fail[ 1 ] = 1;
3894 for ( charid = 0; charid < ucharcount ; charid++ ) {
3895 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3897 q[ q_write ] = newstate;
3898 /* set to point at the root */
3899 fail[ q[ q_write++ ] ]=1;
3902 while ( q_read < q_write) {
3903 const U32 cur = q[ q_read++ % numstates ];
3904 base = trie->states[ cur ].trans.base;
3906 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3907 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3909 U32 fail_state = cur;
3912 fail_state = fail[ fail_state ];
3913 fail_base = aho->states[ fail_state ].trans.base;
3914 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3916 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3917 fail[ ch_state ] = fail_state;
3918 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3920 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3922 q[ q_write++ % numstates] = ch_state;
3926 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3927 when we fail in state 1, this allows us to use the
3928 charclass scan to find a valid start char. This is based on the principle
3929 that theres a good chance the string being searched contains lots of stuff
3930 that cant be a start char.
3932 fail[ 0 ] = fail[ 1 ] = 0;
3933 DEBUG_TRIE_COMPILE_r({
3934 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3935 depth, (UV)numstates
3937 for( q_read=1; q_read<numstates; q_read++ ) {
3938 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3940 Perl_re_printf( aTHX_ "\n");
3943 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3948 /* The below joins as many adjacent EXACTish nodes as possible into a single
3949 * one. The regop may be changed if the node(s) contain certain sequences that
3950 * require special handling. The joining is only done if:
3951 * 1) there is room in the current conglomerated node to entirely contain the
3953 * 2) they are compatible node types
3955 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3956 * these get optimized out
3958 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3959 * as possible, even if that means splitting an existing node so that its first
3960 * part is moved to the preceeding node. This would maximise the efficiency of
3961 * memEQ during matching.
3963 * If a node is to match under /i (folded), the number of characters it matches
3964 * can be different than its character length if it contains a multi-character
3965 * fold. *min_subtract is set to the total delta number of characters of the
3968 * And *unfolded_multi_char is set to indicate whether or not the node contains
3969 * an unfolded multi-char fold. This happens when it won't be known until
3970 * runtime whether the fold is valid or not; namely
3971 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3972 * target string being matched against turns out to be UTF-8 is that fold
3974 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3976 * (Multi-char folds whose components are all above the Latin1 range are not
3977 * run-time locale dependent, and have already been folded by the time this
3978 * function is called.)
3980 * This is as good a place as any to discuss the design of handling these
3981 * multi-character fold sequences. It's been wrong in Perl for a very long
3982 * time. There are three code points in Unicode whose multi-character folds
3983 * were long ago discovered to mess things up. The previous designs for
3984 * dealing with these involved assigning a special node for them. This
3985 * approach doesn't always work, as evidenced by this example:
3986 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3987 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3988 * would match just the \xDF, it won't be able to handle the case where a
3989 * successful match would have to cross the node's boundary. The new approach
3990 * that hopefully generally solves the problem generates an EXACTFUP node
3991 * that is "sss" in this case.
3993 * It turns out that there are problems with all multi-character folds, and not
3994 * just these three. Now the code is general, for all such cases. The
3995 * approach taken is:
3996 * 1) This routine examines each EXACTFish node that could contain multi-
3997 * character folded sequences. Since a single character can fold into
3998 * such a sequence, the minimum match length for this node is less than
3999 * the number of characters in the node. This routine returns in
4000 * *min_subtract how many characters to subtract from the actual
4001 * length of the string to get a real minimum match length; it is 0 if
4002 * there are no multi-char foldeds. This delta is used by the caller to
4003 * adjust the min length of the match, and the delta between min and max,
4004 * so that the optimizer doesn't reject these possibilities based on size
4007 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4008 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4009 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4010 * EXACTFU nodes. The node type of such nodes is then changed to
4011 * EXACTFUP, indicating it is problematic, and needs careful handling.
4012 * (The procedures in step 1) above are sufficient to handle this case in
4013 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4014 * the only case where there is a possible fold length change in non-UTF-8
4015 * patterns. By reserving a special node type for problematic cases, the
4016 * far more common regular EXACTFU nodes can be processed faster.
4017 * regexec.c takes advantage of this.
4019 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4020 * problematic cases. These all only occur when the pattern is not
4021 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4022 * length change, it handles the situation where the string cannot be
4023 * entirely folded. The strings in an EXACTFish node are folded as much
4024 * as possible during compilation in regcomp.c. This saves effort in
4025 * regex matching. By using an EXACTFUP node when it is not possible to
4026 * fully fold at compile time, regexec.c can know that everything in an
4027 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4028 * case where folding in EXACTFU nodes can't be done at compile time is
4029 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4030 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4031 * handle two very different cases. Alternatively, there could have been
4032 * a node type where there are length changes, one for unfolded, and one
4033 * for both. If yet another special case needed to be created, the number
4034 * of required node types would have to go to 7. khw figures that even
4035 * though there are plenty of node types to spare, that the maintenance
4036 * cost wasn't worth the small speedup of doing it that way, especially
4037 * since he thinks the MICRO SIGN is rarely encountered in practice.
4039 * There are other cases where folding isn't done at compile time, but
4040 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4041 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4042 * changes. Some folds in EXACTF depend on if the runtime target string
4043 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4044 * when no fold in it depends on the UTF-8ness of the target string.)
4046 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4047 * validity of the fold won't be known until runtime, and so must remain
4048 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4049 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4050 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4051 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4052 * The reason this is a problem is that the optimizer part of regexec.c
4053 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4054 * that a character in the pattern corresponds to at most a single
4055 * character in the target string. (And I do mean character, and not byte
4056 * here, unlike other parts of the documentation that have never been
4057 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4058 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4059 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4060 * EXACTFL nodes, violate the assumption, and they are the only instances
4061 * where it is violated. I'm reluctant to try to change the assumption,
4062 * as the code involved is impenetrable to me (khw), so instead the code
4063 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4064 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4065 * boolean indicating whether or not the node contains such a fold. When
4066 * it is true, the caller sets a flag that later causes the optimizer in
4067 * this file to not set values for the floating and fixed string lengths,
4068 * and thus avoids the optimizer code in regexec.c that makes the invalid
4069 * assumption. Thus, there is no optimization based on string lengths for
4070 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4071 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4072 * assumption is wrong only in these cases is that all other non-UTF-8
4073 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4074 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4075 * EXACTF nodes because we don't know at compile time if it actually
4076 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4077 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4078 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4079 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4080 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4081 * string would require the pattern to be forced into UTF-8, the overhead
4082 * of which we want to avoid. Similarly the unfolded multi-char folds in
4083 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4086 * Similarly, the code that generates tries doesn't currently handle
4087 * not-already-folded multi-char folds, and it looks like a pain to change
4088 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4089 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4090 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4091 * using /iaa matching will be doing so almost entirely with ASCII
4092 * strings, so this should rarely be encountered in practice */
4095 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4096 UV *min_subtract, bool *unfolded_multi_char,
4097 U32 flags, regnode *val, U32 depth)
4099 /* Merge several consecutive EXACTish nodes into one. */
4101 regnode *n = regnext(scan);
4103 regnode *next = scan + NODE_SZ_STR(scan);
4107 regnode *stop = scan;
4108 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4110 PERL_UNUSED_ARG(depth);
4113 PERL_ARGS_ASSERT_JOIN_EXACT;
4114 #ifndef EXPERIMENTAL_INPLACESCAN
4115 PERL_UNUSED_ARG(flags);
4116 PERL_UNUSED_ARG(val);
4118 DEBUG_PEEP("join", scan, depth, 0);
4120 assert(PL_regkind[OP(scan)] == EXACT);
4122 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4123 * EXACT ones that are mergeable to the current one. */
4125 && ( PL_regkind[OP(n)] == NOTHING
4126 || (stringok && PL_regkind[OP(n)] == EXACT))
4128 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4131 if (OP(n) == TAIL || n > next)
4133 if (PL_regkind[OP(n)] == NOTHING) {
4134 DEBUG_PEEP("skip:", n, depth, 0);
4135 NEXT_OFF(scan) += NEXT_OFF(n);
4136 next = n + NODE_STEP_REGNODE;
4143 else if (stringok) {
4144 const unsigned int oldl = STR_LEN(scan);
4145 regnode * const nnext = regnext(n);
4147 /* XXX I (khw) kind of doubt that this works on platforms (should
4148 * Perl ever run on one) where U8_MAX is above 255 because of lots
4149 * of other assumptions */
4150 /* Don't join if the sum can't fit into a single node */
4151 if (oldl + STR_LEN(n) > U8_MAX)
4154 /* Joining something that requires UTF-8 with something that
4155 * doesn't, means the result requires UTF-8. */
4156 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4157 OP(scan) = EXACT_REQ8;
4159 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4160 ; /* join is compatible, no need to change OP */
4162 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4163 OP(scan) = EXACTFU_REQ8;
4165 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4166 ; /* join is compatible, no need to change OP */
4168 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4169 ; /* join is compatible, no need to change OP */
4171 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4173 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4174 * which can join with EXACTFU ones. We check for this case
4175 * here. These need to be resolved to either EXACTFU or
4176 * EXACTF at joining time. They have nothing in them that
4177 * would forbid them from being the more desirable EXACTFU
4178 * nodes except that they begin and/or end with a single [Ss].
4179 * The reason this is problematic is because they could be
4180 * joined in this loop with an adjacent node that ends and/or
4181 * begins with [Ss] which would then form the sequence 'ss',
4182 * which matches differently under /di than /ui, in which case
4183 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4184 * formed, the nodes get absorbed into any adjacent EXACTFU
4185 * node. And if the only adjacent node is EXACTF, they get
4186 * absorbed into that, under the theory that a longer node is
4187 * better than two shorter ones, even if one is EXACTFU. Note
4188 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4189 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4191 if (STRING(n)[STR_LEN(n)-1] == 's') {
4193 /* Here the joined node would end with 's'. If the node
4194 * following the combination is an EXACTF one, it's better to
4195 * join this trailing edge 's' node with that one, leaving the
4196 * current one in 'scan' be the more desirable EXACTFU */
4197 if (OP(nnext) == EXACTF) {
4201 OP(scan) = EXACTFU_S_EDGE;
4203 } /* Otherwise, the beginning 's' of the 2nd node just
4204 becomes an interior 's' in 'scan' */
4206 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4207 ; /* join is compatible, no need to change OP */
4209 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4211 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4212 * nodes. But the latter nodes can be also joined with EXACTFU
4213 * ones, and that is a better outcome, so if the node following
4214 * 'n' is EXACTFU, quit now so that those two can be joined
4216 if (OP(nnext) == EXACTFU) {
4220 /* The join is compatible, and the combined node will be
4221 * EXACTF. (These don't care if they begin or end with 's' */
4223 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4224 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4225 && STRING(n)[0] == 's')
4227 /* When combined, we have the sequence 'ss', which means we
4228 * have to remain /di */
4232 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4233 if (STRING(n)[0] == 's') {
4234 ; /* Here the join is compatible and the combined node
4235 starts with 's', no need to change OP */
4237 else { /* Now the trailing 's' is in the interior */
4241 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4243 /* The join is compatible, and the combined node will be
4244 * EXACTF. (These don't care if they begin or end with 's' */
4247 else if (OP(scan) != OP(n)) {
4249 /* The only other compatible joinings are the same node type */
4253 DEBUG_PEEP("merg", n, depth, 0);
4256 NEXT_OFF(scan) += NEXT_OFF(n);
4257 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4258 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4259 next = n + NODE_SZ_STR(n);
4260 /* Now we can overwrite *n : */
4261 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4269 #ifdef EXPERIMENTAL_INPLACESCAN
4270 if (flags && !NEXT_OFF(n)) {
4271 DEBUG_PEEP("atch", val, depth, 0);
4272 if (reg_off_by_arg[OP(n)]) {
4273 ARG_SET(n, val - n);
4276 NEXT_OFF(n) = val - n;
4283 /* This temporary node can now be turned into EXACTFU, and must, as
4284 * regexec.c doesn't handle it */
4285 if (OP(scan) == EXACTFU_S_EDGE) {
4290 *unfolded_multi_char = FALSE;
4292 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4293 * can now analyze for sequences of problematic code points. (Prior to
4294 * this final joining, sequences could have been split over boundaries, and
4295 * hence missed). The sequences only happen in folding, hence for any
4296 * non-EXACT EXACTish node */
4297 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4298 U8* s0 = (U8*) STRING(scan);
4300 U8* s_end = s0 + STR_LEN(scan);
4302 int total_count_delta = 0; /* Total delta number of characters that
4303 multi-char folds expand to */
4305 /* One pass is made over the node's string looking for all the
4306 * possibilities. To avoid some tests in the loop, there are two main
4307 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4312 if (OP(scan) == EXACTFL) {
4315 /* An EXACTFL node would already have been changed to another
4316 * node type unless there is at least one character in it that
4317 * is problematic; likely a character whose fold definition
4318 * won't be known until runtime, and so has yet to be folded.
4319 * For all but the UTF-8 locale, folds are 1-1 in length, but
4320 * to handle the UTF-8 case, we need to create a temporary
4321 * folded copy using UTF-8 locale rules in order to analyze it.
4322 * This is because our macros that look to see if a sequence is
4323 * a multi-char fold assume everything is folded (otherwise the
4324 * tests in those macros would be too complicated and slow).
4325 * Note that here, the non-problematic folds will have already
4326 * been done, so we can just copy such characters. We actually
4327 * don't completely fold the EXACTFL string. We skip the
4328 * unfolded multi-char folds, as that would just create work
4329 * below to figure out the size they already are */
4331 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4334 STRLEN s_len = UTF8SKIP(s);
4335 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4336 Copy(s, d, s_len, U8);
4339 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4340 *unfolded_multi_char = TRUE;
4341 Copy(s, d, s_len, U8);
4344 else if (isASCII(*s)) {
4345 *(d++) = toFOLD(*s);
4349 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4355 /* Point the remainder of the routine to look at our temporary
4359 } /* End of creating folded copy of EXACTFL string */
4361 /* Examine the string for a multi-character fold sequence. UTF-8
4362 * patterns have all characters pre-folded by the time this code is
4364 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4365 length sequence we are looking for is 2 */
4367 int count = 0; /* How many characters in a multi-char fold */
4368 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4369 if (! len) { /* Not a multi-char fold: get next char */
4374 { /* Here is a generic multi-char fold. */
4375 U8* multi_end = s + len;
4377 /* Count how many characters are in it. In the case of
4378 * /aa, no folds which contain ASCII code points are
4379 * allowed, so check for those, and skip if found. */
4380 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4381 count = utf8_length(s, multi_end);
4385 while (s < multi_end) {
4388 goto next_iteration;
4398 /* The delta is how long the sequence is minus 1 (1 is how long
4399 * the character that folds to the sequence is) */
4400 total_count_delta += count - 1;
4404 /* We created a temporary folded copy of the string in EXACTFL
4405 * nodes. Therefore we need to be sure it doesn't go below zero,
4406 * as the real string could be shorter */
4407 if (OP(scan) == EXACTFL) {
4408 int total_chars = utf8_length((U8*) STRING(scan),
4409 (U8*) STRING(scan) + STR_LEN(scan));
4410 if (total_count_delta > total_chars) {
4411 total_count_delta = total_chars;
4415 *min_subtract += total_count_delta;
4418 else if (OP(scan) == EXACTFAA) {
4420 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4421 * fold to the ASCII range (and there are no existing ones in the
4422 * upper latin1 range). But, as outlined in the comments preceding
4423 * this function, we need to flag any occurrences of the sharp s.
4424 * This character forbids trie formation (because of added
4426 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4427 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4428 || UNICODE_DOT_DOT_VERSION > 0)
4430 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4431 OP(scan) = EXACTFAA_NO_TRIE;
4432 *unfolded_multi_char = TRUE;
4438 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4440 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4441 * folds that are all Latin1. As explained in the comments
4442 * preceding this function, we look also for the sharp s in EXACTF
4443 * and EXACTFL nodes; it can be in the final position. Otherwise
4444 * we can stop looking 1 byte earlier because have to find at least
4445 * two characters for a multi-fold */
4446 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4451 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4452 if (! len) { /* Not a multi-char fold. */
4453 if (*s == LATIN_SMALL_LETTER_SHARP_S
4454 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4456 *unfolded_multi_char = TRUE;
4463 && isALPHA_FOLD_EQ(*s, 's')
4464 && isALPHA_FOLD_EQ(*(s+1), 's'))
4467 /* EXACTF nodes need to know that the minimum length
4468 * changed so that a sharp s in the string can match this
4469 * ss in the pattern, but they remain EXACTF nodes, as they
4470 * won't match this unless the target string is in UTF-8,
4471 * which we don't know until runtime. EXACTFL nodes can't
4472 * transform into EXACTFU nodes */
4473 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4474 OP(scan) = EXACTFUP;
4478 *min_subtract += len - 1;
4486 /* Allow dumping but overwriting the collection of skipped
4487 * ops and/or strings with fake optimized ops */
4488 n = scan + NODE_SZ_STR(scan);
4496 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4500 /* REx optimizer. Converts nodes into quicker variants "in place".
4501 Finds fixed substrings. */
4503 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4504 to the position after last scanned or to NULL. */
4506 #define INIT_AND_WITHP \
4507 assert(!and_withp); \
4508 Newx(and_withp, 1, regnode_ssc); \
4509 SAVEFREEPV(and_withp)
4513 S_unwind_scan_frames(pTHX_ const void *p)
4515 scan_frame *f= (scan_frame *)p;
4517 scan_frame *n= f->next_frame;
4523 /* Follow the next-chain of the current node and optimize away
4524 all the NOTHINGs from it.
4527 S_rck_elide_nothing(pTHX_ regnode *node)
4529 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4531 if (OP(node) != CURLYX) {
4532 const int max = (reg_off_by_arg[OP(node)]
4534 /* I32 may be smaller than U16 on CRAYs! */
4535 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4536 int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4540 /* Skip NOTHING and LONGJMP. */
4544 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4545 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4551 if (reg_off_by_arg[OP(node)])
4554 NEXT_OFF(node) = off;
4559 /* the return from this sub is the minimum length that could possibly match */
4561 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4562 SSize_t *minlenp, SSize_t *deltap,
4567 regnode_ssc *and_withp,
4568 U32 flags, U32 depth, bool was_mutate_ok)
4569 /* scanp: Start here (read-write). */
4570 /* deltap: Write maxlen-minlen here. */
4571 /* last: Stop before this one. */
4572 /* data: string data about the pattern */
4573 /* stopparen: treat close N as END */
4574 /* recursed: which subroutines have we recursed into */
4575 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4577 SSize_t final_minlen;
4578 /* There must be at least this number of characters to match */
4581 regnode *scan = *scanp, *next;
4583 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4584 int is_inf_internal = 0; /* The studied chunk is infinite */
4585 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4586 scan_data_t data_fake;
4587 SV *re_trie_maxbuff = NULL;
4588 regnode *first_non_open = scan;
4589 SSize_t stopmin = OPTIMIZE_INFTY;
4590 scan_frame *frame = NULL;
4591 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4593 PERL_ARGS_ASSERT_STUDY_CHUNK;
4594 RExC_study_started= 1;
4596 Zero(&data_fake, 1, scan_data_t);
4599 while (first_non_open && OP(first_non_open) == OPEN)
4600 first_non_open=regnext(first_non_open);
4606 RExC_study_chunk_recursed_count++;
4608 DEBUG_OPTIMISE_MORE_r(
4610 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4611 depth, (long)stopparen,
4612 (unsigned long)RExC_study_chunk_recursed_count,
4613 (unsigned long)depth, (unsigned long)recursed_depth,
4616 if (recursed_depth) {
4619 for ( j = 0 ; j < recursed_depth ; j++ ) {
4620 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4621 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4622 Perl_re_printf( aTHX_ " %d",(int)i);
4626 if ( j + 1 < recursed_depth ) {
4627 Perl_re_printf( aTHX_ ",");
4631 Perl_re_printf( aTHX_ "\n");
4634 while ( scan && OP(scan) != END && scan < last ){
4635 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4636 node length to get a real minimum (because
4637 the folded version may be shorter) */
4638 bool unfolded_multi_char = FALSE;
4639 /* avoid mutating ops if we are anywhere within the recursed or
4640 * enframed handling for a GOSUB: the outermost level will handle it.
4642 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4643 /* Peephole optimizer: */
4644 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4645 DEBUG_PEEP("Peep", scan, depth, flags);
4648 /* The reason we do this here is that we need to deal with things like
4649 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4650 * parsing code, as each (?:..) is handled by a different invocation of
4653 if (PL_regkind[OP(scan)] == EXACT
4654 && OP(scan) != LEXACT
4655 && OP(scan) != LEXACT_REQ8
4658 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4659 0, NULL, depth + 1);
4662 /* Follow the next-chain of the current node and optimize
4663 away all the NOTHINGs from it.
4665 rck_elide_nothing(scan);
4667 /* The principal pseudo-switch. Cannot be a switch, since we look into
4668 * several different things. */
4669 if ( OP(scan) == DEFINEP ) {
4671 SSize_t deltanext = 0;
4672 SSize_t fake_last_close = 0;
4673 I32 f = SCF_IN_DEFINE;
4675 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4676 scan = regnext(scan);
4677 assert( OP(scan) == IFTHEN );
4678 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4680 data_fake.last_closep= &fake_last_close;
4682 next = regnext(scan);
4683 scan = NEXTOPER(NEXTOPER(scan));
4684 DEBUG_PEEP("scan", scan, depth, flags);
4685 DEBUG_PEEP("next", next, depth, flags);
4687 /* we suppose the run is continuous, last=next...
4688 * NOTE we dont use the return here! */
4689 /* DEFINEP study_chunk() recursion */
4690 (void)study_chunk(pRExC_state, &scan, &minlen,
4691 &deltanext, next, &data_fake, stopparen,
4692 recursed_depth, NULL, f, depth+1, mutate_ok);
4697 OP(scan) == BRANCH ||
4698 OP(scan) == BRANCHJ ||
4701 next = regnext(scan);
4704 /* The op(next)==code check below is to see if we
4705 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4706 * IFTHEN is special as it might not appear in pairs.
4707 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4708 * we dont handle it cleanly. */
4709 if (OP(next) == code || code == IFTHEN) {
4710 /* NOTE - There is similar code to this block below for
4711 * handling TRIE nodes on a re-study. If you change stuff here
4712 * check there too. */
4713 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4715 regnode * const startbranch=scan;
4717 if (flags & SCF_DO_SUBSTR) {
4718 /* Cannot merge strings after this. */
4719 scan_commit(pRExC_state, data, minlenp, is_inf);
4722 if (flags & SCF_DO_STCLASS)
4723 ssc_init_zero(pRExC_state, &accum);
4725 while (OP(scan) == code) {
4726 SSize_t deltanext, minnext, fake;
4728 regnode_ssc this_class;
4730 DEBUG_PEEP("Branch", scan, depth, flags);
4733 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4735 data_fake.whilem_c = data->whilem_c;
4736 data_fake.last_closep = data->last_closep;
4739 data_fake.last_closep = &fake;
4741 data_fake.pos_delta = delta;
4742 next = regnext(scan);
4744 scan = NEXTOPER(scan); /* everything */
4745 if (code != BRANCH) /* everything but BRANCH */
4746 scan = NEXTOPER(scan);
4748 if (flags & SCF_DO_STCLASS) {
4749 ssc_init(pRExC_state, &this_class);
4750 data_fake.start_class = &this_class;
4751 f = SCF_DO_STCLASS_AND;
4753 if (flags & SCF_WHILEM_VISITED_POS)
4754 f |= SCF_WHILEM_VISITED_POS;
4756 /* we suppose the run is continuous, last=next...*/
4757 /* recurse study_chunk() for each BRANCH in an alternation */
4758 minnext = study_chunk(pRExC_state, &scan, minlenp,
4759 &deltanext, next, &data_fake, stopparen,
4760 recursed_depth, NULL, f, depth+1,
4765 if (deltanext == OPTIMIZE_INFTY) {
4766 is_inf = is_inf_internal = 1;
4767 max1 = OPTIMIZE_INFTY;
4768 } else if (max1 < minnext + deltanext)
4769 max1 = minnext + deltanext;
4771 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4773 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4774 if ( stopmin > minnext)
4775 stopmin = min + min1;
4776 flags &= ~SCF_DO_SUBSTR;
4778 data->flags |= SCF_SEEN_ACCEPT;
4781 if (data_fake.flags & SF_HAS_EVAL)
4782 data->flags |= SF_HAS_EVAL;
4783 data->whilem_c = data_fake.whilem_c;
4785 if (flags & SCF_DO_STCLASS)
4786 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4788 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4790 if (flags & SCF_DO_SUBSTR) {
4791 data->pos_min += min1;
4792 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4793 data->pos_delta = OPTIMIZE_INFTY;
4795 data->pos_delta += max1 - min1;
4796 if (max1 != min1 || is_inf)
4797 data->cur_is_floating = 1;
4800 if (delta == OPTIMIZE_INFTY
4801 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4802 delta = OPTIMIZE_INFTY;
4804 delta += max1 - min1;
4805 if (flags & SCF_DO_STCLASS_OR) {
4806 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4808 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4809 flags &= ~SCF_DO_STCLASS;
4812 else if (flags & SCF_DO_STCLASS_AND) {
4814 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4815 flags &= ~SCF_DO_STCLASS;
4818 /* Switch to OR mode: cache the old value of
4819 * data->start_class */
4821 StructCopy(data->start_class, and_withp, regnode_ssc);
4822 flags &= ~SCF_DO_STCLASS_AND;
4823 StructCopy(&accum, data->start_class, regnode_ssc);
4824 flags |= SCF_DO_STCLASS_OR;
4828 if (PERL_ENABLE_TRIE_OPTIMISATION
4829 && OP(startbranch) == BRANCH
4834 Assuming this was/is a branch we are dealing with: 'scan'
4835 now points at the item that follows the branch sequence,
4836 whatever it is. We now start at the beginning of the
4837 sequence and look for subsequences of
4843 which would be constructed from a pattern like
4846 If we can find such a subsequence we need to turn the first
4847 element into a trie and then add the subsequent branch exact
4848 strings to the trie.
4852 1. patterns where the whole set of branches can be
4855 2. patterns where only a subset can be converted.
4857 In case 1 we can replace the whole set with a single regop
4858 for the trie. In case 2 we need to keep the start and end
4861 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4862 becomes BRANCH TRIE; BRANCH X;
4864 There is an additional case, that being where there is a
4865 common prefix, which gets split out into an EXACT like node
4866 preceding the TRIE node.
4868 If x(1..n)==tail then we can do a simple trie, if not we make
4869 a "jump" trie, such that when we match the appropriate word
4870 we "jump" to the appropriate tail node. Essentially we turn
4871 a nested if into a case structure of sorts.
4876 if (!re_trie_maxbuff) {
4877 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4878 if (!SvIOK(re_trie_maxbuff))
4879 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4881 if ( SvIV(re_trie_maxbuff)>=0 ) {
4883 regnode *first = (regnode *)NULL;
4884 regnode *prev = (regnode *)NULL;
4885 regnode *tail = scan;
4889 /* var tail is used because there may be a TAIL
4890 regop in the way. Ie, the exacts will point to the
4891 thing following the TAIL, but the last branch will
4892 point at the TAIL. So we advance tail. If we
4893 have nested (?:) we may have to move through several
4897 while ( OP( tail ) == TAIL ) {
4898 /* this is the TAIL generated by (?:) */
4899 tail = regnext( tail );
4903 DEBUG_TRIE_COMPILE_r({
4904 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4905 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4907 "Looking for TRIE'able sequences. Tail node is ",
4908 (UV) REGNODE_OFFSET(tail),
4909 SvPV_nolen_const( RExC_mysv )
4915 Step through the branches
4916 cur represents each branch,
4917 noper is the first thing to be matched as part
4919 noper_next is the regnext() of that node.
4921 We normally handle a case like this
4922 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4923 support building with NOJUMPTRIE, which restricts
4924 the trie logic to structures like /FOO|BAR/.
4926 If noper is a trieable nodetype then the branch is
4927 a possible optimization target. If we are building
4928 under NOJUMPTRIE then we require that noper_next is
4929 the same as scan (our current position in the regex
4932 Once we have two or more consecutive such branches
4933 we can create a trie of the EXACT's contents and
4934 stitch it in place into the program.
4936 If the sequence represents all of the branches in
4937 the alternation we replace the entire thing with a
4940 Otherwise when it is a subsequence we need to
4941 stitch it in place and replace only the relevant
4942 branches. This means the first branch has to remain
4943 as it is used by the alternation logic, and its
4944 next pointer, and needs to be repointed at the item
4945 on the branch chain following the last branch we
4946 have optimized away.
4948 This could be either a BRANCH, in which case the
4949 subsequence is internal, or it could be the item
4950 following the branch sequence in which case the
4951 subsequence is at the end (which does not
4952 necessarily mean the first node is the start of the
4955 TRIE_TYPE(X) is a define which maps the optype to a
4959 ----------------+-----------
4964 EXACTFU_REQ8 | EXACTFU
4968 EXACTFLU8 | EXACTFLU8
4972 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4974 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4976 : ( EXACTFU == (X) \
4977 || EXACTFU_REQ8 == (X) \
4978 || EXACTFUP == (X) ) \
4980 : ( EXACTFAA == (X) ) \
4982 : ( EXACTL == (X) ) \
4984 : ( EXACTFLU8 == (X) ) \
4988 /* dont use tail as the end marker for this traverse */
4989 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4990 regnode * const noper = NEXTOPER( cur );
4991 U8 noper_type = OP( noper );
4992 U8 noper_trietype = TRIE_TYPE( noper_type );
4993 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4994 regnode * const noper_next = regnext( noper );
4995 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4996 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4999 DEBUG_TRIE_COMPILE_r({
5000 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5001 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
5003 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5005 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5006 Perl_re_printf( aTHX_ " -> %d:%s",
5007 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5010 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5011 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5012 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5014 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5015 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5016 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5020 /* Is noper a trieable nodetype that can be merged
5021 * with the current trie (if there is one)? */
5025 ( noper_trietype == NOTHING )
5026 || ( trietype == NOTHING )
5027 || ( trietype == noper_trietype )
5030 && noper_next >= tail
5034 /* Handle mergable triable node Either we are
5035 * the first node in a new trieable sequence,
5036 * in which case we do some bookkeeping,
5037 * otherwise we update the end pointer. */
5040 if ( noper_trietype == NOTHING ) {
5041 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5042 regnode * const noper_next = regnext( noper );
5043 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5044 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5047 if ( noper_next_trietype ) {
5048 trietype = noper_next_trietype;
5049 } else if (noper_next_type) {
5050 /* a NOTHING regop is 1 regop wide.
5051 * We need at least two for a trie
5052 * so we can't merge this in */
5056 trietype = noper_trietype;
5059 if ( trietype == NOTHING )
5060 trietype = noper_trietype;
5065 } /* end handle mergable triable node */
5067 /* handle unmergable node -
5068 * noper may either be a triable node which can
5069 * not be tried together with the current trie,
5070 * or a non triable node */
5072 /* If last is set and trietype is not
5073 * NOTHING then we have found at least two
5074 * triable branch sequences in a row of a
5075 * similar trietype so we can turn them
5076 * into a trie. If/when we allow NOTHING to
5077 * start a trie sequence this condition
5078 * will be required, and it isn't expensive
5079 * so we leave it in for now. */
5080 if ( trietype && trietype != NOTHING )
5081 make_trie( pRExC_state,
5082 startbranch, first, cur, tail,
5083 count, trietype, depth+1 );
5084 prev = NULL; /* note: we clear/update
5085 first, trietype etc below,
5086 so we dont do it here */
5090 && noper_next >= tail
5093 /* noper is triable, so we can start a new
5097 trietype = noper_trietype;
5099 /* if we already saw a first but the
5100 * current node is not triable then we have
5101 * to reset the first information. */
5106 } /* end handle unmergable node */
5107 } /* loop over branches */
5108 DEBUG_TRIE_COMPILE_r({
5109 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5110 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5111 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5112 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5113 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5114 PL_reg_name[trietype]
5118 if ( prev && trietype ) {
5119 if ( trietype != NOTHING ) {
5120 /* the last branch of the sequence was part of
5121 * a trie, so we have to construct it here
5122 * outside of the loop */
5123 made= make_trie( pRExC_state, startbranch,
5124 first, scan, tail, count,
5125 trietype, depth+1 );
5126 #ifdef TRIE_STUDY_OPT
5127 if ( ((made == MADE_EXACT_TRIE &&
5128 startbranch == first)
5129 || ( first_non_open == first )) &&
5131 flags |= SCF_TRIE_RESTUDY;
5132 if ( startbranch == first
5135 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5140 /* at this point we know whatever we have is a
5141 * NOTHING sequence/branch AND if 'startbranch'
5142 * is 'first' then we can turn the whole thing
5145 if ( startbranch == first ) {
5147 /* the entire thing is a NOTHING sequence,
5148 * something like this: (?:|) So we can
5149 * turn it into a plain NOTHING op. */
5150 DEBUG_TRIE_COMPILE_r({
5151 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5152 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5154 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5157 OP(startbranch)= NOTHING;
5158 NEXT_OFF(startbranch)= tail - startbranch;
5159 for ( opt= startbranch + 1; opt < tail ; opt++ )
5163 } /* end if ( prev) */
5164 } /* TRIE_MAXBUF is non zero */
5168 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5169 scan = NEXTOPER(NEXTOPER(scan));
5170 } else /* single branch is optimized. */
5171 scan = NEXTOPER(scan);
5173 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5175 regnode *start = NULL;
5176 regnode *end = NULL;
5177 U32 my_recursed_depth= recursed_depth;
5179 if (OP(scan) != SUSPEND) { /* GOSUB */
5180 /* Do setup, note this code has side effects beyond
5181 * the rest of this block. Specifically setting
5182 * RExC_recurse[] must happen at least once during
5185 RExC_recurse[ARG2L(scan)] = scan;
5186 start = REGNODE_p(RExC_open_parens[paren]);
5187 end = REGNODE_p(RExC_close_parens[paren]);
5189 /* NOTE we MUST always execute the above code, even
5190 * if we do nothing with a GOSUB */
5192 ( flags & SCF_IN_DEFINE )
5195 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5197 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5200 /* no need to do anything here if we are in a define. */
5201 /* or we are after some kind of infinite construct
5202 * so we can skip recursing into this item.
5203 * Since it is infinite we will not change the maxlen
5204 * or delta, and if we miss something that might raise
5205 * the minlen it will merely pessimise a little.
5207 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5208 * might result in a minlen of 1 and not of 4,
5209 * but this doesn't make us mismatch, just try a bit
5210 * harder than we should.
5212 * However we must assume this GOSUB is infinite, to
5213 * avoid wrongly applying other optimizations in the
5214 * enclosing scope - see GH 18096, for example.
5216 is_inf = is_inf_internal = 1;
5217 scan= regnext(scan);
5223 || !PAREN_TEST(recursed_depth - 1, paren)
5225 /* it is quite possible that there are more efficient ways
5226 * to do this. We maintain a bitmap per level of recursion
5227 * of which patterns we have entered so we can detect if a
5228 * pattern creates a possible infinite loop. When we
5229 * recurse down a level we copy the previous levels bitmap
5230 * down. When we are at recursion level 0 we zero the top
5231 * level bitmap. It would be nice to implement a different
5232 * more efficient way of doing this. In particular the top
5233 * level bitmap may be unnecessary.
5235 if (!recursed_depth) {
5236 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5238 Copy(PAREN_OFFSET(recursed_depth - 1),
5239 PAREN_OFFSET(recursed_depth),
5240 RExC_study_chunk_recursed_bytes, U8);
5242 /* we havent recursed into this paren yet, so recurse into it */
5243 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5244 PAREN_SET(recursed_depth, paren);
5245 my_recursed_depth= recursed_depth + 1;
5247 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5248 /* some form of infinite recursion, assume infinite length
5250 if (flags & SCF_DO_SUBSTR) {
5251 scan_commit(pRExC_state, data, minlenp, is_inf);
5252 data->cur_is_floating = 1;
5254 is_inf = is_inf_internal = 1;
5255 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5256 ssc_anything(data->start_class);
5257 flags &= ~SCF_DO_STCLASS;
5259 start= NULL; /* reset start so we dont recurse later on. */
5264 end = regnext(scan);
5267 scan_frame *newframe;
5269 if (!RExC_frame_last) {
5270 Newxz(newframe, 1, scan_frame);
5271 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5272 RExC_frame_head= newframe;
5274 } else if (!RExC_frame_last->next_frame) {
5275 Newxz(newframe, 1, scan_frame);
5276 RExC_frame_last->next_frame= newframe;
5277 newframe->prev_frame= RExC_frame_last;
5280 newframe= RExC_frame_last->next_frame;
5282 RExC_frame_last= newframe;
5284 newframe->next_regnode = regnext(scan);
5285 newframe->last_regnode = last;
5286 newframe->stopparen = stopparen;
5287 newframe->prev_recursed_depth = recursed_depth;
5288 newframe->this_prev_frame= frame;
5289 newframe->in_gosub = (
5290 (frame && frame->in_gosub) || OP(scan) == GOSUB
5293 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5294 DEBUG_PEEP("fnew", scan, depth, flags);
5301 recursed_depth= my_recursed_depth;
5306 else if ( OP(scan) == EXACT
5307 || OP(scan) == LEXACT
5308 || OP(scan) == EXACT_REQ8
5309 || OP(scan) == LEXACT_REQ8
5310 || OP(scan) == EXACTL)
5312 SSize_t bytelen = STR_LEN(scan), charlen;
5316 const U8 * const s = (U8*)STRING(scan);
5317 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5318 charlen = utf8_length(s, s + bytelen);
5320 uc = *((U8*)STRING(scan));
5324 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5325 /* The code below prefers earlier match for fixed
5326 offset, later match for variable offset. */
5327 if (data->last_end == -1) { /* Update the start info. */
5328 data->last_start_min = data->pos_min;
5329 data->last_start_max =
5330 is_inf ? OPTIMIZE_INFTY
5331 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5332 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5334 sv_catpvn(data->last_found, STRING(scan), bytelen);
5336 SvUTF8_on(data->last_found);
5338 SV * const sv = data->last_found;
5339 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5340 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5341 if (mg && mg->mg_len >= 0)
5342 mg->mg_len += charlen;
5344 data->last_end = data->pos_min + charlen;
5345 data->pos_min += charlen; /* As in the first entry. */
5346 data->flags &= ~SF_BEFORE_EOL;
5349 /* ANDing the code point leaves at most it, and not in locale, and
5350 * can't match null string */
5351 if (flags & SCF_DO_STCLASS_AND) {
5352 ssc_cp_and(data->start_class, uc);
5353 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5354 ssc_clear_locale(data->start_class);
5356 else if (flags & SCF_DO_STCLASS_OR) {
5357 ssc_add_cp(data->start_class, uc);
5358 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5360 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5361 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5363 flags &= ~SCF_DO_STCLASS;
5365 else if (PL_regkind[OP(scan)] == EXACT) {
5366 /* But OP != EXACT!, so is EXACTFish */
5367 SSize_t bytelen = STR_LEN(scan), charlen;
5368 const U8 * s = (U8*)STRING(scan);
5370 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5371 * with the mask set to the complement of the bit that differs
5372 * between upper and lower case, and the lowest code point of the
5373 * pair (which the '&' forces) */
5376 && ( OP(scan) == EXACTFAA
5377 || ( OP(scan) == EXACTFU
5378 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5381 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5384 ARG_SET(scan, *s & mask);
5386 /* we're not EXACTFish any more, so restudy */
5390 /* Search for fixed substrings supports EXACT only. */
5391 if (flags & SCF_DO_SUBSTR) {
5393 scan_commit(pRExC_state, data, minlenp, is_inf);
5395 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5396 if (unfolded_multi_char) {
5397 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5399 min += charlen - min_subtract;
5401 delta += min_subtract;
5402 if (flags & SCF_DO_SUBSTR) {
5403 data->pos_min += charlen - min_subtract;
5404 if (data->pos_min < 0) {
5407 data->pos_delta += min_subtract;
5409 data->cur_is_floating = 1; /* float */
5413 if (flags & SCF_DO_STCLASS) {
5414 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5416 assert(EXACTF_invlist);
5417 if (flags & SCF_DO_STCLASS_AND) {
5418 if (OP(scan) != EXACTFL)
5419 ssc_clear_locale(data->start_class);
5420 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5421 ANYOF_POSIXL_ZERO(data->start_class);
5422 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5424 else { /* SCF_DO_STCLASS_OR */
5425 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5426 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5428 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5429 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5431 flags &= ~SCF_DO_STCLASS;
5432 SvREFCNT_dec(EXACTF_invlist);
5435 else if (REGNODE_VARIES(OP(scan))) {
5436 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5437 I32 fl = 0, f = flags;
5438 regnode * const oscan = scan;
5439 regnode_ssc this_class;
5440 regnode_ssc *oclass = NULL;
5441 I32 next_is_eval = 0;
5443 switch (PL_regkind[OP(scan)]) {
5444 case WHILEM: /* End of (?:...)* . */
5445 scan = NEXTOPER(scan);
5448 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5449 next = NEXTOPER(scan);
5450 if ( OP(next) == EXACT
5451 || OP(next) == LEXACT
5452 || OP(next) == EXACT_REQ8
5453 || OP(next) == LEXACT_REQ8
5454 || OP(next) == EXACTL
5455 || (flags & SCF_DO_STCLASS))
5458 maxcount = REG_INFTY;
5459 next = regnext(scan);
5460 scan = NEXTOPER(scan);
5464 if (flags & SCF_DO_SUBSTR)
5466 /* This will bypass the formal 'min += minnext * mincount'
5467 * calculation in the do_curly path, so assumes min width
5468 * of the PLUS payload is exactly one. */
5472 next = NEXTOPER(scan);
5474 /* This temporary node can now be turned into EXACTFU, and
5475 * must, as regexec.c doesn't handle it */
5476 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5480 if ( STR_LEN(next) == 1
5481 && isALPHA_A(* STRING(next))
5482 && ( OP(next) == EXACTFAA
5483 || ( OP(next) == EXACTFU
5484 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5487 /* These differ in just one bit */
5488 U8 mask = ~ ('A' ^ 'a');
5490 assert(isALPHA_A(* STRING(next)));
5492 /* Then replace it by an ANYOFM node, with
5493 * the mask set to the complement of the
5494 * bit that differs between upper and lower
5495 * case, and the lowest code point of the
5496 * pair (which the '&' forces) */
5498 ARG_SET(next, *STRING(next) & mask);
5502 if (flags & SCF_DO_STCLASS) {
5504 maxcount = REG_INFTY;
5505 next = regnext(scan);
5506 scan = NEXTOPER(scan);
5509 if (flags & SCF_DO_SUBSTR) {
5510 scan_commit(pRExC_state, data, minlenp, is_inf);
5511 /* Cannot extend fixed substrings */
5512 data->cur_is_floating = 1; /* float */
5514 is_inf = is_inf_internal = 1;
5515 scan = regnext(scan);
5516 goto optimize_curly_tail;
5518 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5519 && (scan->flags == stopparen))
5524 mincount = ARG1(scan);
5525 maxcount = ARG2(scan);
5527 next = regnext(scan);
5528 if (OP(scan) == CURLYX) {
5529 I32 lp = (data ? *(data->last_closep) : 0);
5530 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5532 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5533 next_is_eval = (OP(scan) == EVAL);
5535 if (flags & SCF_DO_SUBSTR) {
5537 scan_commit(pRExC_state, data, minlenp, is_inf);
5538 /* Cannot extend fixed substrings */
5539 pos_before = data->pos_min;
5543 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5545 data->flags |= SF_IS_INF;
5547 if (flags & SCF_DO_STCLASS) {
5548 ssc_init(pRExC_state, &this_class);
5549 oclass = data->start_class;
5550 data->start_class = &this_class;
5551 f |= SCF_DO_STCLASS_AND;
5552 f &= ~SCF_DO_STCLASS_OR;
5554 /* Exclude from super-linear cache processing any {n,m}
5555 regops for which the combination of input pos and regex
5556 pos is not enough information to determine if a match
5559 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5560 regex pos at the \s*, the prospects for a match depend not
5561 only on the input position but also on how many (bar\s*)
5562 repeats into the {4,8} we are. */
5563 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5564 f &= ~SCF_WHILEM_VISITED_POS;
5566 /* This will finish on WHILEM, setting scan, or on NULL: */
5567 /* recurse study_chunk() on loop bodies */
5568 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5569 last, data, stopparen, recursed_depth, NULL,
5571 ? (f & ~SCF_DO_SUBSTR)
5573 , depth+1, mutate_ok);
5575 if (flags & SCF_DO_STCLASS)
5576 data->start_class = oclass;
5577 if (mincount == 0 || minnext == 0) {
5578 if (flags & SCF_DO_STCLASS_OR) {
5579 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5581 else if (flags & SCF_DO_STCLASS_AND) {
5582 /* Switch to OR mode: cache the old value of
5583 * data->start_class */
5585 StructCopy(data->start_class, and_withp, regnode_ssc);
5586 flags &= ~SCF_DO_STCLASS_AND;
5587 StructCopy(&this_class, data->start_class, regnode_ssc);
5588 flags |= SCF_DO_STCLASS_OR;
5589 ANYOF_FLAGS(data->start_class)
5590 |= SSC_MATCHES_EMPTY_STRING;
5592 } else { /* Non-zero len */
5593 if (flags & SCF_DO_STCLASS_OR) {
5594 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5595 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5597 else if (flags & SCF_DO_STCLASS_AND)
5598 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5599 flags &= ~SCF_DO_STCLASS;
5601 if (!scan) /* It was not CURLYX, but CURLY. */
5603 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5604 /* ? quantifier ok, except for (?{ ... }) */
5605 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5606 && (minnext == 0) && (deltanext == 0)
5607 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5608 && maxcount <= REG_INFTY/3) /* Complement check for big
5611 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5612 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5613 "Quantifier unexpected on zero-length expression "
5614 "in regex m/%" UTF8f "/",
5615 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5619 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5620 || min >= SSize_t_MAX - minnext * mincount )
5622 FAIL("Regexp out of space");
5625 min += minnext * mincount;
5626 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5627 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5628 is_inf |= is_inf_internal;
5630 delta = OPTIMIZE_INFTY;
5632 delta += (minnext + deltanext) * maxcount
5633 - minnext * mincount;
5635 /* Try powerful optimization CURLYX => CURLYN. */
5636 if ( OP(oscan) == CURLYX && data
5637 && data->flags & SF_IN_PAR
5638 && !(data->flags & SF_HAS_EVAL)
5639 && !deltanext && minnext == 1
5642 /* Try to optimize to CURLYN. */
5643 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5644 regnode * const nxt1 = nxt;
5651 if (!REGNODE_SIMPLE(OP(nxt))
5652 && !(PL_regkind[OP(nxt)] == EXACT
5653 && STR_LEN(nxt) == 1))
5659 if (OP(nxt) != CLOSE)
5661 if (RExC_open_parens) {
5664 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5667 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5669 /* Now we know that nxt2 is the only contents: */
5670 oscan->flags = (U8)ARG(nxt);
5672 OP(nxt1) = NOTHING; /* was OPEN. */
5675 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5676 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5677 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5678 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5679 OP(nxt + 1) = OPTIMIZED; /* was count. */
5680 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5685 /* Try optimization CURLYX => CURLYM. */
5686 if ( OP(oscan) == CURLYX && data
5687 && !(data->flags & SF_HAS_PAR)
5688 && !(data->flags & SF_HAS_EVAL)
5689 && !deltanext /* atom is fixed width */
5690 && minnext != 0 /* CURLYM can't handle zero width */
5691 /* Nor characters whose fold at run-time may be
5692 * multi-character */
5693 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5696 /* XXXX How to optimize if data == 0? */
5697 /* Optimize to a simpler form. */
5698 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5702 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5703 && (OP(nxt2) != WHILEM))
5705 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5706 /* Need to optimize away parenths. */
5707 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5708 /* Set the parenth number. */
5709 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5711 oscan->flags = (U8)ARG(nxt);
5712 if (RExC_open_parens) {
5714 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5717 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5720 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5721 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5724 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5725 OP(nxt + 1) = OPTIMIZED; /* was count. */
5726 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5727 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5730 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5731 regnode *nnxt = regnext(nxt1);
5733 if (reg_off_by_arg[OP(nxt1)])
5734 ARG_SET(nxt1, nxt2 - nxt1);
5735 else if (nxt2 - nxt1 < U16_MAX)
5736 NEXT_OFF(nxt1) = nxt2 - nxt1;
5738 OP(nxt) = NOTHING; /* Cannot beautify */
5743 /* Optimize again: */
5744 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5745 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5746 NULL, stopparen, recursed_depth, NULL, 0,
5747 depth+1, mutate_ok);
5752 else if ((OP(oscan) == CURLYX)
5753 && (flags & SCF_WHILEM_VISITED_POS)
5754 /* See the comment on a similar expression above.
5755 However, this time it's not a subexpression
5756 we care about, but the expression itself. */
5757 && (maxcount == REG_INFTY)
5759 /* This stays as CURLYX, we can put the count/of pair. */
5760 /* Find WHILEM (as in regexec.c) */
5761 regnode *nxt = oscan + NEXT_OFF(oscan);
5763 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5765 nxt = PREVOPER(nxt);
5766 if (nxt->flags & 0xf) {
5767 /* we've already set whilem count on this node */
5768 } else if (++data->whilem_c < 16) {
5769 assert(data->whilem_c <= RExC_whilem_seen);
5770 nxt->flags = (U8)(data->whilem_c
5771 | (RExC_whilem_seen << 4)); /* On WHILEM */
5774 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5776 if (flags & SCF_DO_SUBSTR) {
5777 SV *last_str = NULL;
5778 STRLEN last_chrs = 0;
5779 int counted = mincount != 0;
5781 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5783 SSize_t b = pos_before >= data->last_start_min
5784 ? pos_before : data->last_start_min;
5786 const char * const s = SvPV_const(data->last_found, l);
5787 SSize_t old = b - data->last_start_min;
5791 old = utf8_hop_forward((U8*)s, old,
5792 (U8 *) SvEND(data->last_found))
5795 /* Get the added string: */
5796 last_str = newSVpvn_utf8(s + old, l, UTF);
5797 last_chrs = UTF ? utf8_length((U8*)(s + old),
5798 (U8*)(s + old + l)) : l;
5799 if (deltanext == 0 && pos_before == b) {
5800 /* What was added is a constant string */
5803 SvGROW(last_str, (mincount * l) + 1);
5804 repeatcpy(SvPVX(last_str) + l,
5805 SvPVX_const(last_str), l,
5807 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5808 /* Add additional parts. */
5809 SvCUR_set(data->last_found,
5810 SvCUR(data->last_found) - l);
5811 sv_catsv(data->last_found, last_str);
5813 SV * sv = data->last_found;
5815 SvUTF8(sv) && SvMAGICAL(sv) ?
5816 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5817 if (mg && mg->mg_len >= 0)
5818 mg->mg_len += last_chrs * (mincount-1);
5820 last_chrs *= mincount;
5821 data->last_end += l * (mincount - 1);
5824 /* start offset must point into the last copy */
5825 data->last_start_min += minnext * (mincount - 1);
5826 data->last_start_max =
5829 : data->last_start_max +
5830 (maxcount - 1) * (minnext + data->pos_delta);
5833 /* It is counted once already... */
5834 data->pos_min += minnext * (mincount - counted);
5836 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5837 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5838 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5839 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5841 if (deltanext != OPTIMIZE_INFTY)
5842 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5843 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5844 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5846 if (deltanext == OPTIMIZE_INFTY
5847 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5848 data->pos_delta = OPTIMIZE_INFTY;
5850 data->pos_delta += - counted * deltanext +
5851 (minnext + deltanext) * maxcount - minnext * mincount;
5852 if (mincount != maxcount) {
5853 /* Cannot extend fixed substrings found inside
5855 scan_commit(pRExC_state, data, minlenp, is_inf);
5856 if (mincount && last_str) {
5857 SV * const sv = data->last_found;
5858 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5859 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5863 sv_setsv(sv, last_str);
5864 data->last_end = data->pos_min;
5865 data->last_start_min = data->pos_min - last_chrs;
5866 data->last_start_max = is_inf
5868 : data->pos_min + data->pos_delta - last_chrs;
5870 data->cur_is_floating = 1; /* float */
5872 SvREFCNT_dec(last_str);
5874 if (data && (fl & SF_HAS_EVAL))
5875 data->flags |= SF_HAS_EVAL;
5876 optimize_curly_tail:
5877 rck_elide_nothing(oscan);
5881 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5885 if (flags & SCF_DO_SUBSTR) {
5886 /* Cannot expect anything... */
5887 scan_commit(pRExC_state, data, minlenp, is_inf);
5888 data->cur_is_floating = 1; /* float */
5890 is_inf = is_inf_internal = 1;
5891 if (flags & SCF_DO_STCLASS_OR) {
5892 if (OP(scan) == CLUMP) {
5893 /* Actually is any start char, but very few code points
5894 * aren't start characters */
5895 ssc_match_all_cp(data->start_class);
5898 ssc_anything(data->start_class);
5901 flags &= ~SCF_DO_STCLASS;
5905 else if (OP(scan) == LNBREAK) {
5906 if (flags & SCF_DO_STCLASS) {
5907 if (flags & SCF_DO_STCLASS_AND) {
5908 ssc_intersection(data->start_class,
5909 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5910 ssc_clear_locale(data->start_class);
5911 ANYOF_FLAGS(data->start_class)
5912 &= ~SSC_MATCHES_EMPTY_STRING;
5914 else if (flags & SCF_DO_STCLASS_OR) {
5915 ssc_union(data->start_class,
5916 PL_XPosix_ptrs[_CC_VERTSPACE],
5918 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5920 /* See commit msg for
5921 * 749e076fceedeb708a624933726e7989f2302f6a */
5922 ANYOF_FLAGS(data->start_class)
5923 &= ~SSC_MATCHES_EMPTY_STRING;
5925 flags &= ~SCF_DO_STCLASS;
5928 if (delta != OPTIMIZE_INFTY)
5929 delta++; /* Because of the 2 char string cr-lf */
5930 if (flags & SCF_DO_SUBSTR) {
5931 /* Cannot expect anything... */
5932 scan_commit(pRExC_state, data, minlenp, is_inf);
5934 if (data->pos_delta != OPTIMIZE_INFTY) {
5935 data->pos_delta += 1;
5937 data->cur_is_floating = 1; /* float */
5940 else if (REGNODE_SIMPLE(OP(scan))) {
5942 if (flags & SCF_DO_SUBSTR) {
5943 scan_commit(pRExC_state, data, minlenp, is_inf);
5947 if (flags & SCF_DO_STCLASS) {
5949 SV* my_invlist = NULL;
5952 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5953 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5955 /* Some of the logic below assumes that switching
5956 locale on will only add false positives. */
5961 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5965 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5966 ssc_match_all_cp(data->start_class);
5971 SV* REG_ANY_invlist = _new_invlist(2);
5972 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5974 if (flags & SCF_DO_STCLASS_OR) {
5975 ssc_union(data->start_class,
5977 TRUE /* TRUE => invert, hence all but \n
5981 else if (flags & SCF_DO_STCLASS_AND) {
5982 ssc_intersection(data->start_class,
5984 TRUE /* TRUE => invert */
5986 ssc_clear_locale(data->start_class);
5988 SvREFCNT_dec_NN(REG_ANY_invlist);
6000 if (flags & SCF_DO_STCLASS_AND)
6001 ssc_and(pRExC_state, data->start_class,
6002 (regnode_charclass *) scan);
6004 ssc_or(pRExC_state, data->start_class,
6005 (regnode_charclass *) scan);
6008 case NANYOFM: /* NANYOFM already contains the inversion of the
6009 input ANYOF data, so, unlike things like
6010 NPOSIXA, don't change 'invert' to TRUE */
6014 SV* cp_list = get_ANYOFM_contents(scan);
6016 if (flags & SCF_DO_STCLASS_OR) {
6017 ssc_union(data->start_class, cp_list, invert);
6019 else if (flags & SCF_DO_STCLASS_AND) {
6020 ssc_intersection(data->start_class, cp_list, invert);
6023 SvREFCNT_dec_NN(cp_list);
6032 cp_list = _add_range_to_invlist(cp_list,
6034 ANYOFRbase(scan) + ANYOFRdelta(scan));
6036 if (flags & SCF_DO_STCLASS_OR) {
6037 ssc_union(data->start_class, cp_list, invert);
6039 else if (flags & SCF_DO_STCLASS_AND) {
6040 ssc_intersection(data->start_class, cp_list, invert);
6043 SvREFCNT_dec_NN(cp_list);
6052 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6053 if (flags & SCF_DO_STCLASS_AND) {
6054 bool was_there = cBOOL(
6055 ANYOF_POSIXL_TEST(data->start_class,
6057 ANYOF_POSIXL_ZERO(data->start_class);
6058 if (was_there) { /* Do an AND */
6059 ANYOF_POSIXL_SET(data->start_class, namedclass);
6061 /* No individual code points can now match */
6062 data->start_class->invlist
6063 = sv_2mortal(_new_invlist(0));
6066 int complement = namedclass + ((invert) ? -1 : 1);
6068 assert(flags & SCF_DO_STCLASS_OR);
6070 /* If the complement of this class was already there,
6071 * the result is that they match all code points,
6072 * (\d + \D == everything). Remove the classes from
6073 * future consideration. Locale is not relevant in
6075 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6076 ssc_match_all_cp(data->start_class);
6077 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6078 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6080 else { /* The usual case; just add this class to the
6082 ANYOF_POSIXL_SET(data->start_class, namedclass);
6087 case NPOSIXA: /* For these, we always know the exact set of
6092 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6093 goto join_posix_and_ascii;
6101 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6103 /* NPOSIXD matches all upper Latin1 code points unless the
6104 * target string being matched is UTF-8, which is
6105 * unknowable until match time. Since we are going to
6106 * invert, we want to get rid of all of them so that the
6107 * inversion will match all */
6108 if (OP(scan) == NPOSIXD) {
6109 _invlist_subtract(my_invlist, PL_UpperLatin1,
6113 join_posix_and_ascii:
6115 if (flags & SCF_DO_STCLASS_AND) {
6116 ssc_intersection(data->start_class, my_invlist, invert);
6117 ssc_clear_locale(data->start_class);
6120 assert(flags & SCF_DO_STCLASS_OR);
6121 ssc_union(data->start_class, my_invlist, invert);
6123 SvREFCNT_dec(my_invlist);
6125 if (flags & SCF_DO_STCLASS_OR)
6126 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6127 flags &= ~SCF_DO_STCLASS;
6130 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6131 data->flags |= (OP(scan) == MEOL
6134 scan_commit(pRExC_state, data, minlenp, is_inf);
6137 else if ( PL_regkind[OP(scan)] == BRANCHJ
6138 /* Lookbehind, or need to calculate parens/evals/stclass: */
6139 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6140 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6142 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6143 || OP(scan) == UNLESSM )
6145 /* Negative Lookahead/lookbehind
6146 In this case we can't do fixed string optimisation.
6149 SSize_t deltanext, minnext, fake = 0;
6154 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6156 data_fake.whilem_c = data->whilem_c;
6157 data_fake.last_closep = data->last_closep;
6160 data_fake.last_closep = &fake;
6161 data_fake.pos_delta = delta;
6162 if ( flags & SCF_DO_STCLASS && !scan->flags
6163 && OP(scan) == IFMATCH ) { /* Lookahead */
6164 ssc_init(pRExC_state, &intrnl);
6165 data_fake.start_class = &intrnl;
6166 f |= SCF_DO_STCLASS_AND;
6168 if (flags & SCF_WHILEM_VISITED_POS)
6169 f |= SCF_WHILEM_VISITED_POS;
6170 next = regnext(scan);
6171 nscan = NEXTOPER(NEXTOPER(scan));
6173 /* recurse study_chunk() for lookahead body */
6174 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6175 last, &data_fake, stopparen,
6176 recursed_depth, NULL, f, depth+1,
6180 || deltanext > (I32) U8_MAX
6181 || minnext > (I32)U8_MAX
6182 || minnext + deltanext > (I32)U8_MAX)
6184 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6188 /* The 'next_off' field has been repurposed to count the
6189 * additional starting positions to try beyond the initial
6190 * one. (This leaves it at 0 for non-variable length
6191 * matches to avoid breakage for those not using this
6194 scan->next_off = deltanext;
6195 ckWARNexperimental(RExC_parse,
6196 WARN_EXPERIMENTAL__VLB,
6197 "Variable length lookbehind is experimental");
6199 scan->flags = (U8)minnext + deltanext;
6202 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6204 if (data_fake.flags & SF_HAS_EVAL)
6205 data->flags |= SF_HAS_EVAL;
6206 data->whilem_c = data_fake.whilem_c;
6208 if (f & SCF_DO_STCLASS_AND) {
6209 if (flags & SCF_DO_STCLASS_OR) {
6210 /* OR before, AND after: ideally we would recurse with
6211 * data_fake to get the AND applied by study of the
6212 * remainder of the pattern, and then derecurse;
6213 * *** HACK *** for now just treat as "no information".
6214 * See [perl #56690].
6216 ssc_init(pRExC_state, data->start_class);
6218 /* AND before and after: combine and continue. These
6219 * assertions are zero-length, so can match an EMPTY
6221 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6222 ANYOF_FLAGS(data->start_class)
6223 |= SSC_MATCHES_EMPTY_STRING;
6227 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6229 /* Positive Lookahead/lookbehind
6230 In this case we can do fixed string optimisation,
6231 but we must be careful about it. Note in the case of
6232 lookbehind the positions will be offset by the minimum
6233 length of the pattern, something we won't know about
6234 until after the recurse.
6236 SSize_t deltanext, fake = 0;
6240 /* We use SAVEFREEPV so that when the full compile
6241 is finished perl will clean up the allocated
6242 minlens when it's all done. This way we don't
6243 have to worry about freeing them when we know
6244 they wont be used, which would be a pain.
6247 Newx( minnextp, 1, SSize_t );
6248 SAVEFREEPV(minnextp);
6251 StructCopy(data, &data_fake, scan_data_t);
6252 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6255 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6256 data_fake.last_found=newSVsv(data->last_found);
6260 data_fake.last_closep = &fake;
6261 data_fake.flags = 0;
6262 data_fake.substrs[0].flags = 0;
6263 data_fake.substrs[1].flags = 0;
6264 data_fake.pos_delta = delta;
6266 data_fake.flags |= SF_IS_INF;
6267 if ( flags & SCF_DO_STCLASS && !scan->flags
6268 && OP(scan) == IFMATCH ) { /* Lookahead */
6269 ssc_init(pRExC_state, &intrnl);
6270 data_fake.start_class = &intrnl;
6271 f |= SCF_DO_STCLASS_AND;
6273 if (flags & SCF_WHILEM_VISITED_POS)
6274 f |= SCF_WHILEM_VISITED_POS;
6275 next = regnext(scan);
6276 nscan = NEXTOPER(NEXTOPER(scan));
6278 /* positive lookahead study_chunk() recursion */
6279 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6280 &deltanext, last, &data_fake,
6281 stopparen, recursed_depth, NULL,
6282 f, depth+1, mutate_ok);
6284 assert(0); /* This code has never been tested since this
6285 is normally not compiled */
6287 || deltanext > (I32) U8_MAX
6288 || *minnextp > (I32)U8_MAX
6289 || *minnextp + deltanext > (I32)U8_MAX)
6291 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6296 scan->next_off = deltanext;
6298 scan->flags = (U8)*minnextp + deltanext;
6303 if (f & SCF_DO_STCLASS_AND) {
6304 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6305 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6308 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6310 if (data_fake.flags & SF_HAS_EVAL)
6311 data->flags |= SF_HAS_EVAL;
6312 data->whilem_c = data_fake.whilem_c;
6313 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6315 if (RExC_rx->minlen<*minnextp)
6316 RExC_rx->minlen=*minnextp;
6317 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6318 SvREFCNT_dec_NN(data_fake.last_found);
6320 for (i = 0; i < 2; i++) {
6321 if (data_fake.substrs[i].minlenp != minlenp) {
6322 data->substrs[i].min_offset =
6323 data_fake.substrs[i].min_offset;
6324 data->substrs[i].max_offset =
6325 data_fake.substrs[i].max_offset;
6326 data->substrs[i].minlenp =
6327 data_fake.substrs[i].minlenp;
6328 data->substrs[i].lookbehind += scan->flags;
6336 else if (OP(scan) == OPEN) {
6337 if (stopparen != (I32)ARG(scan))
6340 else if (OP(scan) == CLOSE) {
6341 if (stopparen == (I32)ARG(scan)) {
6344 if ((I32)ARG(scan) == is_par) {
6345 next = regnext(scan);
6347 if ( next && (OP(next) != WHILEM) && next < last)
6348 is_par = 0; /* Disable optimization */
6351 *(data->last_closep) = ARG(scan);
6353 else if (OP(scan) == EVAL) {
6355 data->flags |= SF_HAS_EVAL;
6357 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6358 if (flags & SCF_DO_SUBSTR) {
6359 scan_commit(pRExC_state, data, minlenp, is_inf);
6360 flags &= ~SCF_DO_SUBSTR;
6362 if (data && OP(scan)==ACCEPT) {
6363 data->flags |= SCF_SEEN_ACCEPT;
6368 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6370 if (flags & SCF_DO_SUBSTR) {
6371 scan_commit(pRExC_state, data, minlenp, is_inf);
6372 data->cur_is_floating = 1; /* float */
6374 is_inf = is_inf_internal = 1;
6375 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6376 ssc_anything(data->start_class);
6377 flags &= ~SCF_DO_STCLASS;
6379 else if (OP(scan) == GPOS) {
6380 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6381 !(delta || is_inf || (data && data->pos_delta)))
6383 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6384 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6385 if (RExC_rx->gofs < (STRLEN)min)
6386 RExC_rx->gofs = min;
6388 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6392 #ifdef TRIE_STUDY_OPT
6393 #ifdef FULL_TRIE_STUDY
6394 else if (PL_regkind[OP(scan)] == TRIE) {
6395 /* NOTE - There is similar code to this block above for handling
6396 BRANCH nodes on the initial study. If you change stuff here
6398 regnode *trie_node= scan;
6399 regnode *tail= regnext(scan);
6400 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6401 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6404 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6405 /* Cannot merge strings after this. */
6406 scan_commit(pRExC_state, data, minlenp, is_inf);
6408 if (flags & SCF_DO_STCLASS)
6409 ssc_init_zero(pRExC_state, &accum);
6415 const regnode *nextbranch= NULL;
6418 for ( word=1 ; word <= trie->wordcount ; word++)
6420 SSize_t deltanext=0, minnext=0, f = 0, fake;
6421 regnode_ssc this_class;
6423 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6425 data_fake.whilem_c = data->whilem_c;
6426 data_fake.last_closep = data->last_closep;
6429 data_fake.last_closep = &fake;
6430 data_fake.pos_delta = delta;
6431 if (flags & SCF_DO_STCLASS) {
6432 ssc_init(pRExC_state, &this_class);
6433 data_fake.start_class = &this_class;
6434 f = SCF_DO_STCLASS_AND;
6436 if (flags & SCF_WHILEM_VISITED_POS)
6437 f |= SCF_WHILEM_VISITED_POS;
6439 if (trie->jump[word]) {
6441 nextbranch = trie_node + trie->jump[0];
6442 scan= trie_node + trie->jump[word];
6443 /* We go from the jump point to the branch that follows
6444 it. Note this means we need the vestigal unused
6445 branches even though they arent otherwise used. */
6446 /* optimise study_chunk() for TRIE */
6447 minnext = study_chunk(pRExC_state, &scan, minlenp,
6448 &deltanext, (regnode *)nextbranch, &data_fake,
6449 stopparen, recursed_depth, NULL, f, depth+1,
6452 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6453 nextbranch= regnext((regnode*)nextbranch);
6455 if (min1 > (SSize_t)(minnext + trie->minlen))
6456 min1 = minnext + trie->minlen;
6457 if (deltanext == OPTIMIZE_INFTY) {
6458 is_inf = is_inf_internal = 1;
6459 max1 = OPTIMIZE_INFTY;
6460 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6461 max1 = minnext + deltanext + trie->maxlen;
6463 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6465 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6466 if ( stopmin > min + min1)
6467 stopmin = min + min1;
6468 flags &= ~SCF_DO_SUBSTR;
6470 data->flags |= SCF_SEEN_ACCEPT;
6473 if (data_fake.flags & SF_HAS_EVAL)
6474 data->flags |= SF_HAS_EVAL;
6475 data->whilem_c = data_fake.whilem_c;
6477 if (flags & SCF_DO_STCLASS)
6478 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6481 if (flags & SCF_DO_SUBSTR) {
6482 data->pos_min += min1;
6483 data->pos_delta += max1 - min1;
6484 if (max1 != min1 || is_inf)
6485 data->cur_is_floating = 1; /* float */
6488 if (delta != OPTIMIZE_INFTY) {
6489 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6490 delta += max1 - min1;
6492 delta = OPTIMIZE_INFTY;
6494 if (flags & SCF_DO_STCLASS_OR) {
6495 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6497 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6498 flags &= ~SCF_DO_STCLASS;
6501 else if (flags & SCF_DO_STCLASS_AND) {
6503 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6504 flags &= ~SCF_DO_STCLASS;
6507 /* Switch to OR mode: cache the old value of
6508 * data->start_class */
6510 StructCopy(data->start_class, and_withp, regnode_ssc);
6511 flags &= ~SCF_DO_STCLASS_AND;
6512 StructCopy(&accum, data->start_class, regnode_ssc);
6513 flags |= SCF_DO_STCLASS_OR;
6520 else if (PL_regkind[OP(scan)] == TRIE) {
6521 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6524 min += trie->minlen;
6525 delta += (trie->maxlen - trie->minlen);
6526 flags &= ~SCF_DO_STCLASS; /* xxx */
6527 if (flags & SCF_DO_SUBSTR) {
6528 /* Cannot expect anything... */
6529 scan_commit(pRExC_state, data, minlenp, is_inf);
6530 data->pos_min += trie->minlen;
6531 data->pos_delta += (trie->maxlen - trie->minlen);
6532 if (trie->maxlen != trie->minlen)
6533 data->cur_is_floating = 1; /* float */
6535 if (trie->jump) /* no more substrings -- for now /grr*/
6536 flags &= ~SCF_DO_SUBSTR;
6538 else if (OP(scan) == REGEX_SET) {
6539 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6540 " before optimization", reg_name[REGEX_SET]);
6543 #endif /* old or new */
6544 #endif /* TRIE_STUDY_OPT */
6546 /* Else: zero-length, ignore. */
6547 scan = regnext(scan);
6552 /* we need to unwind recursion. */
6555 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6556 DEBUG_PEEP("fend", scan, depth, flags);
6558 /* restore previous context */
6559 last = frame->last_regnode;
6560 scan = frame->next_regnode;
6561 stopparen = frame->stopparen;
6562 recursed_depth = frame->prev_recursed_depth;
6564 RExC_frame_last = frame->prev_frame;
6565 frame = frame->this_prev_frame;
6566 goto fake_study_recurse;
6570 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6573 *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6575 if (flags & SCF_DO_SUBSTR && is_inf)
6576 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6577 if (is_par > (I32)U8_MAX)
6579 if (is_par && pars==1 && data) {
6580 data->flags |= SF_IN_PAR;
6581 data->flags &= ~SF_HAS_PAR;
6583 else if (pars && data) {
6584 data->flags |= SF_HAS_PAR;
6585 data->flags &= ~SF_IN_PAR;
6587 if (flags & SCF_DO_STCLASS_OR)
6588 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6589 if (flags & SCF_TRIE_RESTUDY)
6590 data->flags |= SCF_TRIE_RESTUDY;
6592 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6594 final_minlen = min < stopmin
6597 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6598 if (final_minlen > OPTIMIZE_INFTY - delta)
6599 RExC_maxlen = OPTIMIZE_INFTY;
6600 else if (RExC_maxlen < final_minlen + delta)
6601 RExC_maxlen = final_minlen + delta;
6603 return final_minlen;
6607 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6609 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6611 PERL_ARGS_ASSERT_ADD_DATA;
6613 Renewc(RExC_rxi->data,
6614 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6615 char, struct reg_data);
6617 Renew(RExC_rxi->data->what, count + n, U8);
6619 Newx(RExC_rxi->data->what, n, U8);
6620 RExC_rxi->data->count = count + n;
6621 Copy(s, RExC_rxi->data->what + count, n, U8);
6625 /*XXX: todo make this not included in a non debugging perl, but appears to be
6626 * used anyway there, in 'use re' */
6627 #ifndef PERL_IN_XSUB_RE
6629 Perl_reginitcolors(pTHX)
6631 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6633 char *t = savepv(s);
6637 t = strchr(t, '\t');
6643 PL_colors[i] = t = (char *)"";
6648 PL_colors[i++] = (char *)"";
6655 #ifdef TRIE_STUDY_OPT
6656 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6659 (data.flags & SCF_TRIE_RESTUDY) \
6667 #define CHECK_RESTUDY_GOTO_butfirst
6671 * pregcomp - compile a regular expression into internal code
6673 * Decides which engine's compiler to call based on the hint currently in
6677 #ifndef PERL_IN_XSUB_RE
6679 /* return the currently in-scope regex engine (or the default if none) */
6681 regexp_engine const *
6682 Perl_current_re_engine(pTHX)
6684 if (IN_PERL_COMPILETIME) {
6685 HV * const table = GvHV(PL_hintgv);
6688 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6689 return &PL_core_reg_engine;
6690 ptr = hv_fetchs(table, "regcomp", FALSE);
6691 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6692 return &PL_core_reg_engine;
6693 return INT2PTR(regexp_engine*, SvIV(*ptr));
6697 if (!PL_curcop->cop_hints_hash)
6698 return &PL_core_reg_engine;
6699 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6700 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6701 return &PL_core_reg_engine;
6702 return INT2PTR(regexp_engine*, SvIV(ptr));
6708 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6710 regexp_engine const *eng = current_re_engine();
6711 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6713 PERL_ARGS_ASSERT_PREGCOMP;
6715 /* Dispatch a request to compile a regexp to correct regexp engine. */
6717 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6720 return CALLREGCOMP_ENG(eng, pattern, flags);
6724 /* public(ish) entry point for the perl core's own regex compiling code.
6725 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6726 * pattern rather than a list of OPs, and uses the internal engine rather
6727 * than the current one */
6730 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6732 SV *pat = pattern; /* defeat constness! */
6734 PERL_ARGS_ASSERT_RE_COMPILE;
6736 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6737 #ifdef PERL_IN_XSUB_RE
6740 &PL_core_reg_engine,
6742 NULL, NULL, rx_flags, 0);
6746 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6750 if (--cbs->refcnt > 0)
6752 for (n = 0; n < cbs->count; n++) {
6753 REGEXP *rx = cbs->cb[n].src_regex;
6755 cbs->cb[n].src_regex = NULL;
6756 SvREFCNT_dec_NN(rx);
6764 static struct reg_code_blocks *
6765 S_alloc_code_blocks(pTHX_ int ncode)
6767 struct reg_code_blocks *cbs;
6768 Newx(cbs, 1, struct reg_code_blocks);
6771 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6773 Newx(cbs->cb, ncode, struct reg_code_block);
6780 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6781 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6782 * point to the realloced string and length.
6784 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6788 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6789 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6791 U8 *const src = (U8*)*pat_p;
6796 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6798 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6799 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6801 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6802 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6805 while (s < *plen_p) {
6806 append_utf8_from_native_byte(src[s], &d);
6808 if (n < num_code_blocks) {
6809 assert(pRExC_state->code_blocks);
6810 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6811 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6812 assert(*(d - 1) == '(');
6815 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6816 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6817 assert(*(d - 1) == ')');
6826 *pat_p = (char*) dst;
6828 RExC_orig_utf8 = RExC_utf8 = 1;
6833 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6834 * while recording any code block indices, and handling overloading,
6835 * nested qr// objects etc. If pat is null, it will allocate a new
6836 * string, or just return the first arg, if there's only one.
6838 * Returns the malloced/updated pat.
6839 * patternp and pat_count is the array of SVs to be concatted;
6840 * oplist is the optional list of ops that generated the SVs;
6841 * recompile_p is a pointer to a boolean that will be set if
6842 * the regex will need to be recompiled.
6843 * delim, if non-null is an SV that will be inserted between each element
6847 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6848 SV *pat, SV ** const patternp, int pat_count,
6849 OP *oplist, bool *recompile_p, SV *delim)
6853 bool use_delim = FALSE;
6854 bool alloced = FALSE;
6856 /* if we know we have at least two args, create an empty string,
6857 * then concatenate args to that. For no args, return an empty string */
6858 if (!pat && pat_count != 1) {
6864 for (svp = patternp; svp < patternp + pat_count; svp++) {
6867 STRLEN orig_patlen = 0;
6869 SV *msv = use_delim ? delim : *svp;
6870 if (!msv) msv = &PL_sv_undef;
6872 /* if we've got a delimiter, we go round the loop twice for each
6873 * svp slot (except the last), using the delimiter the second
6882 if (SvTYPE(msv) == SVt_PVAV) {
6883 /* we've encountered an interpolated array within
6884 * the pattern, e.g. /...@a..../. Expand the list of elements,
6885 * then recursively append elements.
6886 * The code in this block is based on S_pushav() */
6888 AV *const av = (AV*)msv;
6889 const SSize_t maxarg = AvFILL(av) + 1;
6893 assert(oplist->op_type == OP_PADAV
6894 || oplist->op_type == OP_RV2AV);
6895 oplist = OpSIBLING(oplist);
6898 if (SvRMAGICAL(av)) {
6901 Newx(array, maxarg, SV*);
6903 for (i=0; i < maxarg; i++) {
6904 SV ** const svp = av_fetch(av, i, FALSE);
6905 array[i] = svp ? *svp : &PL_sv_undef;
6909 array = AvARRAY(av);
6911 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6912 array, maxarg, NULL, recompile_p,
6914 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6920 /* we make the assumption here that each op in the list of
6921 * op_siblings maps to one SV pushed onto the stack,
6922 * except for code blocks, with have both an OP_NULL and
6924 * This allows us to match up the list of SVs against the
6925 * list of OPs to find the next code block.
6927 * Note that PUSHMARK PADSV PADSV ..
6929 * PADRANGE PADSV PADSV ..
6930 * so the alignment still works. */
6933 if (oplist->op_type == OP_NULL
6934 && (oplist->op_flags & OPf_SPECIAL))
6936 assert(n < pRExC_state->code_blocks->count);
6937 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6938 pRExC_state->code_blocks->cb[n].block = oplist;
6939 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6942 oplist = OpSIBLING(oplist); /* skip CONST */
6945 oplist = OpSIBLING(oplist);;
6948 /* apply magic and QR overloading to arg */
6951 if (SvROK(msv) && SvAMAGIC(msv)) {
6952 SV *sv = AMG_CALLunary(msv, regexp_amg);
6956 if (SvTYPE(sv) != SVt_REGEXP)
6957 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6962 /* try concatenation overload ... */
6963 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6964 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6967 /* overloading involved: all bets are off over literal
6968 * code. Pretend we haven't seen it */
6970 pRExC_state->code_blocks->count -= n;
6974 /* ... or failing that, try "" overload */
6975 while (SvAMAGIC(msv)
6976 && (sv = AMG_CALLunary(msv, string_amg))
6980 && SvRV(msv) == SvRV(sv))
6985 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6989 /* this is a partially unrolled
6990 * sv_catsv_nomg(pat, msv);
6991 * that allows us to adjust code block indices if
6994 char *dst = SvPV_force_nomg(pat, dlen);
6996 if (SvUTF8(msv) && !SvUTF8(pat)) {
6997 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6998 sv_setpvn(pat, dst, dlen);
7001 sv_catsv_nomg(pat, msv);
7005 /* We have only one SV to process, but we need to verify
7006 * it is properly null terminated or we will fail asserts
7007 * later. In theory we probably shouldn't get such SV's,
7008 * but if we do we should handle it gracefully. */
7009 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7010 /* not a string, or a string with a trailing null */
7013 /* a string with no trailing null, we need to copy it
7014 * so it has a trailing null */
7015 pat = sv_2mortal(newSVsv(msv));
7020 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7023 /* extract any code blocks within any embedded qr//'s */
7024 if (rx && SvTYPE(rx) == SVt_REGEXP
7025 && RX_ENGINE((REGEXP*)rx)->op_comp)
7028 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7029 if (ri->code_blocks && ri->code_blocks->count) {
7031 /* the presence of an embedded qr// with code means
7032 * we should always recompile: the text of the
7033 * qr// may not have changed, but it may be a
7034 * different closure than last time */
7036 if (pRExC_state->code_blocks) {
7037 int new_count = pRExC_state->code_blocks->count
7038 + ri->code_blocks->count;
7039 Renew(pRExC_state->code_blocks->cb,
7040 new_count, struct reg_code_block);
7041 pRExC_state->code_blocks->count = new_count;
7044 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7045 ri->code_blocks->count);
7047 for (i=0; i < ri->code_blocks->count; i++) {
7048 struct reg_code_block *src, *dst;
7049 STRLEN offset = orig_patlen
7050 + ReANY((REGEXP *)rx)->pre_prefix;
7051 assert(n < pRExC_state->code_blocks->count);
7052 src = &ri->code_blocks->cb[i];
7053 dst = &pRExC_state->code_blocks->cb[n];
7054 dst->start = src->start + offset;
7055 dst->end = src->end + offset;
7056 dst->block = src->block;
7057 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7066 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7075 /* see if there are any run-time code blocks in the pattern.
7076 * False positives are allowed */
7079 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7080 char *pat, STRLEN plen)
7085 PERL_UNUSED_CONTEXT;
7087 for (s = 0; s < plen; s++) {
7088 if ( pRExC_state->code_blocks
7089 && n < pRExC_state->code_blocks->count
7090 && s == pRExC_state->code_blocks->cb[n].start)
7092 s = pRExC_state->code_blocks->cb[n].end;
7096 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7098 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7100 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7107 /* Handle run-time code blocks. We will already have compiled any direct
7108 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7109 * copy of it, but with any literal code blocks blanked out and
7110 * appropriate chars escaped; then feed it into
7112 * eval "qr'modified_pattern'"
7116 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7120 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7122 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7123 * and merge them with any code blocks of the original regexp.
7125 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7126 * instead, just save the qr and return FALSE; this tells our caller that
7127 * the original pattern needs upgrading to utf8.
7131 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7132 char *pat, STRLEN plen)
7136 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7138 if (pRExC_state->runtime_code_qr) {
7139 /* this is the second time we've been called; this should
7140 * only happen if the main pattern got upgraded to utf8
7141 * during compilation; re-use the qr we compiled first time
7142 * round (which should be utf8 too)
7144 qr = pRExC_state->runtime_code_qr;
7145 pRExC_state->runtime_code_qr = NULL;
7146 assert(RExC_utf8 && SvUTF8(qr));
7152 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7156 /* determine how many extra chars we need for ' and \ escaping */
7157 for (s = 0; s < plen; s++) {
7158 if (pat[s] == '\'' || pat[s] == '\\')
7162 Newx(newpat, newlen, char);
7164 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7166 for (s = 0; s < plen; s++) {
7167 if ( pRExC_state->code_blocks
7168 && n < pRExC_state->code_blocks->count
7169 && s == pRExC_state->code_blocks->cb[n].start)
7171 /* blank out literal code block so that they aren't
7172 * recompiled: eg change from/to:
7182 assert(pat[s] == '(');
7183 assert(pat[s+1] == '?');
7187 while (s < pRExC_state->code_blocks->cb[n].end) {
7195 if (pat[s] == '\'' || pat[s] == '\\')
7200 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7202 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7208 Perl_re_printf( aTHX_
7209 "%sre-parsing pattern for runtime code:%s %s\n",
7210 PL_colors[4], PL_colors[5], newpat);
7213 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7219 PUSHSTACKi(PERLSI_REQUIRE);
7220 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7221 * parsing qr''; normally only q'' does this. It also alters
7223 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7224 SvREFCNT_dec_NN(sv);
7229 SV * const errsv = ERRSV;
7230 if (SvTRUE_NN(errsv))
7231 /* use croak_sv ? */
7232 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7234 assert(SvROK(qr_ref));
7236 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7237 /* the leaving below frees the tmp qr_ref.
7238 * Give qr a life of its own */
7246 if (!RExC_utf8 && SvUTF8(qr)) {
7247 /* first time through; the pattern got upgraded; save the
7248 * qr for the next time through */
7249 assert(!pRExC_state->runtime_code_qr);
7250 pRExC_state->runtime_code_qr = qr;
7255 /* extract any code blocks within the returned qr// */
7258 /* merge the main (r1) and run-time (r2) code blocks into one */
7260 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7261 struct reg_code_block *new_block, *dst;
7262 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7266 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7268 SvREFCNT_dec_NN(qr);
7272 if (!r1->code_blocks)
7273 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7275 r1c = r1->code_blocks->count;
7276 r2c = r2->code_blocks->count;
7278 Newx(new_block, r1c + r2c, struct reg_code_block);
7282 while (i1 < r1c || i2 < r2c) {
7283 struct reg_code_block *src;
7287 src = &r2->code_blocks->cb[i2++];
7291 src = &r1->code_blocks->cb[i1++];
7292 else if ( r1->code_blocks->cb[i1].start
7293 < r2->code_blocks->cb[i2].start)
7295 src = &r1->code_blocks->cb[i1++];
7296 assert(src->end < r2->code_blocks->cb[i2].start);
7299 assert( r1->code_blocks->cb[i1].start
7300 > r2->code_blocks->cb[i2].start);
7301 src = &r2->code_blocks->cb[i2++];
7303 assert(src->end < r1->code_blocks->cb[i1].start);
7306 assert(pat[src->start] == '(');
7307 assert(pat[src->end] == ')');
7308 dst->start = src->start;
7309 dst->end = src->end;
7310 dst->block = src->block;
7311 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7315 r1->code_blocks->count += r2c;
7316 Safefree(r1->code_blocks->cb);
7317 r1->code_blocks->cb = new_block;
7320 SvREFCNT_dec_NN(qr);
7326 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7327 struct reg_substr_datum *rsd,
7328 struct scan_data_substrs *sub,
7329 STRLEN longest_length)
7331 /* This is the common code for setting up the floating and fixed length
7332 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7333 * as to whether succeeded or not */
7337 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7338 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7340 if (! (longest_length
7341 || (eol /* Can't have SEOL and MULTI */
7342 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7344 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7345 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7350 /* copy the information about the longest from the reg_scan_data
7351 over to the program. */
7352 if (SvUTF8(sub->str)) {
7354 rsd->utf8_substr = sub->str;
7356 rsd->substr = sub->str;
7357 rsd->utf8_substr = NULL;
7359 /* end_shift is how many chars that must be matched that
7360 follow this item. We calculate it ahead of time as once the
7361 lookbehind offset is added in we lose the ability to correctly
7363 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7364 rsd->end_shift = ml - sub->min_offset
7366 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7368 + (SvTAIL(sub->str) != 0)
7372 t = (eol/* Can't have SEOL and MULTI */
7373 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7374 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7380 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7382 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7383 * properly wrapped with the right modifiers */
7385 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7386 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7387 != REGEX_DEPENDS_CHARSET);
7389 /* The caret is output if there are any defaults: if not all the STD
7390 * flags are set, or if no character set specifier is needed */
7392 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7394 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7395 == REG_RUN_ON_COMMENT_SEEN);
7396 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7397 >> RXf_PMf_STD_PMMOD_SHIFT);
7398 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7400 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7402 /* We output all the necessary flags; we never output a minus, as all
7403 * those are defaults, so are
7404 * covered by the caret */
7405 const STRLEN wraplen = pat_len + has_p + has_runon
7406 + has_default /* If needs a caret */
7407 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7409 /* If needs a character set specifier */
7410 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7411 + (sizeof("(?:)") - 1);
7413 PERL_ARGS_ASSERT_SET_REGEX_PV;
7415 /* make sure PL_bitcount bounds not exceeded */
7416 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7418 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7421 SvFLAGS(Rx) |= SVf_UTF8;
7424 /* If a default, cover it using the caret */
7426 *p++= DEFAULT_PAT_MOD;
7432 name = get_regex_charset_name(RExC_rx->extflags, &len);
7433 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7435 name = UNICODE_PAT_MODS;
7436 len = sizeof(UNICODE_PAT_MODS) - 1;
7438 Copy(name, p, len, char);
7442 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7445 while((ch = *fptr++)) {
7453 Copy(RExC_precomp, p, pat_len, char);
7454 assert ((RX_WRAPPED(Rx) - p) < 16);
7455 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7458 /* Adding a trailing \n causes this to compile properly:
7459 my $R = qr / A B C # D E/x; /($R)/
7460 Otherwise the parens are considered part of the comment */
7465 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7469 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7470 * regular expression into internal code.
7471 * The pattern may be passed either as:
7472 * a list of SVs (patternp plus pat_count)
7473 * a list of OPs (expr)
7474 * If both are passed, the SV list is used, but the OP list indicates
7475 * which SVs are actually pre-compiled code blocks
7477 * The SVs in the list have magic and qr overloading applied to them (and
7478 * the list may be modified in-place with replacement SVs in the latter
7481 * If the pattern hasn't changed from old_re, then old_re will be
7484 * eng is the current engine. If that engine has an op_comp method, then
7485 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7486 * do the initial concatenation of arguments and pass on to the external
7489 * If is_bare_re is not null, set it to a boolean indicating whether the
7490 * arg list reduced (after overloading) to a single bare regex which has
7491 * been returned (i.e. /$qr/).
7493 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7495 * pm_flags contains the PMf_* flags, typically based on those from the
7496 * pm_flags field of the related PMOP. Currently we're only interested in
7497 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7499 * For many years this code had an initial sizing pass that calculated
7500 * (sometimes incorrectly, leading to security holes) the size needed for the
7501 * compiled pattern. That was changed by commit
7502 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7503 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7504 * references to this sizing pass.
7506 * Now, an initial crude guess as to the size needed is made, based on the
7507 * length of the pattern. Patches welcome to improve that guess. That amount
7508 * of space is malloc'd and then immediately freed, and then clawed back node
7509 * by node. This design is to minimze, to the extent possible, memory churn
7510 * when doing the reallocs.
7512 * A separate parentheses counting pass may be needed in some cases.
7513 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7516 * The existence of a sizing pass necessitated design decisions that are no
7517 * longer needed. There are potential areas of simplification.
7519 * Beware that the optimization-preparation code in here knows about some
7520 * of the structure of the compiled regexp. [I'll say.]
7524 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7525 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7526 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7528 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7536 SV** new_patternp = patternp;
7538 /* these are all flags - maybe they should be turned
7539 * into a single int with different bit masks */
7540 I32 sawlookahead = 0;
7545 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7547 bool runtime_code = 0;
7549 RExC_state_t RExC_state;
7550 RExC_state_t * const pRExC_state = &RExC_state;
7551 #ifdef TRIE_STUDY_OPT
7553 RExC_state_t copyRExC_state;
7555 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7557 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7559 DEBUG_r(if (!PL_colorset) reginitcolors());
7562 pRExC_state->warn_text = NULL;
7563 pRExC_state->unlexed_names = NULL;
7564 pRExC_state->code_blocks = NULL;
7567 *is_bare_re = FALSE;
7569 if (expr && (expr->op_type == OP_LIST ||
7570 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7571 /* allocate code_blocks if needed */
7575 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7576 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7577 ncode++; /* count of DO blocks */
7580 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7584 /* compile-time pattern with just OP_CONSTs and DO blocks */
7589 /* find how many CONSTs there are */
7592 if (expr->op_type == OP_CONST)
7595 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7596 if (o->op_type == OP_CONST)
7600 /* fake up an SV array */
7602 assert(!new_patternp);
7603 Newx(new_patternp, n, SV*);
7604 SAVEFREEPV(new_patternp);
7608 if (expr->op_type == OP_CONST)
7609 new_patternp[n] = cSVOPx_sv(expr);
7611 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7612 if (o->op_type == OP_CONST)
7613 new_patternp[n++] = cSVOPo_sv;
7618 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7619 "Assembling pattern from %d elements%s\n", pat_count,
7620 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7622 /* set expr to the first arg op */
7624 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7625 && expr->op_type != OP_CONST)
7627 expr = cLISTOPx(expr)->op_first;
7628 assert( expr->op_type == OP_PUSHMARK
7629 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7630 || expr->op_type == OP_PADRANGE);
7631 expr = OpSIBLING(expr);
7634 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7635 expr, &recompile, NULL);
7637 /* handle bare (possibly after overloading) regex: foo =~ $re */
7642 if (SvTYPE(re) == SVt_REGEXP) {
7646 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7647 "Precompiled pattern%s\n",
7648 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7654 exp = SvPV_nomg(pat, plen);
7656 if (!eng->op_comp) {
7657 if ((SvUTF8(pat) && IN_BYTES)
7658 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7660 /* make a temporary copy; either to convert to bytes,
7661 * or to avoid repeating get-magic / overloaded stringify */
7662 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7663 (IN_BYTES ? 0 : SvUTF8(pat)));
7665 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7668 /* ignore the utf8ness if the pattern is 0 length */
7669 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7670 RExC_uni_semantics = 0;
7671 RExC_contains_locale = 0;
7672 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7673 RExC_in_script_run = 0;
7674 RExC_study_started = 0;
7675 pRExC_state->runtime_code_qr = NULL;
7676 RExC_frame_head= NULL;
7677 RExC_frame_last= NULL;
7678 RExC_frame_count= 0;
7679 RExC_latest_warn_offset = 0;
7680 RExC_use_BRANCHJ = 0;
7681 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7682 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7683 RExC_total_parens = 0;
7684 RExC_open_parens = NULL;
7685 RExC_close_parens = NULL;
7686 RExC_paren_names = NULL;
7688 RExC_seen_d_op = FALSE;
7690 RExC_paren_name_list = NULL;
7694 RExC_mysv1= sv_newmortal();
7695 RExC_mysv2= sv_newmortal();
7699 SV *dsv= sv_newmortal();
7700 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7701 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7702 PL_colors[4], PL_colors[5], s);
7705 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7708 if ((pm_flags & PMf_USE_RE_EVAL)
7709 /* this second condition covers the non-regex literal case,
7710 * i.e. $foo =~ '(?{})'. */
7711 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7713 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7716 /* return old regex if pattern hasn't changed */
7717 /* XXX: note in the below we have to check the flags as well as the
7720 * Things get a touch tricky as we have to compare the utf8 flag
7721 * independently from the compile flags. */
7725 && !!RX_UTF8(old_re) == !!RExC_utf8
7726 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7727 && RX_PRECOMP(old_re)
7728 && RX_PRELEN(old_re) == plen
7729 && memEQ(RX_PRECOMP(old_re), exp, plen)
7730 && !runtime_code /* with runtime code, always recompile */ )
7733 SV *dsv= sv_newmortal();
7734 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7735 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7736 PL_colors[4], PL_colors[5], s);
7741 /* Allocate the pattern's SV */
7742 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7743 RExC_rx = ReANY(Rx);
7744 if ( RExC_rx == NULL )
7745 FAIL("Regexp out of space");
7747 rx_flags = orig_rx_flags;
7749 if ( (UTF || RExC_uni_semantics)
7750 && initial_charset == REGEX_DEPENDS_CHARSET)
7753 /* Set to use unicode semantics if the pattern is in utf8 and has the
7754 * 'depends' charset specified, as it means unicode when utf8 */
7755 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7756 RExC_uni_semantics = 1;
7759 RExC_pm_flags = pm_flags;
7762 assert(TAINTING_get || !TAINT_get);
7764 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7766 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7767 /* whoops, we have a non-utf8 pattern, whilst run-time code
7768 * got compiled as utf8. Try again with a utf8 pattern */
7769 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7770 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7774 assert(!pRExC_state->runtime_code_qr);
7780 RExC_in_lookbehind = 0;
7781 RExC_in_lookahead = 0;
7782 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7783 RExC_recode_x_to_native = 0;
7784 RExC_in_multi_char_class = 0;
7786 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7787 RExC_precomp_end = RExC_end = exp + plen;
7789 RExC_whilem_seen = 0;
7791 RExC_recurse = NULL;
7792 RExC_study_chunk_recursed = NULL;
7793 RExC_study_chunk_recursed_bytes= 0;
7794 RExC_recurse_count = 0;
7795 RExC_sets_depth = 0;
7796 pRExC_state->code_index = 0;
7798 /* Initialize the string in the compiled pattern. This is so that there is
7799 * something to output if necessary */
7800 set_regex_pv(pRExC_state, Rx);
7803 Perl_re_printf( aTHX_
7804 "Starting parse and generation\n");
7806 RExC_lastparse=NULL;
7809 /* Allocate space and zero-initialize. Note, the two step process
7810 of zeroing when in debug mode, thus anything assigned has to
7811 happen after that */
7814 /* On the first pass of the parse, we guess how big this will be. Then
7815 * we grow in one operation to that amount and then give it back. As
7816 * we go along, we re-allocate what we need.
7818 * XXX Currently the guess is essentially that the pattern will be an
7819 * EXACT node with one byte input, one byte output. This is crude, and
7820 * better heuristics are welcome.
7822 * On any subsequent passes, we guess what we actually computed in the
7823 * latest earlier pass. Such a pass probably didn't complete so is
7824 * missing stuff. We could improve those guesses by knowing where the
7825 * parse stopped, and use the length so far plus apply the above
7826 * assumption to what's left. */
7827 RExC_size = STR_SZ(RExC_end - RExC_start);
7830 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7831 if ( RExC_rxi == NULL )
7832 FAIL("Regexp out of space");
7834 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7835 RXi_SET( RExC_rx, RExC_rxi );
7837 /* We start from 0 (over from 0 in the case this is a reparse. The first
7838 * node parsed will give back any excess memory we have allocated so far).
7842 /* non-zero initialization begins here */
7843 RExC_rx->engine= eng;
7844 RExC_rx->extflags = rx_flags;
7845 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7847 if (pm_flags & PMf_IS_QR) {
7848 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7849 if (RExC_rxi->code_blocks) {
7850 RExC_rxi->code_blocks->refcnt++;
7854 RExC_rx->intflags = 0;
7856 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7859 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7860 * code makes sure the final byte is an uncounted NUL. But should this
7861 * ever not be the case, lots of things could read beyond the end of the
7862 * buffer: loops like
7863 * while(isFOO(*RExC_parse)) RExC_parse++;
7864 * strchr(RExC_parse, "foo");
7865 * etc. So it is worth noting. */
7866 assert(*RExC_end == '\0');
7870 RExC_parens_buf_size = 0;
7871 RExC_emit_start = RExC_rxi->program;
7872 pRExC_state->code_index = 0;
7874 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7878 if (reg(pRExC_state, 0, &flags, 1)) {
7880 /* Success!, But we may need to redo the parse knowing how many parens
7881 * there actually are */
7882 if (IN_PARENS_PASS) {
7883 flags |= RESTART_PARSE;
7886 /* We have that number in RExC_npar */
7887 RExC_total_parens = RExC_npar;
7889 else if (! MUST_RESTART(flags)) {
7891 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7894 /* Here, we either have success, or we have to redo the parse for some reason */
7895 if (MUST_RESTART(flags)) {
7897 /* It's possible to write a regexp in ascii that represents Unicode
7898 codepoints outside of the byte range, such as via \x{100}. If we
7899 detect such a sequence we have to convert the entire pattern to utf8
7900 and then recompile, as our sizing calculation will have been based
7901 on 1 byte == 1 character, but we will need to use utf8 to encode
7902 at least some part of the pattern, and therefore must convert the whole
7905 if (flags & NEED_UTF8) {
7907 /* We have stored the offset of the final warning output so far.
7908 * That must be adjusted. Any variant characters between the start
7909 * of the pattern and this warning count for 2 bytes in the final,
7910 * so just add them again */
7911 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7912 RExC_latest_warn_offset +=
7913 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7914 + RExC_latest_warn_offset);
7916 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7917 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7918 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7921 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7924 if (ALL_PARENS_COUNTED) {
7925 /* Make enough room for all the known parens, and zero it */
7926 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7927 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7928 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7930 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7931 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7933 else { /* Parse did not complete. Reinitialize the parentheses
7935 RExC_total_parens = 0;
7936 if (RExC_open_parens) {
7937 Safefree(RExC_open_parens);
7938 RExC_open_parens = NULL;
7940 if (RExC_close_parens) {
7941 Safefree(RExC_close_parens);
7942 RExC_close_parens = NULL;
7946 /* Clean up what we did in this parse */
7947 SvREFCNT_dec_NN(RExC_rx_sv);
7952 /* Here, we have successfully parsed and generated the pattern's program
7953 * for the regex engine. We are ready to finish things up and look for
7956 /* Update the string to compile, with correct modifiers, etc */
7957 set_regex_pv(pRExC_state, Rx);
7959 RExC_rx->nparens = RExC_total_parens - 1;
7961 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7962 if (RExC_whilem_seen > 15)
7963 RExC_whilem_seen = 15;
7966 Perl_re_printf( aTHX_
7967 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7969 RExC_lastparse=NULL;
7972 #ifdef RE_TRACK_PATTERN_OFFSETS
7973 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7974 "%s %" UVuf " bytes for offset annotations.\n",
7975 RExC_offsets ? "Got" : "Couldn't get",
7976 (UV)((RExC_offsets[0] * 2 + 1))));
7977 DEBUG_OFFSETS_r(if (RExC_offsets) {
7978 const STRLEN len = RExC_offsets[0];
7980 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7981 Perl_re_printf( aTHX_
7982 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7983 for (i = 1; i <= len; i++) {
7984 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7985 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7986 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7988 Perl_re_printf( aTHX_ "\n");
7992 SetProgLen(RExC_rxi,RExC_size);
7995 DEBUG_DUMP_PRE_OPTIMIZE_r({
7996 SV * const sv = sv_newmortal();
7997 RXi_GET_DECL(RExC_rx, ri);
7999 Perl_re_printf( aTHX_ "Program before optimization:\n");
8001 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8006 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
8009 /* XXXX To minimize changes to RE engine we always allocate
8010 3-units-long substrs field. */
8011 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8012 if (RExC_recurse_count) {
8013 Newx(RExC_recurse, RExC_recurse_count, regnode *);
8014 SAVEFREEPV(RExC_recurse);
8017 if (RExC_seen & REG_RECURSE_SEEN) {
8018 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8019 * So its 1 if there are no parens. */
8020 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8021 ((RExC_total_parens & 0x07) != 0);
8022 Newx(RExC_study_chunk_recursed,
8023 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8024 SAVEFREEPV(RExC_study_chunk_recursed);
8028 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8030 RExC_study_chunk_recursed_count= 0;
8032 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8033 if (RExC_study_chunk_recursed) {
8034 Zero(RExC_study_chunk_recursed,
8035 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8039 #ifdef TRIE_STUDY_OPT
8041 StructCopy(&zero_scan_data, &data, scan_data_t);
8042 copyRExC_state = RExC_state;
8045 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8047 RExC_state = copyRExC_state;
8048 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8049 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8051 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8052 StructCopy(&zero_scan_data, &data, scan_data_t);
8055 StructCopy(&zero_scan_data, &data, scan_data_t);
8058 /* Dig out information for optimizations. */
8059 RExC_rx->extflags = RExC_flags; /* was pm_op */
8060 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8063 SvUTF8_on(Rx); /* Unicode in it? */
8064 RExC_rxi->regstclass = NULL;
8065 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8066 RExC_rx->intflags |= PREGf_NAUGHTY;
8067 scan = RExC_rxi->program + 1; /* First BRANCH. */
8069 /* testing for BRANCH here tells us whether there is "must appear"
8070 data in the pattern. If there is then we can use it for optimisations */
8071 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8074 STRLEN longest_length[2];
8075 regnode_ssc ch_class; /* pointed to by data */
8077 SSize_t last_close = 0; /* pointed to by data */
8078 regnode *first= scan;
8079 regnode *first_next= regnext(first);
8083 * Skip introductions and multiplicators >= 1
8084 * so that we can extract the 'meat' of the pattern that must
8085 * match in the large if() sequence following.
8086 * NOTE that EXACT is NOT covered here, as it is normally
8087 * picked up by the optimiser separately.
8089 * This is unfortunate as the optimiser isnt handling lookahead
8090 * properly currently.
8093 while ((OP(first) == OPEN && (sawopen = 1)) ||
8094 /* An OR of *one* alternative - should not happen now. */
8095 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8096 /* for now we can't handle lookbehind IFMATCH*/
8097 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8098 (OP(first) == PLUS) ||
8099 (OP(first) == MINMOD) ||
8100 /* An {n,m} with n>0 */
8101 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8102 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8105 * the only op that could be a regnode is PLUS, all the rest
8106 * will be regnode_1 or regnode_2.
8108 * (yves doesn't think this is true)
8110 if (OP(first) == PLUS)
8113 if (OP(first) == MINMOD)
8115 first += regarglen[OP(first)];
8117 first = NEXTOPER(first);
8118 first_next= regnext(first);
8121 /* Starting-point info. */
8123 DEBUG_PEEP("first:", first, 0, 0);
8124 /* Ignore EXACT as we deal with it later. */
8125 if (PL_regkind[OP(first)] == EXACT) {
8126 if ( OP(first) == EXACT
8127 || OP(first) == LEXACT
8128 || OP(first) == EXACT_REQ8
8129 || OP(first) == LEXACT_REQ8
8130 || OP(first) == EXACTL)
8132 NOOP; /* Empty, get anchored substr later. */
8135 RExC_rxi->regstclass = first;
8138 else if (PL_regkind[OP(first)] == TRIE &&
8139 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8141 /* this can happen only on restudy */
8142 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8145 else if (REGNODE_SIMPLE(OP(first)))
8146 RExC_rxi->regstclass = first;
8147 else if (PL_regkind[OP(first)] == BOUND ||
8148 PL_regkind[OP(first)] == NBOUND)
8149 RExC_rxi->regstclass = first;
8150 else if (PL_regkind[OP(first)] == BOL) {
8151 RExC_rx->intflags |= (OP(first) == MBOL
8154 first = NEXTOPER(first);
8157 else if (OP(first) == GPOS) {
8158 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8159 first = NEXTOPER(first);
8162 else if ((!sawopen || !RExC_sawback) &&
8164 (OP(first) == STAR &&
8165 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8166 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8168 /* turn .* into ^.* with an implied $*=1 */
8170 (OP(NEXTOPER(first)) == REG_ANY)
8173 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8174 first = NEXTOPER(first);
8177 if (sawplus && !sawminmod && !sawlookahead
8178 && (!sawopen || !RExC_sawback)
8179 && !pRExC_state->code_blocks) /* May examine pos and $& */
8180 /* x+ must match at the 1st pos of run of x's */
8181 RExC_rx->intflags |= PREGf_SKIP;
8183 /* Scan is after the zeroth branch, first is atomic matcher. */
8184 #ifdef TRIE_STUDY_OPT
8187 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8188 (IV)(first - scan + 1))
8192 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8193 (IV)(first - scan + 1))
8199 * If there's something expensive in the r.e., find the
8200 * longest literal string that must appear and make it the
8201 * regmust. Resolve ties in favor of later strings, since
8202 * the regstart check works with the beginning of the r.e.
8203 * and avoiding duplication strengthens checking. Not a
8204 * strong reason, but sufficient in the absence of others.
8205 * [Now we resolve ties in favor of the earlier string if
8206 * it happens that c_offset_min has been invalidated, since the
8207 * earlier string may buy us something the later one won't.]
8210 data.substrs[0].str = newSVpvs("");
8211 data.substrs[1].str = newSVpvs("");
8212 data.last_found = newSVpvs("");
8213 data.cur_is_floating = 0; /* initially any found substring is fixed */
8214 ENTER_with_name("study_chunk");
8215 SAVEFREESV(data.substrs[0].str);
8216 SAVEFREESV(data.substrs[1].str);
8217 SAVEFREESV(data.last_found);
8219 if (!RExC_rxi->regstclass) {
8220 ssc_init(pRExC_state, &ch_class);
8221 data.start_class = &ch_class;
8222 stclass_flag = SCF_DO_STCLASS_AND;
8223 } else /* XXXX Check for BOUND? */
8225 data.last_closep = &last_close;
8229 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8230 * (NO top level branches)
8232 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8233 scan + RExC_size, /* Up to end */
8235 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8236 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8240 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8243 if ( RExC_total_parens == 1 && !data.cur_is_floating
8244 && data.last_start_min == 0 && data.last_end > 0
8245 && !RExC_seen_zerolen
8246 && !(RExC_seen & REG_VERBARG_SEEN)
8247 && !(RExC_seen & REG_GPOS_SEEN)
8249 RExC_rx->extflags |= RXf_CHECK_ALL;
8251 scan_commit(pRExC_state, &data,&minlen, 0);
8254 /* XXX this is done in reverse order because that's the way the
8255 * code was before it was parameterised. Don't know whether it
8256 * actually needs doing in reverse order. DAPM */
8257 for (i = 1; i >= 0; i--) {
8258 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8261 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8262 && data.substrs[0].min_offset
8263 == data.substrs[1].min_offset
8264 && SvCUR(data.substrs[0].str)
8265 == SvCUR(data.substrs[1].str)
8267 && S_setup_longest (aTHX_ pRExC_state,
8268 &(RExC_rx->substrs->data[i]),
8272 RExC_rx->substrs->data[i].min_offset =
8273 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8275 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8276 /* Don't offset infinity */
8277 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8278 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8279 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8282 RExC_rx->substrs->data[i].substr = NULL;
8283 RExC_rx->substrs->data[i].utf8_substr = NULL;
8284 longest_length[i] = 0;
8288 LEAVE_with_name("study_chunk");
8290 if (RExC_rxi->regstclass
8291 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8292 RExC_rxi->regstclass = NULL;
8294 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8295 || RExC_rx->substrs->data[0].min_offset)
8297 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8298 && is_ssc_worth_it(pRExC_state, data.start_class))
8300 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8302 ssc_finalize(pRExC_state, data.start_class);
8304 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8305 StructCopy(data.start_class,
8306 (regnode_ssc*)RExC_rxi->data->data[n],
8308 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8309 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8310 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8311 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8312 Perl_re_printf( aTHX_
8313 "synthetic stclass \"%s\".\n",
8314 SvPVX_const(sv));});
8315 data.start_class = NULL;
8318 /* A temporary algorithm prefers floated substr to fixed one of
8319 * same length to dig more info. */
8320 i = (longest_length[0] <= longest_length[1]);
8321 RExC_rx->substrs->check_ix = i;
8322 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8323 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8324 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8325 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8326 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8327 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8328 RExC_rx->intflags |= PREGf_NOSCAN;
8330 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8331 RExC_rx->extflags |= RXf_USE_INTUIT;
8332 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8333 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8336 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8337 if ( (STRLEN)minlen < longest_length[1] )
8338 minlen= longest_length[1];
8339 if ( (STRLEN)minlen < longest_length[0] )
8340 minlen= longest_length[0];
8344 /* Several toplevels. Best we can is to set minlen. */
8346 regnode_ssc ch_class;
8347 SSize_t last_close = 0;
8349 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8351 scan = RExC_rxi->program + 1;
8352 ssc_init(pRExC_state, &ch_class);
8353 data.start_class = &ch_class;
8354 data.last_closep = &last_close;
8358 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8359 * (patterns WITH top level branches)
8361 minlen = study_chunk(pRExC_state,
8362 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8363 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8364 ? SCF_TRIE_DOING_RESTUDY
8368 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8370 RExC_rx->check_substr = NULL;
8371 RExC_rx->check_utf8 = NULL;
8372 RExC_rx->substrs->data[0].substr = NULL;
8373 RExC_rx->substrs->data[0].utf8_substr = NULL;
8374 RExC_rx->substrs->data[1].substr = NULL;
8375 RExC_rx->substrs->data[1].utf8_substr = NULL;
8377 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8378 && is_ssc_worth_it(pRExC_state, data.start_class))
8380 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8382 ssc_finalize(pRExC_state, data.start_class);
8384 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8385 StructCopy(data.start_class,
8386 (regnode_ssc*)RExC_rxi->data->data[n],
8388 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8389 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8390 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8391 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8392 Perl_re_printf( aTHX_
8393 "synthetic stclass \"%s\".\n",
8394 SvPVX_const(sv));});
8395 data.start_class = NULL;
8399 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8400 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8401 RExC_rx->maxlen = REG_INFTY;
8404 RExC_rx->maxlen = RExC_maxlen;
8407 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8408 the "real" pattern. */
8410 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8411 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8413 RExC_rx->minlenret = minlen;
8414 if (RExC_rx->minlen < minlen)
8415 RExC_rx->minlen = minlen;
8417 if (RExC_seen & REG_RECURSE_SEEN ) {
8418 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8419 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8421 if (RExC_seen & REG_GPOS_SEEN)
8422 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8423 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8424 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8426 if (pRExC_state->code_blocks)
8427 RExC_rx->extflags |= RXf_EVAL_SEEN;
8428 if (RExC_seen & REG_VERBARG_SEEN)
8430 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8431 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8433 if (RExC_seen & REG_CUTGROUP_SEEN)
8434 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8435 if (pm_flags & PMf_USE_RE_EVAL)
8436 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8437 if (RExC_paren_names)
8438 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8440 RXp_PAREN_NAMES(RExC_rx) = NULL;
8442 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8443 * so it can be used in pp.c */
8444 if (RExC_rx->intflags & PREGf_ANCH)
8445 RExC_rx->extflags |= RXf_IS_ANCHORED;
8449 /* this is used to identify "special" patterns that might result
8450 * in Perl NOT calling the regex engine and instead doing the match "itself",
8451 * particularly special cases in split//. By having the regex compiler
8452 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8453 * we avoid weird issues with equivalent patterns resulting in different behavior,
8454 * AND we allow non Perl engines to get the same optimizations by the setting the
8455 * flags appropriately - Yves */
8456 regnode *first = RExC_rxi->program + 1;
8458 regnode *next = regnext(first);
8461 if (PL_regkind[fop] == NOTHING && nop == END)
8462 RExC_rx->extflags |= RXf_NULL;
8463 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8464 /* when fop is SBOL first->flags will be true only when it was
8465 * produced by parsing /\A/, and not when parsing /^/. This is
8466 * very important for the split code as there we want to
8467 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8468 * See rt #122761 for more details. -- Yves */
8469 RExC_rx->extflags |= RXf_START_ONLY;
8470 else if (fop == PLUS
8471 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8473 RExC_rx->extflags |= RXf_WHITE;
8474 else if ( RExC_rx->extflags & RXf_SPLIT
8475 && ( fop == EXACT || fop == LEXACT
8476 || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8478 && STR_LEN(first) == 1
8479 && *(STRING(first)) == ' '
8481 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8485 if (RExC_contains_locale) {
8486 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8490 if (RExC_paren_names) {
8491 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8492 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8493 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8496 RExC_rxi->name_list_idx = 0;
8498 while ( RExC_recurse_count > 0 ) {
8499 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8501 * This data structure is set up in study_chunk() and is used
8502 * to calculate the distance between a GOSUB regopcode and
8503 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8506 * If for some reason someone writes code that optimises
8507 * away a GOSUB opcode then the assert should be changed to
8508 * an if(scan) to guard the ARG2L_SET() - Yves
8511 assert(scan && OP(scan) == GOSUB);
8512 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8515 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8516 /* assume we don't need to swap parens around before we match */
8518 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8519 (unsigned long)RExC_study_chunk_recursed_count);
8523 Perl_re_printf( aTHX_ "Final program:\n");
8527 if (RExC_open_parens) {
8528 Safefree(RExC_open_parens);
8529 RExC_open_parens = NULL;
8531 if (RExC_close_parens) {
8532 Safefree(RExC_close_parens);
8533 RExC_close_parens = NULL;
8537 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8538 * by setting the regexp SV to readonly-only instead. If the
8539 * pattern's been recompiled, the USEDness should remain. */
8540 if (old_re && SvREADONLY(old_re))
8548 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8551 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8553 PERL_UNUSED_ARG(value);
8555 if (flags & RXapif_FETCH) {
8556 return reg_named_buff_fetch(rx, key, flags);
8557 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8558 Perl_croak_no_modify();
8560 } else if (flags & RXapif_EXISTS) {
8561 return reg_named_buff_exists(rx, key, flags)
8564 } else if (flags & RXapif_REGNAMES) {
8565 return reg_named_buff_all(rx, flags);
8566 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8567 return reg_named_buff_scalar(rx, flags);
8569 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8575 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8578 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8579 PERL_UNUSED_ARG(lastkey);
8581 if (flags & RXapif_FIRSTKEY)
8582 return reg_named_buff_firstkey(rx, flags);
8583 else if (flags & RXapif_NEXTKEY)
8584 return reg_named_buff_nextkey(rx, flags);
8586 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8593 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8597 struct regexp *const rx = ReANY(r);
8599 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8601 if (rx && RXp_PAREN_NAMES(rx)) {
8602 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8605 SV* sv_dat=HeVAL(he_str);
8606 I32 *nums=(I32*)SvPVX(sv_dat);
8607 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8608 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8609 if ((I32)(rx->nparens) >= nums[i]
8610 && rx->offs[nums[i]].start != -1
8611 && rx->offs[nums[i]].end != -1)
8614 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8619 ret = newSVsv(&PL_sv_undef);
8622 av_push(retarray, ret);
8625 return newRV_noinc(MUTABLE_SV(retarray));
8632 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8635 struct regexp *const rx = ReANY(r);
8637 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8639 if (rx && RXp_PAREN_NAMES(rx)) {
8640 if (flags & RXapif_ALL) {
8641 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8643 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8645 SvREFCNT_dec_NN(sv);
8657 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8659 struct regexp *const rx = ReANY(r);
8661 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8663 if ( rx && RXp_PAREN_NAMES(rx) ) {
8664 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8666 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8673 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8675 struct regexp *const rx = ReANY(r);
8676 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8678 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8680 if (rx && RXp_PAREN_NAMES(rx)) {
8681 HV *hv = RXp_PAREN_NAMES(rx);
8683 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8686 SV* sv_dat = HeVAL(temphe);
8687 I32 *nums = (I32*)SvPVX(sv_dat);
8688 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8689 if ((I32)(rx->lastparen) >= nums[i] &&
8690 rx->offs[nums[i]].start != -1 &&
8691 rx->offs[nums[i]].end != -1)
8697 if (parno || flags & RXapif_ALL) {
8698 return newSVhek(HeKEY_hek(temphe));
8706 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8711 struct regexp *const rx = ReANY(r);
8713 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8715 if (rx && RXp_PAREN_NAMES(rx)) {
8716 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8717 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8718 } else if (flags & RXapif_ONE) {
8719 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8720 av = MUTABLE_AV(SvRV(ret));
8721 length = av_count(av);
8722 SvREFCNT_dec_NN(ret);
8723 return newSViv(length);
8725 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8730 return &PL_sv_undef;
8734 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8736 struct regexp *const rx = ReANY(r);
8739 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8741 if (rx && RXp_PAREN_NAMES(rx)) {
8742 HV *hv= RXp_PAREN_NAMES(rx);
8744 (void)hv_iterinit(hv);
8745 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8748 SV* sv_dat = HeVAL(temphe);
8749 I32 *nums = (I32*)SvPVX(sv_dat);
8750 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8751 if ((I32)(rx->lastparen) >= nums[i] &&
8752 rx->offs[nums[i]].start != -1 &&
8753 rx->offs[nums[i]].end != -1)
8759 if (parno || flags & RXapif_ALL) {
8760 av_push(av, newSVhek(HeKEY_hek(temphe)));
8765 return newRV_noinc(MUTABLE_SV(av));
8769 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8772 struct regexp *const rx = ReANY(r);
8778 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8780 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8781 || n == RX_BUFF_IDX_CARET_FULLMATCH
8782 || n == RX_BUFF_IDX_CARET_POSTMATCH
8785 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8787 /* on something like
8790 * the KEEPCOPY is set on the PMOP rather than the regex */
8791 if (PL_curpm && r == PM_GETRE(PL_curpm))
8792 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8801 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8802 /* no need to distinguish between them any more */
8803 n = RX_BUFF_IDX_FULLMATCH;
8805 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8806 && rx->offs[0].start != -1)
8808 /* $`, ${^PREMATCH} */
8809 i = rx->offs[0].start;
8813 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8814 && rx->offs[0].end != -1)
8816 /* $', ${^POSTMATCH} */
8817 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8818 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8821 if (inRANGE(n, 0, (I32)rx->nparens) &&
8822 (s1 = rx->offs[n].start) != -1 &&
8823 (t1 = rx->offs[n].end) != -1)
8825 /* $&, ${^MATCH}, $1 ... */
8827 s = rx->subbeg + s1 - rx->suboffset;
8832 assert(s >= rx->subbeg);
8833 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8835 #ifdef NO_TAINT_SUPPORT
8836 sv_setpvn(sv, s, i);
8838 const int oldtainted = TAINT_get;
8840 sv_setpvn(sv, s, i);
8841 TAINT_set(oldtainted);
8843 if (RXp_MATCH_UTF8(rx))
8848 if (RXp_MATCH_TAINTED(rx)) {
8849 if (SvTYPE(sv) >= SVt_PVMG) {
8850 MAGIC* const mg = SvMAGIC(sv);
8853 SvMAGIC_set(sv, mg->mg_moremagic);
8855 if ((mgt = SvMAGIC(sv))) {
8856 mg->mg_moremagic = mgt;
8857 SvMAGIC_set(sv, mg);
8874 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8875 SV const * const value)
8877 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8879 PERL_UNUSED_ARG(rx);
8880 PERL_UNUSED_ARG(paren);
8881 PERL_UNUSED_ARG(value);
8884 Perl_croak_no_modify();
8888 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8891 struct regexp *const rx = ReANY(r);
8895 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8897 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8898 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8899 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8902 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8904 /* on something like
8907 * the KEEPCOPY is set on the PMOP rather than the regex */
8908 if (PL_curpm && r == PM_GETRE(PL_curpm))
8909 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8915 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8917 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8918 case RX_BUFF_IDX_PREMATCH: /* $` */
8919 if (rx->offs[0].start != -1) {
8920 i = rx->offs[0].start;
8929 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8930 case RX_BUFF_IDX_POSTMATCH: /* $' */
8931 if (rx->offs[0].end != -1) {
8932 i = rx->sublen - rx->offs[0].end;
8934 s1 = rx->offs[0].end;
8941 default: /* $& / ${^MATCH}, $1, $2, ... */
8942 if (paren <= (I32)rx->nparens &&
8943 (s1 = rx->offs[paren].start) != -1 &&
8944 (t1 = rx->offs[paren].end) != -1)
8950 if (ckWARN(WARN_UNINITIALIZED))
8951 report_uninit((const SV *)sv);
8956 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8957 const char * const s = rx->subbeg - rx->suboffset + s1;
8962 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8969 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8971 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8972 PERL_UNUSED_ARG(rx);
8976 return newSVpvs("Regexp");
8979 /* Scans the name of a named buffer from the pattern.
8980 * If flags is REG_RSN_RETURN_NULL returns null.
8981 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8982 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8983 * to the parsed name as looked up in the RExC_paren_names hash.
8984 * If there is an error throws a vFAIL().. type exception.
8987 #define REG_RSN_RETURN_NULL 0
8988 #define REG_RSN_RETURN_NAME 1
8989 #define REG_RSN_RETURN_DATA 2
8992 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8994 char *name_start = RExC_parse;
8997 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8999 assert (RExC_parse <= RExC_end);
9000 if (RExC_parse == RExC_end) NOOP;
9001 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
9002 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
9003 * using do...while */
9006 RExC_parse += UTF8SKIP(RExC_parse);
9007 } while ( RExC_parse < RExC_end
9008 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9012 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9014 RExC_parse++; /* so the <- from the vFAIL is after the offending
9016 vFAIL("Group name must start with a non-digit word character");
9018 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9019 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9020 if ( flags == REG_RSN_RETURN_NAME)
9022 else if (flags==REG_RSN_RETURN_DATA) {
9025 if ( ! sv_name ) /* should not happen*/
9026 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9027 if (RExC_paren_names)
9028 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9030 sv_dat = HeVAL(he_str);
9031 if ( ! sv_dat ) { /* Didn't find group */
9033 /* It might be a forward reference; we can't fail until we
9034 * know, by completing the parse to get all the groups, and
9036 if (ALL_PARENS_COUNTED) {
9037 vFAIL("Reference to nonexistent named group");
9040 REQUIRE_PARENS_PASS;
9046 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9047 (unsigned long) flags);
9050 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9051 if (RExC_lastparse!=RExC_parse) { \
9052 Perl_re_printf( aTHX_ "%s", \
9053 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9054 RExC_end - RExC_parse, 16, \
9056 PERL_PV_ESCAPE_UNI_DETECT | \
9057 PERL_PV_PRETTY_ELLIPSES | \
9058 PERL_PV_PRETTY_LTGT | \
9059 PERL_PV_ESCAPE_RE | \
9060 PERL_PV_PRETTY_EXACTSIZE \
9064 Perl_re_printf( aTHX_ "%16s",""); \
9066 if (RExC_lastnum!=RExC_emit) \
9067 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9069 Perl_re_printf( aTHX_ "|%4s",""); \
9070 Perl_re_printf( aTHX_ "|%*s%-4s", \
9071 (int)((depth*2)), "", \
9074 RExC_lastnum=RExC_emit; \
9075 RExC_lastparse=RExC_parse; \
9080 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9081 DEBUG_PARSE_MSG((funcname)); \
9082 Perl_re_printf( aTHX_ "%4s","\n"); \
9084 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9085 DEBUG_PARSE_MSG((funcname)); \
9086 Perl_re_printf( aTHX_ fmt "\n",args); \
9089 /* This section of code defines the inversion list object and its methods. The
9090 * interfaces are highly subject to change, so as much as possible is static to
9091 * this file. An inversion list is here implemented as a malloc'd C UV array
9092 * as an SVt_INVLIST scalar.
9094 * An inversion list for Unicode is an array of code points, sorted by ordinal
9095 * number. Each element gives the code point that begins a range that extends
9096 * up-to but not including the code point given by the next element. The final
9097 * element gives the first code point of a range that extends to the platform's
9098 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9099 * ...) give ranges whose code points are all in the inversion list. We say
9100 * that those ranges are in the set. The odd-numbered elements give ranges
9101 * whose code points are not in the inversion list, and hence not in the set.
9102 * Thus, element [0] is the first code point in the list. Element [1]
9103 * is the first code point beyond that not in the list; and element [2] is the
9104 * first code point beyond that that is in the list. In other words, the first
9105 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9106 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9107 * all code points in that range are not in the inversion list. The third
9108 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9109 * list, and so forth. Thus every element whose index is divisible by two
9110 * gives the beginning of a range that is in the list, and every element whose
9111 * index is not divisible by two gives the beginning of a range not in the
9112 * list. If the final element's index is divisible by two, the inversion list
9113 * extends to the platform's infinity; otherwise the highest code point in the
9114 * inversion list is the contents of that element minus 1.
9116 * A range that contains just a single code point N will look like
9118 * invlist[i+1] == N+1
9120 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9121 * impossible to represent, so element [i+1] is omitted. The single element
9123 * invlist[0] == UV_MAX
9124 * contains just UV_MAX, but is interpreted as matching to infinity.
9126 * Taking the complement (inverting) an inversion list is quite simple, if the
9127 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9128 * This implementation reserves an element at the beginning of each inversion
9129 * list to always contain 0; there is an additional flag in the header which
9130 * indicates if the list begins at the 0, or is offset to begin at the next
9131 * element. This means that the inversion list can be inverted without any
9132 * copying; just flip the flag.
9134 * More about inversion lists can be found in "Unicode Demystified"
9135 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9137 * The inversion list data structure is currently implemented as an SV pointing
9138 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9139 * array of UV whose memory management is automatically handled by the existing
9140 * facilities for SV's.
9142 * Some of the methods should always be private to the implementation, and some
9143 * should eventually be made public */
9145 /* The header definitions are in F<invlist_inline.h> */
9147 #ifndef PERL_IN_XSUB_RE
9149 PERL_STATIC_INLINE UV*
9150 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9152 /* Returns a pointer to the first element in the inversion list's array.
9153 * This is called upon initialization of an inversion list. Where the
9154 * array begins depends on whether the list has the code point U+0000 in it
9155 * or not. The other parameter tells it whether the code that follows this
9156 * call is about to put a 0 in the inversion list or not. The first
9157 * element is either the element reserved for 0, if TRUE, or the element
9158 * after it, if FALSE */
9160 bool* offset = get_invlist_offset_addr(invlist);
9161 UV* zero_addr = (UV *) SvPVX(invlist);
9163 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9166 assert(! _invlist_len(invlist));
9170 /* 1^1 = 0; 1^0 = 1 */
9171 *offset = 1 ^ will_have_0;
9172 return zero_addr + *offset;
9176 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9178 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9179 * steals the list from 'src', so 'src' is made to have a NULL list. This
9180 * is similar to what SvSetMagicSV() would do, if it were implemented on
9181 * inversion lists, though this routine avoids a copy */
9183 const UV src_len = _invlist_len(src);
9184 const bool src_offset = *get_invlist_offset_addr(src);
9185 const STRLEN src_byte_len = SvLEN(src);
9186 char * array = SvPVX(src);
9188 const int oldtainted = TAINT_get;
9190 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9192 assert(is_invlist(src));
9193 assert(is_invlist(dest));
9194 assert(! invlist_is_iterating(src));
9195 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9197 /* Make sure it ends in the right place with a NUL, as our inversion list
9198 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9200 array[src_byte_len - 1] = '\0';
9202 TAINT_NOT; /* Otherwise it breaks */
9203 sv_usepvn_flags(dest,
9207 /* This flag is documented to cause a copy to be avoided */
9208 SV_HAS_TRAILING_NUL);
9209 TAINT_set(oldtainted);
9214 /* Finish up copying over the other fields in an inversion list */
9215 *get_invlist_offset_addr(dest) = src_offset;
9216 invlist_set_len(dest, src_len, src_offset);
9217 *get_invlist_previous_index_addr(dest) = 0;
9218 invlist_iterfinish(dest);
9221 PERL_STATIC_INLINE IV*
9222 S_get_invlist_previous_index_addr(SV* invlist)
9224 /* Return the address of the IV that is reserved to hold the cached index
9226 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9228 assert(is_invlist(invlist));
9230 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9233 PERL_STATIC_INLINE IV
9234 S_invlist_previous_index(SV* const invlist)
9236 /* Returns cached index of previous search */
9238 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9240 return *get_invlist_previous_index_addr(invlist);
9243 PERL_STATIC_INLINE void
9244 S_invlist_set_previous_index(SV* const invlist, const IV index)
9246 /* Caches <index> for later retrieval */
9248 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9250 assert(index == 0 || index < (int) _invlist_len(invlist));
9252 *get_invlist_previous_index_addr(invlist) = index;
9255 PERL_STATIC_INLINE void
9256 S_invlist_trim(SV* invlist)
9258 /* Free the not currently-being-used space in an inversion list */
9260 /* But don't free up the space needed for the 0 UV that is always at the
9261 * beginning of the list, nor the trailing NUL */
9262 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9264 PERL_ARGS_ASSERT_INVLIST_TRIM;
9266 assert(is_invlist(invlist));
9268 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9271 PERL_STATIC_INLINE void
9272 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9274 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9276 assert(is_invlist(invlist));
9278 invlist_set_len(invlist, 0, 0);
9279 invlist_trim(invlist);
9282 #endif /* ifndef PERL_IN_XSUB_RE */
9284 PERL_STATIC_INLINE bool
9285 S_invlist_is_iterating(SV* const invlist)
9287 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9289 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9292 #ifndef PERL_IN_XSUB_RE
9294 PERL_STATIC_INLINE UV
9295 S_invlist_max(SV* const invlist)
9297 /* Returns the maximum number of elements storable in the inversion list's
9298 * array, without having to realloc() */
9300 PERL_ARGS_ASSERT_INVLIST_MAX;
9302 assert(is_invlist(invlist));
9304 /* Assumes worst case, in which the 0 element is not counted in the
9305 * inversion list, so subtracts 1 for that */
9306 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9307 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9308 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9312 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9314 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9316 /* First 1 is in case the zero element isn't in the list; second 1 is for
9318 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9319 invlist_set_len(invlist, 0, 0);
9321 /* Force iterinit() to be used to get iteration to work */
9322 invlist_iterfinish(invlist);
9324 *get_invlist_previous_index_addr(invlist) = 0;
9325 SvPOK_on(invlist); /* This allows B to extract the PV */
9329 Perl__new_invlist(pTHX_ IV initial_size)
9332 /* Return a pointer to a newly constructed inversion list, with enough
9333 * space to store 'initial_size' elements. If that number is negative, a
9334 * system default is used instead */
9338 if (initial_size < 0) {
9342 new_list = newSV_type(SVt_INVLIST);
9343 initialize_invlist_guts(new_list, initial_size);
9349 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9351 /* Return a pointer to a newly constructed inversion list, initialized to
9352 * point to <list>, which has to be in the exact correct inversion list
9353 * form, including internal fields. Thus this is a dangerous routine that
9354 * should not be used in the wrong hands. The passed in 'list' contains
9355 * several header fields at the beginning that are not part of the
9356 * inversion list body proper */
9358 const STRLEN length = (STRLEN) list[0];
9359 const UV version_id = list[1];
9360 const bool offset = cBOOL(list[2]);
9361 #define HEADER_LENGTH 3
9362 /* If any of the above changes in any way, you must change HEADER_LENGTH
9363 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9364 * perl -E 'say int(rand 2**31-1)'
9366 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9367 data structure type, so that one being
9368 passed in can be validated to be an
9369 inversion list of the correct vintage.
9372 SV* invlist = newSV_type(SVt_INVLIST);
9374 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9376 if (version_id != INVLIST_VERSION_ID) {
9377 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9380 /* The generated array passed in includes header elements that aren't part
9381 * of the list proper, so start it just after them */
9382 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9384 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9385 shouldn't touch it */
9387 *(get_invlist_offset_addr(invlist)) = offset;
9389 /* The 'length' passed to us is the physical number of elements in the
9390 * inversion list. But if there is an offset the logical number is one
9392 invlist_set_len(invlist, length - offset, offset);
9394 invlist_set_previous_index(invlist, 0);
9396 /* Initialize the iteration pointer. */
9397 invlist_iterfinish(invlist);
9399 SvREADONLY_on(invlist);
9406 S__append_range_to_invlist(pTHX_ SV* const invlist,
9407 const UV start, const UV end)
9409 /* Subject to change or removal. Append the range from 'start' to 'end' at
9410 * the end of the inversion list. The range must be above any existing
9414 UV max = invlist_max(invlist);
9415 UV len = _invlist_len(invlist);
9418 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9420 if (len == 0) { /* Empty lists must be initialized */
9421 offset = start != 0;
9422 array = _invlist_array_init(invlist, ! offset);
9425 /* Here, the existing list is non-empty. The current max entry in the
9426 * list is generally the first value not in the set, except when the
9427 * set extends to the end of permissible values, in which case it is
9428 * the first entry in that final set, and so this call is an attempt to
9429 * append out-of-order */
9431 UV final_element = len - 1;
9432 array = invlist_array(invlist);
9433 if ( array[final_element] > start
9434 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9436 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9437 array[final_element], start,
9438 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9441 /* Here, it is a legal append. If the new range begins 1 above the end
9442 * of the range below it, it is extending the range below it, so the
9443 * new first value not in the set is one greater than the newly
9444 * extended range. */
9445 offset = *get_invlist_offset_addr(invlist);
9446 if (array[final_element] == start) {
9447 if (end != UV_MAX) {
9448 array[final_element] = end + 1;
9451 /* But if the end is the maximum representable on the machine,
9452 * assume that infinity was actually what was meant. Just let
9453 * the range that this would extend to have no end */
9454 invlist_set_len(invlist, len - 1, offset);
9460 /* Here the new range doesn't extend any existing set. Add it */
9462 len += 2; /* Includes an element each for the start and end of range */
9464 /* If wll overflow the existing space, extend, which may cause the array to
9467 invlist_extend(invlist, len);
9469 /* Have to set len here to avoid assert failure in invlist_array() */
9470 invlist_set_len(invlist, len, offset);
9472 array = invlist_array(invlist);
9475 invlist_set_len(invlist, len, offset);
9478 /* The next item on the list starts the range, the one after that is
9479 * one past the new range. */
9480 array[len - 2] = start;
9481 if (end != UV_MAX) {
9482 array[len - 1] = end + 1;
9485 /* But if the end is the maximum representable on the machine, just let
9486 * the range have no end */
9487 invlist_set_len(invlist, len - 1, offset);
9492 Perl__invlist_search(SV* const invlist, const UV cp)
9494 /* Searches the inversion list for the entry that contains the input code
9495 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9496 * return value is the index into the list's array of the range that
9497 * contains <cp>, that is, 'i' such that
9498 * array[i] <= cp < array[i+1]
9503 IV high = _invlist_len(invlist);
9504 const IV highest_element = high - 1;
9507 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9509 /* If list is empty, return failure. */
9514 /* (We can't get the array unless we know the list is non-empty) */
9515 array = invlist_array(invlist);
9517 mid = invlist_previous_index(invlist);
9519 if (mid > highest_element) {
9520 mid = highest_element;
9523 /* <mid> contains the cache of the result of the previous call to this
9524 * function (0 the first time). See if this call is for the same result,
9525 * or if it is for mid-1. This is under the theory that calls to this
9526 * function will often be for related code points that are near each other.
9527 * And benchmarks show that caching gives better results. We also test
9528 * here if the code point is within the bounds of the list. These tests
9529 * replace others that would have had to be made anyway to make sure that
9530 * the array bounds were not exceeded, and these give us extra information
9531 * at the same time */
9532 if (cp >= array[mid]) {
9533 if (cp >= array[highest_element]) {
9534 return highest_element;
9537 /* Here, array[mid] <= cp < array[highest_element]. This means that
9538 * the final element is not the answer, so can exclude it; it also
9539 * means that <mid> is not the final element, so can refer to 'mid + 1'
9541 if (cp < array[mid + 1]) {
9547 else { /* cp < aray[mid] */
9548 if (cp < array[0]) { /* Fail if outside the array */
9552 if (cp >= array[mid - 1]) {
9557 /* Binary search. What we are looking for is <i> such that
9558 * array[i] <= cp < array[i+1]
9559 * The loop below converges on the i+1. Note that there may not be an
9560 * (i+1)th element in the array, and things work nonetheless */
9561 while (low < high) {
9562 mid = (low + high) / 2;
9563 assert(mid <= highest_element);
9564 if (array[mid] <= cp) { /* cp >= array[mid] */
9567 /* We could do this extra test to exit the loop early.
9568 if (cp < array[low]) {
9573 else { /* cp < array[mid] */
9580 invlist_set_previous_index(invlist, high);
9585 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9586 const bool complement_b, SV** output)
9588 /* Take the union of two inversion lists and point '*output' to it. On
9589 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9590 * even 'a' or 'b'). If to an inversion list, the contents of the original
9591 * list will be replaced by the union. The first list, 'a', may be
9592 * NULL, in which case a copy of the second list is placed in '*output'.
9593 * If 'complement_b' is TRUE, the union is taken of the complement
9594 * (inversion) of 'b' instead of b itself.
9596 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9597 * Richard Gillam, published by Addison-Wesley, and explained at some
9598 * length there. The preface says to incorporate its examples into your
9599 * code at your own risk.
9601 * The algorithm is like a merge sort. */
9603 const UV* array_a; /* a's array */
9605 UV len_a; /* length of a's array */
9608 SV* u; /* the resulting union */
9612 UV i_a = 0; /* current index into a's array */
9616 /* running count, as explained in the algorithm source book; items are
9617 * stopped accumulating and are output when the count changes to/from 0.
9618 * The count is incremented when we start a range that's in an input's set,
9619 * and decremented when we start a range that's not in a set. So this
9620 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9621 * and hence nothing goes into the union; 1, just one of the inputs is in
9622 * its set (and its current range gets added to the union); and 2 when both
9623 * inputs are in their sets. */
9626 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9628 assert(*output == NULL || is_invlist(*output));
9630 len_b = _invlist_len(b);
9633 /* Here, 'b' is empty, hence it's complement is all possible code
9634 * points. So if the union includes the complement of 'b', it includes
9635 * everything, and we need not even look at 'a'. It's easiest to
9636 * create a new inversion list that matches everything. */
9638 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9640 if (*output == NULL) { /* If the output didn't exist, just point it
9642 *output = everything;
9644 else { /* Otherwise, replace its contents with the new list */
9645 invlist_replace_list_destroys_src(*output, everything);
9646 SvREFCNT_dec_NN(everything);
9652 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9653 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9654 * output will be empty */
9656 if (a == NULL || _invlist_len(a) == 0) {
9657 if (*output == NULL) {
9658 *output = _new_invlist(0);
9661 invlist_clear(*output);
9666 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9667 * union. We can just return a copy of 'a' if '*output' doesn't point
9668 * to an existing list */
9669 if (*output == NULL) {
9670 *output = invlist_clone(a, NULL);
9674 /* If the output is to overwrite 'a', we have a no-op, as it's
9680 /* Here, '*output' is to be overwritten by 'a' */
9681 u = invlist_clone(a, NULL);
9682 invlist_replace_list_destroys_src(*output, u);
9688 /* Here 'b' is not empty. See about 'a' */
9690 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9692 /* Here, 'a' is empty (and b is not). That means the union will come
9693 * entirely from 'b'. If '*output' is NULL, we can directly return a
9694 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9697 SV ** dest = (*output == NULL) ? output : &u;
9698 *dest = invlist_clone(b, NULL);
9700 _invlist_invert(*dest);
9704 invlist_replace_list_destroys_src(*output, u);
9711 /* Here both lists exist and are non-empty */
9712 array_a = invlist_array(a);
9713 array_b = invlist_array(b);
9715 /* If are to take the union of 'a' with the complement of b, set it
9716 * up so are looking at b's complement. */
9719 /* To complement, we invert: if the first element is 0, remove it. To
9720 * do this, we just pretend the array starts one later */
9721 if (array_b[0] == 0) {
9727 /* But if the first element is not zero, we pretend the list starts
9728 * at the 0 that is always stored immediately before the array. */
9734 /* Size the union for the worst case: that the sets are completely
9736 u = _new_invlist(len_a + len_b);
9738 /* Will contain U+0000 if either component does */
9739 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9740 || (len_b > 0 && array_b[0] == 0));
9742 /* Go through each input list item by item, stopping when have exhausted
9744 while (i_a < len_a && i_b < len_b) {
9745 UV cp; /* The element to potentially add to the union's array */
9746 bool cp_in_set; /* is it in the input list's set or not */
9748 /* We need to take one or the other of the two inputs for the union.
9749 * Since we are merging two sorted lists, we take the smaller of the
9750 * next items. In case of a tie, we take first the one that is in its
9751 * set. If we first took the one not in its set, it would decrement
9752 * the count, possibly to 0 which would cause it to be output as ending
9753 * the range, and the next time through we would take the same number,
9754 * and output it again as beginning the next range. By doing it the
9755 * opposite way, there is no possibility that the count will be
9756 * momentarily decremented to 0, and thus the two adjoining ranges will
9757 * be seamlessly merged. (In a tie and both are in the set or both not
9758 * in the set, it doesn't matter which we take first.) */
9759 if ( array_a[i_a] < array_b[i_b]
9760 || ( array_a[i_a] == array_b[i_b]
9761 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9763 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9764 cp = array_a[i_a++];
9767 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9768 cp = array_b[i_b++];
9771 /* Here, have chosen which of the two inputs to look at. Only output
9772 * if the running count changes to/from 0, which marks the
9773 * beginning/end of a range that's in the set */
9776 array_u[i_u++] = cp;
9783 array_u[i_u++] = cp;
9789 /* The loop above increments the index into exactly one of the input lists
9790 * each iteration, and ends when either index gets to its list end. That
9791 * means the other index is lower than its end, and so something is
9792 * remaining in that one. We decrement 'count', as explained below, if
9793 * that list is in its set. (i_a and i_b each currently index the element
9794 * beyond the one we care about.) */
9795 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9796 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9801 /* Above we decremented 'count' if the list that had unexamined elements in
9802 * it was in its set. This has made it so that 'count' being non-zero
9803 * means there isn't anything left to output; and 'count' equal to 0 means
9804 * that what is left to output is precisely that which is left in the
9805 * non-exhausted input list.
9807 * To see why, note first that the exhausted input obviously has nothing
9808 * left to add to the union. If it was in its set at its end, that means
9809 * the set extends from here to the platform's infinity, and hence so does
9810 * the union and the non-exhausted set is irrelevant. The exhausted set
9811 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9812 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9813 * 'count' remains at 1. This is consistent with the decremented 'count'
9814 * != 0 meaning there's nothing left to add to the union.
9816 * But if the exhausted input wasn't in its set, it contributed 0 to
9817 * 'count', and the rest of the union will be whatever the other input is.
9818 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9819 * otherwise it gets decremented to 0. This is consistent with 'count'
9820 * == 0 meaning the remainder of the union is whatever is left in the
9821 * non-exhausted list. */
9826 IV copy_count = len_a - i_a;
9827 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9828 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9830 else { /* The non-exhausted input is b */
9831 copy_count = len_b - i_b;
9832 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9834 len_u = i_u + copy_count;
9837 /* Set the result to the final length, which can change the pointer to
9838 * array_u, so re-find it. (Note that it is unlikely that this will
9839 * change, as we are shrinking the space, not enlarging it) */
9840 if (len_u != _invlist_len(u)) {
9841 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9843 array_u = invlist_array(u);
9846 if (*output == NULL) { /* Simply return the new inversion list */
9850 /* Otherwise, overwrite the inversion list that was in '*output'. We
9851 * could instead free '*output', and then set it to 'u', but experience
9852 * has shown [perl #127392] that if the input is a mortal, we can get a
9853 * huge build-up of these during regex compilation before they get
9855 invlist_replace_list_destroys_src(*output, u);
9863 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9864 const bool complement_b, SV** i)
9866 /* Take the intersection of two inversion lists and point '*i' to it. On
9867 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9868 * even 'a' or 'b'). If to an inversion list, the contents of the original
9869 * list will be replaced by the intersection. The first list, 'a', may be
9870 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9871 * TRUE, the result will be the intersection of 'a' and the complement (or
9872 * inversion) of 'b' instead of 'b' directly.
9874 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9875 * Richard Gillam, published by Addison-Wesley, and explained at some
9876 * length there. The preface says to incorporate its examples into your
9877 * code at your own risk. In fact, it had bugs
9879 * The algorithm is like a merge sort, and is essentially the same as the
9883 const UV* array_a; /* a's array */
9885 UV len_a; /* length of a's array */
9888 SV* r; /* the resulting intersection */
9892 UV i_a = 0; /* current index into a's array */
9896 /* running count of how many of the two inputs are postitioned at ranges
9897 * that are in their sets. As explained in the algorithm source book,
9898 * items are stopped accumulating and are output when the count changes
9899 * to/from 2. The count is incremented when we start a range that's in an
9900 * input's set, and decremented when we start a range that's not in a set.
9901 * Only when it is 2 are we in the intersection. */
9904 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9906 assert(*i == NULL || is_invlist(*i));
9908 /* Special case if either one is empty */
9909 len_a = (a == NULL) ? 0 : _invlist_len(a);
9910 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9911 if (len_a != 0 && complement_b) {
9913 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9914 * must be empty. Here, also we are using 'b's complement, which
9915 * hence must be every possible code point. Thus the intersection
9918 if (*i == a) { /* No-op */
9923 *i = invlist_clone(a, NULL);
9927 r = invlist_clone(a, NULL);
9928 invlist_replace_list_destroys_src(*i, r);
9933 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9934 * intersection must be empty */
9936 *i = _new_invlist(0);
9944 /* Here both lists exist and are non-empty */
9945 array_a = invlist_array(a);
9946 array_b = invlist_array(b);
9948 /* If are to take the intersection of 'a' with the complement of b, set it
9949 * up so are looking at b's complement. */
9952 /* To complement, we invert: if the first element is 0, remove it. To
9953 * do this, we just pretend the array starts one later */
9954 if (array_b[0] == 0) {
9960 /* But if the first element is not zero, we pretend the list starts
9961 * at the 0 that is always stored immediately before the array. */
9967 /* Size the intersection for the worst case: that the intersection ends up
9968 * fragmenting everything to be completely disjoint */
9969 r= _new_invlist(len_a + len_b);
9971 /* Will contain U+0000 iff both components do */
9972 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9973 && len_b > 0 && array_b[0] == 0);
9975 /* Go through each list item by item, stopping when have exhausted one of
9977 while (i_a < len_a && i_b < len_b) {
9978 UV cp; /* The element to potentially add to the intersection's
9980 bool cp_in_set; /* Is it in the input list's set or not */
9982 /* We need to take one or the other of the two inputs for the
9983 * intersection. Since we are merging two sorted lists, we take the
9984 * smaller of the next items. In case of a tie, we take first the one
9985 * that is not in its set (a difference from the union algorithm). If
9986 * we first took the one in its set, it would increment the count,
9987 * possibly to 2 which would cause it to be output as starting a range
9988 * in the intersection, and the next time through we would take that
9989 * same number, and output it again as ending the set. By doing the
9990 * opposite of this, there is no possibility that the count will be
9991 * momentarily incremented to 2. (In a tie and both are in the set or
9992 * both not in the set, it doesn't matter which we take first.) */
9993 if ( array_a[i_a] < array_b[i_b]
9994 || ( array_a[i_a] == array_b[i_b]
9995 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9997 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9998 cp = array_a[i_a++];
10001 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10002 cp= array_b[i_b++];
10005 /* Here, have chosen which of the two inputs to look at. Only output
10006 * if the running count changes to/from 2, which marks the
10007 * beginning/end of a range that's in the intersection */
10011 array_r[i_r++] = cp;
10016 array_r[i_r++] = cp;
10023 /* The loop above increments the index into exactly one of the input lists
10024 * each iteration, and ends when either index gets to its list end. That
10025 * means the other index is lower than its end, and so something is
10026 * remaining in that one. We increment 'count', as explained below, if the
10027 * exhausted list was in its set. (i_a and i_b each currently index the
10028 * element beyond the one we care about.) */
10029 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10030 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10035 /* Above we incremented 'count' if the exhausted list was in its set. This
10036 * has made it so that 'count' being below 2 means there is nothing left to
10037 * output; otheriwse what's left to add to the intersection is precisely
10038 * that which is left in the non-exhausted input list.
10040 * To see why, note first that the exhausted input obviously has nothing
10041 * left to affect the intersection. If it was in its set at its end, that
10042 * means the set extends from here to the platform's infinity, and hence
10043 * anything in the non-exhausted's list will be in the intersection, and
10044 * anything not in it won't be. Hence, the rest of the intersection is
10045 * precisely what's in the non-exhausted list The exhausted set also
10046 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10047 * it means 'count' is now at least 2. This is consistent with the
10048 * incremented 'count' being >= 2 means to add the non-exhausted list to
10049 * the intersection.
10051 * But if the exhausted input wasn't in its set, it contributed 0 to
10052 * 'count', and the intersection can't include anything further; the
10053 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10054 * incremented. This is consistent with 'count' being < 2 meaning nothing
10055 * further to add to the intersection. */
10056 if (count < 2) { /* Nothing left to put in the intersection. */
10059 else { /* copy the non-exhausted list, unchanged. */
10060 IV copy_count = len_a - i_a;
10061 if (copy_count > 0) { /* a is the one with stuff left */
10062 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10064 else { /* b is the one with stuff left */
10065 copy_count = len_b - i_b;
10066 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10068 len_r = i_r + copy_count;
10071 /* Set the result to the final length, which can change the pointer to
10072 * array_r, so re-find it. (Note that it is unlikely that this will
10073 * change, as we are shrinking the space, not enlarging it) */
10074 if (len_r != _invlist_len(r)) {
10075 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10077 array_r = invlist_array(r);
10080 if (*i == NULL) { /* Simply return the calculated intersection */
10083 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10084 instead free '*i', and then set it to 'r', but experience has
10085 shown [perl #127392] that if the input is a mortal, we can get a
10086 huge build-up of these during regex compilation before they get
10089 invlist_replace_list_destroys_src(*i, r);
10094 SvREFCNT_dec_NN(r);
10101 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10103 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10104 * set. A pointer to the inversion list is returned. This may actually be
10105 * a new list, in which case the passed in one has been destroyed. The
10106 * passed-in inversion list can be NULL, in which case a new one is created
10107 * with just the one range in it. The new list is not necessarily
10108 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10109 * result of this function. The gain would not be large, and in many
10110 * cases, this is called multiple times on a single inversion list, so
10111 * anything freed may almost immediately be needed again.
10113 * This used to mostly call the 'union' routine, but that is much more
10114 * heavyweight than really needed for a single range addition */
10116 UV* array; /* The array implementing the inversion list */
10117 UV len; /* How many elements in 'array' */
10118 SSize_t i_s; /* index into the invlist array where 'start'
10120 SSize_t i_e = 0; /* And the index where 'end' should go */
10121 UV cur_highest; /* The highest code point in the inversion list
10122 upon entry to this function */
10124 /* This range becomes the whole inversion list if none already existed */
10125 if (invlist == NULL) {
10126 invlist = _new_invlist(2);
10127 _append_range_to_invlist(invlist, start, end);
10131 /* Likewise, if the inversion list is currently empty */
10132 len = _invlist_len(invlist);
10134 _append_range_to_invlist(invlist, start, end);
10138 /* Starting here, we have to know the internals of the list */
10139 array = invlist_array(invlist);
10141 /* If the new range ends higher than the current highest ... */
10142 cur_highest = invlist_highest(invlist);
10143 if (end > cur_highest) {
10145 /* If the whole range is higher, we can just append it */
10146 if (start > cur_highest) {
10147 _append_range_to_invlist(invlist, start, end);
10151 /* Otherwise, add the portion that is higher ... */
10152 _append_range_to_invlist(invlist, cur_highest + 1, end);
10154 /* ... and continue on below to handle the rest. As a result of the
10155 * above append, we know that the index of the end of the range is the
10156 * final even numbered one of the array. Recall that the final element
10157 * always starts a range that extends to infinity. If that range is in
10158 * the set (meaning the set goes from here to infinity), it will be an
10159 * even index, but if it isn't in the set, it's odd, and the final
10160 * range in the set is one less, which is even. */
10161 if (end == UV_MAX) {
10169 /* We have dealt with appending, now see about prepending. If the new
10170 * range starts lower than the current lowest ... */
10171 if (start < array[0]) {
10173 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10174 * Let the union code handle it, rather than having to know the
10175 * trickiness in two code places. */
10176 if (UNLIKELY(start == 0)) {
10179 range_invlist = _new_invlist(2);
10180 _append_range_to_invlist(range_invlist, start, end);
10182 _invlist_union(invlist, range_invlist, &invlist);
10184 SvREFCNT_dec_NN(range_invlist);
10189 /* If the whole new range comes before the first entry, and doesn't
10190 * extend it, we have to insert it as an additional range */
10191 if (end < array[0] - 1) {
10193 goto splice_in_new_range;
10196 /* Here the new range adjoins the existing first range, extending it
10200 /* And continue on below to handle the rest. We know that the index of
10201 * the beginning of the range is the first one of the array */
10204 else { /* Not prepending any part of the new range to the existing list.
10205 * Find where in the list it should go. This finds i_s, such that:
10206 * invlist[i_s] <= start < array[i_s+1]
10208 i_s = _invlist_search(invlist, start);
10211 /* At this point, any extending before the beginning of the inversion list
10212 * and/or after the end has been done. This has made it so that, in the
10213 * code below, each endpoint of the new range is either in a range that is
10214 * in the set, or is in a gap between two ranges that are. This means we
10215 * don't have to worry about exceeding the array bounds.
10217 * Find where in the list the new range ends (but we can skip this if we
10218 * have already determined what it is, or if it will be the same as i_s,
10219 * which we already have computed) */
10221 i_e = (start == end)
10223 : _invlist_search(invlist, end);
10226 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10227 * is a range that goes to infinity there is no element at invlist[i_e+1],
10228 * so only the first relation holds. */
10230 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10232 /* Here, the ranges on either side of the beginning of the new range
10233 * are in the set, and this range starts in the gap between them.
10235 * The new range extends the range above it downwards if the new range
10236 * ends at or above that range's start */
10237 const bool extends_the_range_above = ( end == UV_MAX
10238 || end + 1 >= array[i_s+1]);
10240 /* The new range extends the range below it upwards if it begins just
10241 * after where that range ends */
10242 if (start == array[i_s]) {
10244 /* If the new range fills the entire gap between the other ranges,
10245 * they will get merged together. Other ranges may also get
10246 * merged, depending on how many of them the new range spans. In
10247 * the general case, we do the merge later, just once, after we
10248 * figure out how many to merge. But in the case where the new
10249 * range exactly spans just this one gap (possibly extending into
10250 * the one above), we do the merge here, and an early exit. This
10251 * is done here to avoid having to special case later. */
10252 if (i_e - i_s <= 1) {
10254 /* If i_e - i_s == 1, it means that the new range terminates
10255 * within the range above, and hence 'extends_the_range_above'
10256 * must be true. (If the range above it extends to infinity,
10257 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10258 * will be 0, so no harm done.) */
10259 if (extends_the_range_above) {
10260 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10261 invlist_set_len(invlist,
10263 *(get_invlist_offset_addr(invlist)));
10267 /* Here, i_e must == i_s. We keep them in sync, as they apply
10268 * to the same range, and below we are about to decrement i_s
10273 /* Here, the new range is adjacent to the one below. (It may also
10274 * span beyond the range above, but that will get resolved later.)
10275 * Extend the range below to include this one. */
10276 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10278 start = array[i_s];
10280 else if (extends_the_range_above) {
10282 /* Here the new range only extends the range above it, but not the
10283 * one below. It merges with the one above. Again, we keep i_e
10284 * and i_s in sync if they point to the same range */
10289 array[i_s] = start;
10293 /* Here, we've dealt with the new range start extending any adjoining
10296 * If the new range extends to infinity, it is now the final one,
10297 * regardless of what was there before */
10298 if (UNLIKELY(end == UV_MAX)) {
10299 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10303 /* If i_e started as == i_s, it has also been dealt with,
10304 * and been updated to the new i_s, which will fail the following if */
10305 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10307 /* Here, the ranges on either side of the end of the new range are in
10308 * the set, and this range ends in the gap between them.
10310 * If this range is adjacent to (hence extends) the range above it, it
10311 * becomes part of that range; likewise if it extends the range below,
10312 * it becomes part of that range */
10313 if (end + 1 == array[i_e+1]) {
10315 array[i_e] = start;
10317 else if (start <= array[i_e]) {
10318 array[i_e] = end + 1;
10325 /* If the range fits entirely in an existing range (as possibly already
10326 * extended above), it doesn't add anything new */
10327 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10331 /* Here, no part of the range is in the list. Must add it. It will
10332 * occupy 2 more slots */
10333 splice_in_new_range:
10335 invlist_extend(invlist, len + 2);
10336 array = invlist_array(invlist);
10337 /* Move the rest of the array down two slots. Don't include any
10339 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10341 /* Do the actual splice */
10342 array[i_e+1] = start;
10343 array[i_e+2] = end + 1;
10344 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10348 /* Here the new range crossed the boundaries of a pre-existing range. The
10349 * code above has adjusted things so that both ends are in ranges that are
10350 * in the set. This means everything in between must also be in the set.
10351 * Just squash things together */
10352 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10353 invlist_set_len(invlist,
10355 *(get_invlist_offset_addr(invlist)));
10361 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10362 UV** other_elements_ptr)
10364 /* Create and return an inversion list whose contents are to be populated
10365 * by the caller. The caller gives the number of elements (in 'size') and
10366 * the very first element ('element0'). This function will set
10367 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10368 * are to be placed.
10370 * Obviously there is some trust involved that the caller will properly
10371 * fill in the other elements of the array.
10373 * (The first element needs to be passed in, as the underlying code does
10374 * things differently depending on whether it is zero or non-zero) */
10376 SV* invlist = _new_invlist(size);
10379 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10381 invlist = add_cp_to_invlist(invlist, element0);
10382 offset = *get_invlist_offset_addr(invlist);
10384 invlist_set_len(invlist, size, offset);
10385 *other_elements_ptr = invlist_array(invlist) + 1;
10391 #ifndef PERL_IN_XSUB_RE
10393 Perl__invlist_invert(pTHX_ SV* const invlist)
10395 /* Complement the input inversion list. This adds a 0 if the list didn't
10396 * have a zero; removes it otherwise. As described above, the data
10397 * structure is set up so that this is very efficient */
10399 PERL_ARGS_ASSERT__INVLIST_INVERT;
10401 assert(! invlist_is_iterating(invlist));
10403 /* The inverse of matching nothing is matching everything */
10404 if (_invlist_len(invlist) == 0) {
10405 _append_range_to_invlist(invlist, 0, UV_MAX);
10409 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10413 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10415 /* Return a new inversion list that is a copy of the input one, which is
10416 * unchanged. The new list will not be mortal even if the old one was. */
10418 const STRLEN nominal_length = _invlist_len(invlist);
10419 const STRLEN physical_length = SvCUR(invlist);
10420 const bool offset = *(get_invlist_offset_addr(invlist));
10422 PERL_ARGS_ASSERT_INVLIST_CLONE;
10424 if (new_invlist == NULL) {
10425 new_invlist = _new_invlist(nominal_length);
10428 sv_upgrade(new_invlist, SVt_INVLIST);
10429 initialize_invlist_guts(new_invlist, nominal_length);
10432 *(get_invlist_offset_addr(new_invlist)) = offset;
10433 invlist_set_len(new_invlist, nominal_length, offset);
10434 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10436 return new_invlist;
10441 PERL_STATIC_INLINE UV
10442 S_invlist_lowest(SV* const invlist)
10444 /* Returns the lowest code point that matches an inversion list. This API
10445 * has an ambiguity, as it returns 0 under either the lowest is actually
10446 * 0, or if the list is empty. If this distinction matters to you, check
10447 * for emptiness before calling this function */
10449 UV len = _invlist_len(invlist);
10452 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10458 array = invlist_array(invlist);
10464 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10466 /* Get the contents of an inversion list into a string SV so that they can
10467 * be printed out. If 'traditional_style' is TRUE, it uses the format
10468 * traditionally done for debug tracing; otherwise it uses a format
10469 * suitable for just copying to the output, with blanks between ranges and
10470 * a dash between range components */
10474 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10475 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10477 if (traditional_style) {
10478 output = newSVpvs("\n");
10481 output = newSVpvs("");
10484 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10486 assert(! invlist_is_iterating(invlist));
10488 invlist_iterinit(invlist);
10489 while (invlist_iternext(invlist, &start, &end)) {
10490 if (end == UV_MAX) {
10491 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10492 start, intra_range_delimiter,
10493 inter_range_delimiter);
10495 else if (end != start) {
10496 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10498 intra_range_delimiter,
10499 end, inter_range_delimiter);
10502 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10503 start, inter_range_delimiter);
10507 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10508 SvCUR_set(output, SvCUR(output) - 1);
10514 #ifndef PERL_IN_XSUB_RE
10516 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10517 const char * const indent, SV* const invlist)
10519 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10520 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10521 * the string 'indent'. The output looks like this:
10522 [0] 0x000A .. 0x000D
10524 [4] 0x2028 .. 0x2029
10525 [6] 0x3104 .. INFTY
10526 * This means that the first range of code points matched by the list are
10527 * 0xA through 0xD; the second range contains only the single code point
10528 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10529 * are used to define each range (except if the final range extends to
10530 * infinity, only a single element is needed). The array index of the
10531 * first element for the corresponding range is given in brackets. */
10536 PERL_ARGS_ASSERT__INVLIST_DUMP;
10538 if (invlist_is_iterating(invlist)) {
10539 Perl_dump_indent(aTHX_ level, file,
10540 "%sCan't dump inversion list because is in middle of iterating\n",
10545 invlist_iterinit(invlist);
10546 while (invlist_iternext(invlist, &start, &end)) {
10547 if (end == UV_MAX) {
10548 Perl_dump_indent(aTHX_ level, file,
10549 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10550 indent, (UV)count, start);
10552 else if (end != start) {
10553 Perl_dump_indent(aTHX_ level, file,
10554 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10555 indent, (UV)count, start, end);
10558 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10559 indent, (UV)count, start);
10567 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10569 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10571 /* Return a boolean as to if the two passed in inversion lists are
10572 * identical. The final argument, if TRUE, says to take the complement of
10573 * the second inversion list before doing the comparison */
10575 const UV len_a = _invlist_len(a);
10576 UV len_b = _invlist_len(b);
10578 const UV* array_a = NULL;
10579 const UV* array_b = NULL;
10581 PERL_ARGS_ASSERT__INVLISTEQ;
10583 /* This code avoids accessing the arrays unless it knows the length is
10588 return ! complement_b;
10592 array_a = invlist_array(a);
10596 array_b = invlist_array(b);
10599 /* If are to compare 'a' with the complement of b, set it
10600 * up so are looking at b's complement. */
10601 if (complement_b) {
10603 /* The complement of nothing is everything, so <a> would have to have
10604 * just one element, starting at zero (ending at infinity) */
10606 return (len_a == 1 && array_a[0] == 0);
10608 if (array_b[0] == 0) {
10610 /* Otherwise, to complement, we invert. Here, the first element is
10611 * 0, just remove it. To do this, we just pretend the array starts
10619 /* But if the first element is not zero, we pretend the list starts
10620 * at the 0 that is always stored immediately before the array. */
10626 return len_a == len_b
10627 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10633 * As best we can, determine the characters that can match the start of
10634 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10635 * can be false positive matches
10637 * Returns the invlist as a new SV*; it is the caller's responsibility to
10638 * call SvREFCNT_dec() when done with it.
10641 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10643 const U8 * s = (U8*)STRING(node);
10644 SSize_t bytelen = STR_LEN(node);
10646 /* Start out big enough for 2 separate code points */
10647 SV* invlist = _new_invlist(4);
10649 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10654 /* We punt and assume can match anything if the node begins
10655 * with a multi-character fold. Things are complicated. For
10656 * example, /ffi/i could match any of:
10657 * "\N{LATIN SMALL LIGATURE FFI}"
10658 * "\N{LATIN SMALL LIGATURE FF}I"
10659 * "F\N{LATIN SMALL LIGATURE FI}"
10660 * plus several other things; and making sure we have all the
10661 * possibilities is hard. */
10662 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10663 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10666 /* Any Latin1 range character can potentially match any
10667 * other depending on the locale, and in Turkic locales, U+130 and
10669 if (OP(node) == EXACTFL) {
10670 _invlist_union(invlist, PL_Latin1, &invlist);
10671 invlist = add_cp_to_invlist(invlist,
10672 LATIN_SMALL_LETTER_DOTLESS_I);
10673 invlist = add_cp_to_invlist(invlist,
10674 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10677 /* But otherwise, it matches at least itself. We can
10678 * quickly tell if it has a distinct fold, and if so,
10679 * it matches that as well */
10680 invlist = add_cp_to_invlist(invlist, uc);
10681 if (IS_IN_SOME_FOLD_L1(uc))
10682 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10685 /* Some characters match above-Latin1 ones under /i. This
10686 * is true of EXACTFL ones when the locale is UTF-8 */
10687 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10688 && (! isASCII(uc) || (OP(node) != EXACTFAA
10689 && OP(node) != EXACTFAA_NO_TRIE)))
10691 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10695 else { /* Pattern is UTF-8 */
10696 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10697 const U8* e = s + bytelen;
10700 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10702 /* The only code points that aren't folded in a UTF EXACTFish
10703 * node are the problematic ones in EXACTFL nodes */
10704 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10705 /* We need to check for the possibility that this EXACTFL
10706 * node begins with a multi-char fold. Therefore we fold
10707 * the first few characters of it so that we can make that
10713 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10715 *(d++) = (U8) toFOLD(*s);
10716 if (fc < 0) { /* Save the first fold */
10723 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10724 if (fc < 0) { /* Save the first fold */
10732 /* And set up so the code below that looks in this folded
10733 * buffer instead of the node's string */
10738 /* When we reach here 's' points to the fold of the first
10739 * character(s) of the node; and 'e' points to far enough along
10740 * the folded string to be just past any possible multi-char
10743 * Unlike the non-UTF-8 case, the macro for determining if a
10744 * string is a multi-char fold requires all the characters to
10745 * already be folded. This is because of all the complications
10746 * if not. Note that they are folded anyway, except in EXACTFL
10747 * nodes. Like the non-UTF case above, we punt if the node
10748 * begins with a multi-char fold */
10750 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10751 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10753 else { /* Single char fold */
10756 const U32 * remaining_folds;
10757 Size_t folds_count;
10759 /* It matches itself */
10760 invlist = add_cp_to_invlist(invlist, fc);
10762 /* ... plus all the things that fold to it, which are found in
10763 * PL_utf8_foldclosures */
10764 folds_count = _inverse_folds(fc, &first_fold,
10766 for (k = 0; k < folds_count; k++) {
10767 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10769 /* /aa doesn't allow folds between ASCII and non- */
10770 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10771 && isASCII(c) != isASCII(fc))
10776 invlist = add_cp_to_invlist(invlist, c);
10779 if (OP(node) == EXACTFL) {
10781 /* If either [iI] are present in an EXACTFL node the above code
10782 * should have added its normal case pair, but under a Turkish
10783 * locale they could match instead the case pairs from it. Add
10784 * those as potential matches as well */
10785 if (isALPHA_FOLD_EQ(fc, 'I')) {
10786 invlist = add_cp_to_invlist(invlist,
10787 LATIN_SMALL_LETTER_DOTLESS_I);
10788 invlist = add_cp_to_invlist(invlist,
10789 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10791 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10792 invlist = add_cp_to_invlist(invlist, 'I');
10794 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10795 invlist = add_cp_to_invlist(invlist, 'i');
10804 #undef HEADER_LENGTH
10805 #undef TO_INTERNAL_SIZE
10806 #undef FROM_INTERNAL_SIZE
10807 #undef INVLIST_VERSION_ID
10809 /* End of inversion list object */
10812 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10814 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10815 * constructs, and updates RExC_flags with them. On input, RExC_parse
10816 * should point to the first flag; it is updated on output to point to the
10817 * final ')' or ':'. There needs to be at least one flag, or this will
10820 /* for (?g), (?gc), and (?o) warnings; warning
10821 about (?c) will warn about (?g) -- japhy */
10823 #define WASTED_O 0x01
10824 #define WASTED_G 0x02
10825 #define WASTED_C 0x04
10826 #define WASTED_GC (WASTED_G|WASTED_C)
10827 I32 wastedflags = 0x00;
10828 U32 posflags = 0, negflags = 0;
10829 U32 *flagsp = &posflags;
10830 char has_charset_modifier = '\0';
10832 bool has_use_defaults = FALSE;
10833 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10834 int x_mod_count = 0;
10836 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10838 /* '^' as an initial flag sets certain defaults */
10839 if (UCHARAT(RExC_parse) == '^') {
10841 has_use_defaults = TRUE;
10842 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10843 cs = (RExC_uni_semantics)
10844 ? REGEX_UNICODE_CHARSET
10845 : REGEX_DEPENDS_CHARSET;
10846 set_regex_charset(&RExC_flags, cs);
10849 cs = get_regex_charset(RExC_flags);
10850 if ( cs == REGEX_DEPENDS_CHARSET
10851 && RExC_uni_semantics)
10853 cs = REGEX_UNICODE_CHARSET;
10857 while (RExC_parse < RExC_end) {
10858 /* && memCHRs("iogcmsx", *RExC_parse) */
10859 /* (?g), (?gc) and (?o) are useless here
10860 and must be globally applied -- japhy */
10861 if ((RExC_pm_flags & PMf_WILDCARD)) {
10862 if (flagsp == & negflags) {
10863 if (*RExC_parse == 'm') {
10865 /* diag_listed_as: Use of %s is not allowed in Unicode
10866 property wildcard subpatterns in regex; marked by <--
10868 vFAIL("Use of modifier '-m' is not allowed in Unicode"
10869 " property wildcard subpatterns");
10873 if (*RExC_parse == 's') {
10874 goto modifier_illegal_in_wildcard;
10879 switch (*RExC_parse) {
10881 /* Code for the imsxn flags */
10882 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10884 case LOCALE_PAT_MOD:
10885 if (has_charset_modifier) {
10886 goto excess_modifier;
10888 else if (flagsp == &negflags) {
10891 cs = REGEX_LOCALE_CHARSET;
10892 has_charset_modifier = LOCALE_PAT_MOD;
10894 case UNICODE_PAT_MOD:
10895 if (has_charset_modifier) {
10896 goto excess_modifier;
10898 else if (flagsp == &negflags) {
10901 cs = REGEX_UNICODE_CHARSET;
10902 has_charset_modifier = UNICODE_PAT_MOD;
10904 case ASCII_RESTRICT_PAT_MOD:
10905 if (flagsp == &negflags) {
10908 if (has_charset_modifier) {
10909 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10910 goto excess_modifier;
10912 /* Doubled modifier implies more restricted */
10913 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10916 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10918 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10920 case DEPENDS_PAT_MOD:
10921 if (has_use_defaults) {
10922 goto fail_modifiers;
10924 else if (flagsp == &negflags) {
10927 else if (has_charset_modifier) {
10928 goto excess_modifier;
10931 /* The dual charset means unicode semantics if the
10932 * pattern (or target, not known until runtime) are
10933 * utf8, or something in the pattern indicates unicode
10935 cs = (RExC_uni_semantics)
10936 ? REGEX_UNICODE_CHARSET
10937 : REGEX_DEPENDS_CHARSET;
10938 has_charset_modifier = DEPENDS_PAT_MOD;
10942 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10943 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10945 else if (has_charset_modifier == *(RExC_parse - 1)) {
10946 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10947 *(RExC_parse - 1));
10950 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10952 NOT_REACHED; /*NOTREACHED*/
10955 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10956 *(RExC_parse - 1));
10957 NOT_REACHED; /*NOTREACHED*/
10958 case GLOBAL_PAT_MOD: /* 'g' */
10959 if (RExC_pm_flags & PMf_WILDCARD) {
10960 goto modifier_illegal_in_wildcard;
10963 case ONCE_PAT_MOD: /* 'o' */
10964 if (ckWARN(WARN_REGEXP)) {
10965 const I32 wflagbit = *RExC_parse == 'o'
10968 if (! (wastedflags & wflagbit) ) {
10969 wastedflags |= wflagbit;
10970 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10973 "Useless (%s%c) - %suse /%c modifier",
10974 flagsp == &negflags ? "?-" : "?",
10976 flagsp == &negflags ? "don't " : "",
10983 case CONTINUE_PAT_MOD: /* 'c' */
10984 if (RExC_pm_flags & PMf_WILDCARD) {
10985 goto modifier_illegal_in_wildcard;
10987 if (ckWARN(WARN_REGEXP)) {
10988 if (! (wastedflags & WASTED_C) ) {
10989 wastedflags |= WASTED_GC;
10990 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10993 "Useless (%sc) - %suse /gc modifier",
10994 flagsp == &negflags ? "?-" : "?",
10995 flagsp == &negflags ? "don't " : ""
11000 case KEEPCOPY_PAT_MOD: /* 'p' */
11001 if (RExC_pm_flags & PMf_WILDCARD) {
11002 goto modifier_illegal_in_wildcard;
11004 if (flagsp == &negflags) {
11005 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11007 *flagsp |= RXf_PMf_KEEPCOPY;
11011 /* A flag is a default iff it is following a minus, so
11012 * if there is a minus, it means will be trying to
11013 * re-specify a default which is an error */
11014 if (has_use_defaults || flagsp == &negflags) {
11015 goto fail_modifiers;
11017 flagsp = &negflags;
11018 wastedflags = 0; /* reset so (?g-c) warns twice */
11024 if ( (RExC_pm_flags & PMf_WILDCARD)
11025 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11028 /* diag_listed_as: Use of %s is not allowed in Unicode
11029 property wildcard subpatterns in regex; marked by <--
11031 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11032 " property wildcard subpatterns",
11033 has_charset_modifier);
11036 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11037 negflags |= RXf_PMf_EXTENDED_MORE;
11039 RExC_flags |= posflags;
11041 if (negflags & RXf_PMf_EXTENDED) {
11042 negflags |= RXf_PMf_EXTENDED_MORE;
11044 RExC_flags &= ~negflags;
11045 set_regex_charset(&RExC_flags, cs);
11050 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11051 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11052 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11053 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11054 NOT_REACHED; /*NOTREACHED*/
11057 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11060 vFAIL("Sequence (?... not terminated");
11062 modifier_illegal_in_wildcard:
11064 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11065 subpatterns in regex; marked by <-- HERE in m/%s/ */
11066 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11067 " subpatterns", *(RExC_parse - 1));
11071 - reg - regular expression, i.e. main body or parenthesized thing
11073 * Caller must absorb opening parenthesis.
11075 * Combining parenthesis handling with the base level of regular expression
11076 * is a trifle forced, but the need to tie the tails of the branches to what
11077 * follows makes it hard to avoid.
11079 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11081 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11083 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11086 STATIC regnode_offset
11087 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11089 char * parse_start,
11093 regnode_offset ret;
11094 char* name_start = RExC_parse;
11096 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11097 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11099 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11101 if (RExC_parse == name_start || *RExC_parse != ch) {
11102 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11103 vFAIL2("Sequence %.3s... not terminated", parse_start);
11107 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11108 RExC_rxi->data->data[num]=(void*)sv_dat;
11109 SvREFCNT_inc_simple_void_NN(sv_dat);
11112 ret = reganode(pRExC_state,
11115 : (ASCII_FOLD_RESTRICTED)
11117 : (AT_LEAST_UNI_SEMANTICS)
11123 *flagp |= HASWIDTH;
11125 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11126 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11128 nextchar(pRExC_state);
11132 /* On success, returns the offset at which any next node should be placed into
11133 * the regex engine program being compiled.
11135 * Returns 0 otherwise, with *flagp set to indicate why:
11136 * TRYAGAIN at the end of (?) that only sets flags.
11137 * RESTART_PARSE if the parse needs to be restarted, or'd with
11138 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11139 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11141 STATIC regnode_offset
11142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11143 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11144 * 2 is like 1, but indicates that nextchar() has been called to advance
11145 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11146 * this flag alerts us to the need to check for that */
11148 regnode_offset ret = 0; /* Will be the head of the group. */
11150 regnode_offset lastbr;
11151 regnode_offset ender = 0;
11154 U32 oregflags = RExC_flags;
11155 bool have_branch = 0;
11157 I32 freeze_paren = 0;
11158 I32 after_freeze = 0;
11159 I32 num; /* numeric backreferences */
11160 SV * max_open; /* Max number of unclosed parens */
11162 char * parse_start = RExC_parse; /* MJD */
11163 char * const oregcomp_parse = RExC_parse;
11165 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11167 PERL_ARGS_ASSERT_REG;
11168 DEBUG_PARSE("reg ");
11170 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11172 if (!SvIOK(max_open)) {
11173 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11175 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11177 vFAIL("Too many nested open parens");
11180 *flagp = 0; /* Initialize. */
11182 if (RExC_in_lookbehind) {
11183 RExC_in_lookbehind++;
11185 if (RExC_in_lookahead) {
11186 RExC_in_lookahead++;
11189 /* Having this true makes it feasible to have a lot fewer tests for the
11190 * parse pointer being in scope. For example, we can write
11191 * while(isFOO(*RExC_parse)) RExC_parse++;
11193 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11195 assert(*RExC_end == '\0');
11197 /* Make an OPEN node, if parenthesized. */
11200 /* Under /x, space and comments can be gobbled up between the '(' and
11201 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11202 * intervening space, as the sequence is a token, and a token should be
11204 bool has_intervening_patws = (paren == 2)
11205 && *(RExC_parse - 1) != '(';
11207 if (RExC_parse >= RExC_end) {
11208 vFAIL("Unmatched (");
11211 if (paren == 'r') { /* Atomic script run */
11215 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11216 char *start_verb = RExC_parse + 1;
11218 char *start_arg = NULL;
11219 unsigned char op = 0;
11220 int arg_required = 0;
11221 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11222 bool has_upper = FALSE;
11224 if (has_intervening_patws) {
11225 RExC_parse++; /* past the '*' */
11227 /* For strict backwards compatibility, don't change the message
11228 * now that we also have lowercase operands */
11229 if (isUPPER(*RExC_parse)) {
11230 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11233 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11236 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11237 if ( *RExC_parse == ':' ) {
11238 start_arg = RExC_parse + 1;
11242 if (isUPPER(*RExC_parse)) {
11248 RExC_parse += UTF8SKIP(RExC_parse);
11251 verb_len = RExC_parse - start_verb;
11253 if (RExC_parse >= RExC_end) {
11254 goto unterminated_verb_pattern;
11257 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11258 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11259 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11261 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11262 unterminated_verb_pattern:
11264 vFAIL("Unterminated verb pattern argument");
11267 vFAIL("Unterminated '(*...' argument");
11271 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11273 vFAIL("Unterminated verb pattern");
11276 vFAIL("Unterminated '(*...' construct");
11281 /* Here, we know that RExC_parse < RExC_end */
11283 switch ( *start_verb ) {
11284 case 'A': /* (*ACCEPT) */
11285 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11287 internal_argval = RExC_nestroot;
11290 case 'C': /* (*COMMIT) */
11291 if ( memEQs(start_verb, verb_len,"COMMIT") )
11294 case 'F': /* (*FAIL) */
11295 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11299 case ':': /* (*:NAME) */
11300 case 'M': /* (*MARK:NAME) */
11301 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11306 case 'P': /* (*PRUNE) */
11307 if ( memEQs(start_verb, verb_len,"PRUNE") )
11310 case 'S': /* (*SKIP) */
11311 if ( memEQs(start_verb, verb_len,"SKIP") )
11314 case 'T': /* (*THEN) */
11315 /* [19:06] <TimToady> :: is then */
11316 if ( memEQs(start_verb, verb_len,"THEN") ) {
11318 RExC_seen |= REG_CUTGROUP_SEEN;
11322 if ( memEQs(start_verb, verb_len, "asr")
11323 || memEQs(start_verb, verb_len, "atomic_script_run"))
11325 paren = 'r'; /* Mnemonic: recursed run */
11328 else if (memEQs(start_verb, verb_len, "atomic")) {
11329 paren = 't'; /* AtOMIC */
11330 goto alpha_assertions;
11334 if ( memEQs(start_verb, verb_len, "plb")
11335 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11338 goto lookbehind_alpha_assertions;
11340 else if ( memEQs(start_verb, verb_len, "pla")
11341 || memEQs(start_verb, verb_len, "positive_lookahead"))
11344 goto alpha_assertions;
11348 if ( memEQs(start_verb, verb_len, "nlb")
11349 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11352 goto lookbehind_alpha_assertions;
11354 else if ( memEQs(start_verb, verb_len, "nla")
11355 || memEQs(start_verb, verb_len, "negative_lookahead"))
11358 goto alpha_assertions;
11362 if ( memEQs(start_verb, verb_len, "sr")
11363 || memEQs(start_verb, verb_len, "script_run"))
11365 regnode_offset atomic;
11371 /* This indicates Unicode rules. */
11372 REQUIRE_UNI_RULES(flagp, 0);
11378 RExC_parse = start_arg;
11380 if (RExC_in_script_run) {
11382 /* Nested script runs are treated as no-ops, because
11383 * if the nested one fails, the outer one must as
11384 * well. It could fail sooner, and avoid (??{} with
11385 * side effects, but that is explicitly documented as
11386 * undefined behavior. */
11390 if (paren == 's') {
11395 /* But, the atomic part of a nested atomic script run
11396 * isn't a no-op, but can be treated just like a '(?>'
11402 if (paren == 's') {
11403 /* Here, we're starting a new regular script run */
11404 ret = reg_node(pRExC_state, SROPEN);
11405 RExC_in_script_run = 1;
11410 /* Here, we are starting an atomic script run. This is
11411 * handled by recursing to deal with the atomic portion
11412 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11414 ret = reg_node(pRExC_state, SROPEN);
11416 RExC_in_script_run = 1;
11418 atomic = reg(pRExC_state, 'r', &flags, depth);
11419 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11420 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11424 if (! REGTAIL(pRExC_state, ret, atomic)) {
11425 REQUIRE_BRANCHJ(flagp, 0);
11428 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11431 REQUIRE_BRANCHJ(flagp, 0);
11434 RExC_in_script_run = 0;
11440 lookbehind_alpha_assertions:
11441 RExC_seen |= REG_LOOKBEHIND_SEEN;
11442 RExC_in_lookbehind++;
11447 RExC_seen_zerolen++;
11453 /* An empty negative lookahead assertion simply is failure */
11454 if (paren == 'A' && RExC_parse == start_arg) {
11455 ret=reganode(pRExC_state, OPFAIL, 0);
11456 nextchar(pRExC_state);
11460 RExC_parse = start_arg;
11465 "'(*%" UTF8f "' requires a terminating ':'",
11466 UTF8fARG(UTF, verb_len, start_verb));
11467 NOT_REACHED; /*NOTREACHED*/
11469 } /* End of switch */
11472 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11474 if (has_upper || verb_len == 0) {
11476 "Unknown verb pattern '%" UTF8f "'",
11477 UTF8fARG(UTF, verb_len, start_verb));
11481 "Unknown '(*...)' construct '%" UTF8f "'",
11482 UTF8fARG(UTF, verb_len, start_verb));
11485 if ( RExC_parse == start_arg ) {
11488 if ( arg_required && !start_arg ) {
11489 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11490 (int) verb_len, start_verb);
11492 if (internal_argval == -1) {
11493 ret = reganode(pRExC_state, op, 0);
11495 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11497 RExC_seen |= REG_VERBARG_SEEN;
11499 SV *sv = newSVpvn( start_arg,
11500 RExC_parse - start_arg);
11501 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11502 STR_WITH_LEN("S"));
11503 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11504 FLAGS(REGNODE_p(ret)) = 1;
11506 FLAGS(REGNODE_p(ret)) = 0;
11508 if ( internal_argval != -1 )
11509 ARG2L_SET(REGNODE_p(ret), internal_argval);
11510 nextchar(pRExC_state);
11513 else if (*RExC_parse == '?') { /* (?...) */
11514 bool is_logical = 0;
11515 const char * const seqstart = RExC_parse;
11516 const char * endptr;
11517 const char non_existent_group_msg[]
11518 = "Reference to nonexistent group";
11519 const char impossible_group[] = "Invalid reference to group";
11521 if (has_intervening_patws) {
11523 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11526 RExC_parse++; /* past the '?' */
11527 paren = *RExC_parse; /* might be a trailing NUL, if not
11529 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11530 if (RExC_parse > RExC_end) {
11533 ret = 0; /* For look-ahead/behind. */
11536 case 'P': /* (?P...) variants for those used to PCRE/Python */
11537 paren = *RExC_parse;
11538 if ( paren == '<') { /* (?P<...>) named capture */
11540 if (RExC_parse >= RExC_end) {
11541 vFAIL("Sequence (?P<... not terminated");
11543 goto named_capture;
11545 else if (paren == '>') { /* (?P>name) named recursion */
11547 if (RExC_parse >= RExC_end) {
11548 vFAIL("Sequence (?P>... not terminated");
11550 goto named_recursion;
11552 else if (paren == '=') { /* (?P=...) named backref */
11554 return handle_named_backref(pRExC_state, flagp,
11557 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11558 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11559 vFAIL3("Sequence (%.*s...) not recognized",
11560 (int) (RExC_parse - seqstart), seqstart);
11561 NOT_REACHED; /*NOTREACHED*/
11562 case '<': /* (?<...) */
11563 /* If you want to support (?<*...), first reconcile with GH #17363 */
11564 if (*RExC_parse == '!')
11566 else if (*RExC_parse != '=')
11573 case '\'': /* (?'...') */
11574 name_start = RExC_parse;
11575 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11576 if ( RExC_parse == name_start
11577 || RExC_parse >= RExC_end
11578 || *RExC_parse != paren)
11580 vFAIL2("Sequence (?%c... not terminated",
11581 paren=='>' ? '<' : (char) paren);
11586 if (!svname) /* shouldn't happen */
11588 "panic: reg_scan_name returned NULL");
11589 if (!RExC_paren_names) {
11590 RExC_paren_names= newHV();
11591 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11593 RExC_paren_name_list= newAV();
11594 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11597 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11599 sv_dat = HeVAL(he_str);
11601 /* croak baby croak */
11603 "panic: paren_name hash element allocation failed");
11604 } else if ( SvPOK(sv_dat) ) {
11605 /* (?|...) can mean we have dupes so scan to check
11606 its already been stored. Maybe a flag indicating
11607 we are inside such a construct would be useful,
11608 but the arrays are likely to be quite small, so
11609 for now we punt -- dmq */
11610 IV count = SvIV(sv_dat);
11611 I32 *pv = (I32*)SvPVX(sv_dat);
11613 for ( i = 0 ; i < count ; i++ ) {
11614 if ( pv[i] == RExC_npar ) {
11620 pv = (I32*)SvGROW(sv_dat,
11621 SvCUR(sv_dat) + sizeof(I32)+1);
11622 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11623 pv[count] = RExC_npar;
11624 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11627 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11628 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11631 SvIV_set(sv_dat, 1);
11634 /* Yes this does cause a memory leak in debugging Perls
11636 if (!av_store(RExC_paren_name_list,
11637 RExC_npar, SvREFCNT_inc_NN(svname)))
11638 SvREFCNT_dec_NN(svname);
11641 /*sv_dump(sv_dat);*/
11643 nextchar(pRExC_state);
11645 goto capturing_parens;
11648 RExC_seen |= REG_LOOKBEHIND_SEEN;
11649 RExC_in_lookbehind++;
11651 if (RExC_parse >= RExC_end) {
11652 vFAIL("Sequence (?... not terminated");
11654 RExC_seen_zerolen++;
11656 case '=': /* (?=...) */
11657 RExC_seen_zerolen++;
11658 RExC_in_lookahead++;
11660 case '!': /* (?!...) */
11661 RExC_seen_zerolen++;
11662 /* check if we're really just a "FAIL" assertion */
11663 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11664 FALSE /* Don't force to /x */ );
11665 if (*RExC_parse == ')') {
11666 ret=reganode(pRExC_state, OPFAIL, 0);
11667 nextchar(pRExC_state);
11671 case '|': /* (?|...) */
11672 /* branch reset, behave like a (?:...) except that
11673 buffers in alternations share the same numbers */
11675 after_freeze = freeze_paren = RExC_npar;
11677 /* XXX This construct currently requires an extra pass.
11678 * Investigation would be required to see if that could be
11680 REQUIRE_PARENS_PASS;
11682 case ':': /* (?:...) */
11683 case '>': /* (?>...) */
11685 case '$': /* (?$...) */
11686 case '@': /* (?@...) */
11687 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11689 case '0' : /* (?0) */
11690 case 'R' : /* (?R) */
11691 if (RExC_parse == RExC_end || *RExC_parse != ')')
11692 FAIL("Sequence (?R) not terminated");
11694 RExC_seen |= REG_RECURSE_SEEN;
11696 /* XXX These constructs currently require an extra pass.
11697 * It probably could be changed */
11698 REQUIRE_PARENS_PASS;
11700 *flagp |= POSTPONED;
11701 goto gen_recurse_regop;
11703 /* named and numeric backreferences */
11704 case '&': /* (?&NAME) */
11705 parse_start = RExC_parse - 1;
11708 SV *sv_dat = reg_scan_name(pRExC_state,
11709 REG_RSN_RETURN_DATA);
11710 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11712 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11713 vFAIL("Sequence (?&... not terminated");
11714 goto gen_recurse_regop;
11717 if (! inRANGE(RExC_parse[0], '1', '9')) {
11719 vFAIL("Illegal pattern");
11721 goto parse_recursion;
11723 case '-': /* (?-1) */
11724 if (! inRANGE(RExC_parse[0], '1', '9')) {
11725 RExC_parse--; /* rewind to let it be handled later */
11729 case '1': case '2': case '3': case '4': /* (?1) */
11730 case '5': case '6': case '7': case '8': case '9':
11731 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11734 bool is_neg = FALSE;
11736 parse_start = RExC_parse - 1; /* MJD */
11737 if (*RExC_parse == '-') {
11742 if (grok_atoUV(RExC_parse, &unum, &endptr)
11746 RExC_parse = (char*)endptr;
11748 else { /* Overflow, or something like that. Position
11749 beyond all digits for the message */
11750 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
11753 vFAIL(impossible_group);
11756 /* -num is always representable on 1 and 2's complement
11761 if (*RExC_parse!=')')
11762 vFAIL("Expecting close bracket");
11765 if (paren == '-' || paren == '+') {
11767 /* Don't overflow */
11768 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11770 vFAIL(impossible_group);
11774 Diagram of capture buffer numbering.
11775 Top line is the normal capture buffer numbers
11776 Bottom line is the negative indexing as from
11780 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11781 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11782 - 5 4 3 2 1 X Y x x
11784 Resolve to absolute group. Recall that RExC_npar is +1 of
11785 the actual parenthesis group number. For lookahead, we
11786 have to compensate for that. Using the above example, when
11787 we get to Y in the parse, num is 2 and RExC_npar is 6. We
11788 want 7 for +2, and 4 for -2.
11790 if ( paren == '+' ) {
11796 if (paren == '-' && num < 1) {
11798 vFAIL(non_existent_group_msg);
11802 if (num >= RExC_npar) {
11804 /* It might be a forward reference; we can't fail until we
11805 * know, by completing the parse to get all the groups, and
11806 * then reparsing */
11807 if (ALL_PARENS_COUNTED) {
11808 if (num >= RExC_total_parens) {
11810 vFAIL(non_existent_group_msg);
11814 REQUIRE_PARENS_PASS;
11818 /* We keep track how many GOSUB items we have produced.
11819 To start off the ARG2L() of the GOSUB holds its "id",
11820 which is used later in conjunction with RExC_recurse
11821 to calculate the offset we need to jump for the GOSUB,
11822 which it will store in the final representation.
11823 We have to defer the actual calculation until much later
11824 as the regop may move.
11826 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11827 RExC_recurse_count++;
11828 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11829 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11830 22, "| |", (int)(depth * 2 + 1), "",
11831 (UV)ARG(REGNODE_p(ret)),
11832 (IV)ARG2L(REGNODE_p(ret))));
11833 RExC_seen |= REG_RECURSE_SEEN;
11835 Set_Node_Length(REGNODE_p(ret),
11836 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11837 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11839 *flagp |= POSTPONED;
11840 assert(*RExC_parse == ')');
11841 nextchar(pRExC_state);
11846 case '?': /* (??...) */
11848 if (*RExC_parse != '{') {
11849 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11850 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11852 "Sequence (%" UTF8f "...) not recognized",
11853 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11854 NOT_REACHED; /*NOTREACHED*/
11856 *flagp |= POSTPONED;
11860 case '{': /* (?{...}) */
11863 struct reg_code_block *cb;
11866 RExC_seen_zerolen++;
11868 if ( !pRExC_state->code_blocks
11869 || pRExC_state->code_index
11870 >= pRExC_state->code_blocks->count
11871 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11872 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11875 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11876 FAIL("panic: Sequence (?{...}): no code block found\n");
11877 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11879 /* this is a pre-compiled code block (?{...}) */
11880 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11881 RExC_parse = RExC_start + cb->end;
11883 if (cb->src_regex) {
11884 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11885 RExC_rxi->data->data[n] =
11886 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11887 RExC_rxi->data->data[n+1] = (void*)o;
11890 n = add_data(pRExC_state,
11891 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11892 RExC_rxi->data->data[n] = (void*)o;
11894 pRExC_state->code_index++;
11895 nextchar(pRExC_state);
11898 regnode_offset eval;
11899 ret = reg_node(pRExC_state, LOGICAL);
11901 eval = reg2Lanode(pRExC_state, EVAL,
11904 /* for later propagation into (??{})
11906 RExC_flags & RXf_PMf_COMPILETIME
11908 FLAGS(REGNODE_p(ret)) = 2;
11909 if (! REGTAIL(pRExC_state, ret, eval)) {
11910 REQUIRE_BRANCHJ(flagp, 0);
11912 /* deal with the length of this later - MJD */
11915 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11916 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11917 Set_Node_Offset(REGNODE_p(ret), parse_start);
11920 case '(': /* (?(?{...})...) and (?(?=...)...) */
11923 const int DEFINE_len = sizeof("DEFINE") - 1;
11924 if ( RExC_parse < RExC_end - 1
11925 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11926 && ( RExC_parse[1] == '='
11927 || RExC_parse[1] == '!'
11928 || RExC_parse[1] == '<'
11929 || RExC_parse[1] == '{'))
11930 || ( RExC_parse[0] == '*' /* (?(*...)) */
11931 && ( memBEGINs(RExC_parse + 1,
11932 (Size_t) (RExC_end - (RExC_parse + 1)),
11934 || memBEGINs(RExC_parse + 1,
11935 (Size_t) (RExC_end - (RExC_parse + 1)),
11937 || memBEGINs(RExC_parse + 1,
11938 (Size_t) (RExC_end - (RExC_parse + 1)),
11940 || memBEGINs(RExC_parse + 1,
11941 (Size_t) (RExC_end - (RExC_parse + 1)),
11943 || memBEGINs(RExC_parse + 1,
11944 (Size_t) (RExC_end - (RExC_parse + 1)),
11945 "positive_lookahead:")
11946 || memBEGINs(RExC_parse + 1,
11947 (Size_t) (RExC_end - (RExC_parse + 1)),
11948 "positive_lookbehind:")
11949 || memBEGINs(RExC_parse + 1,
11950 (Size_t) (RExC_end - (RExC_parse + 1)),
11951 "negative_lookahead:")
11952 || memBEGINs(RExC_parse + 1,
11953 (Size_t) (RExC_end - (RExC_parse + 1)),
11954 "negative_lookbehind:"))))
11955 ) { /* Lookahead or eval. */
11957 regnode_offset tail;
11959 ret = reg_node(pRExC_state, LOGICAL);
11960 FLAGS(REGNODE_p(ret)) = 1;
11962 tail = reg(pRExC_state, 1, &flag, depth+1);
11963 RETURN_FAIL_ON_RESTART(flag, flagp);
11964 if (! REGTAIL(pRExC_state, ret, tail)) {
11965 REQUIRE_BRANCHJ(flagp, 0);
11969 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11970 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11972 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11973 char *name_start= RExC_parse++;
11975 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11976 if ( RExC_parse == name_start
11977 || RExC_parse >= RExC_end
11978 || *RExC_parse != ch)
11980 vFAIL2("Sequence (?(%c... not terminated",
11981 (ch == '>' ? '<' : ch));
11985 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11986 RExC_rxi->data->data[num]=(void*)sv_dat;
11987 SvREFCNT_inc_simple_void_NN(sv_dat);
11989 ret = reganode(pRExC_state, GROUPPN, num);
11990 goto insert_if_check_paren;
11992 else if (memBEGINs(RExC_parse,
11993 (STRLEN) (RExC_end - RExC_parse),
11996 ret = reganode(pRExC_state, DEFINEP, 0);
11997 RExC_parse += DEFINE_len;
11999 goto insert_if_check_paren;
12001 else if (RExC_parse[0] == 'R') {
12003 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
12004 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12005 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12008 if (RExC_parse[0] == '0') {
12012 else if (inRANGE(RExC_parse[0], '1', '9')) {
12015 if (grok_atoUV(RExC_parse, &uv, &endptr)
12018 parno = (I32)uv + 1;
12019 RExC_parse = (char*)endptr;
12021 /* else "Switch condition not recognized" below */
12022 } else if (RExC_parse[0] == '&') {
12025 sv_dat = reg_scan_name(pRExC_state,
12026 REG_RSN_RETURN_DATA);
12028 parno = 1 + *((I32 *)SvPVX(sv_dat));
12030 ret = reganode(pRExC_state, INSUBP, parno);
12031 goto insert_if_check_paren;
12033 else if (inRANGE(RExC_parse[0], '1', '9')) {
12038 if (grok_atoUV(RExC_parse, &uv, &endptr)
12042 RExC_parse = (char*)endptr;
12045 vFAIL("panic: grok_atoUV returned FALSE");
12047 ret = reganode(pRExC_state, GROUPP, parno);
12049 insert_if_check_paren:
12050 if (UCHARAT(RExC_parse) != ')') {
12052 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12054 vFAIL("Switch condition not recognized");
12056 nextchar(pRExC_state);
12058 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12061 REQUIRE_BRANCHJ(flagp, 0);
12063 br = regbranch(pRExC_state, &flags, 1, depth+1);
12065 RETURN_FAIL_ON_RESTART(flags,flagp);
12066 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12069 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12072 REQUIRE_BRANCHJ(flagp, 0);
12074 c = UCHARAT(RExC_parse);
12075 nextchar(pRExC_state);
12076 if (flags&HASWIDTH)
12077 *flagp |= HASWIDTH;
12080 vFAIL("(?(DEFINE)....) does not allow branches");
12082 /* Fake one for optimizer. */
12083 lastbr = reganode(pRExC_state, IFTHEN, 0);
12085 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12086 RETURN_FAIL_ON_RESTART(flags, flagp);
12087 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12090 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12091 REQUIRE_BRANCHJ(flagp, 0);
12093 if (flags&HASWIDTH)
12094 *flagp |= HASWIDTH;
12095 c = UCHARAT(RExC_parse);
12096 nextchar(pRExC_state);
12101 if (RExC_parse >= RExC_end)
12102 vFAIL("Switch (?(condition)... not terminated");
12104 vFAIL("Switch (?(condition)... contains too many branches");
12106 ender = reg_node(pRExC_state, TAIL);
12107 if (! REGTAIL(pRExC_state, br, ender)) {
12108 REQUIRE_BRANCHJ(flagp, 0);
12111 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12112 REQUIRE_BRANCHJ(flagp, 0);
12114 if (! REGTAIL(pRExC_state,
12117 NEXTOPER(REGNODE_p(lastbr)))),
12120 REQUIRE_BRANCHJ(flagp, 0);
12124 if (! REGTAIL(pRExC_state, ret, ender)) {
12125 REQUIRE_BRANCHJ(flagp, 0);
12127 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12128 RExC_size++; /* XXX WHY do we need this?!!
12129 For large programs it seems to be required
12130 but I can't figure out why. -- dmq*/
12135 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12137 vFAIL("Unknown switch condition (?(...))");
12139 case '[': /* (?[ ... ]) */
12140 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12142 case 0: /* A NUL */
12143 RExC_parse--; /* for vFAIL to print correctly */
12144 vFAIL("Sequence (? incomplete");
12148 if (RExC_strict) { /* [perl #132851] */
12149 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12152 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12154 default: /* e.g., (?i) */
12155 RExC_parse = (char *) seqstart + 1;
12157 parse_lparen_question_flags(pRExC_state);
12158 if (UCHARAT(RExC_parse) != ':') {
12159 if (RExC_parse < RExC_end)
12160 nextchar(pRExC_state);
12165 nextchar(pRExC_state);
12170 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12174 if (! ALL_PARENS_COUNTED) {
12175 /* If we are in our first pass through (and maybe only pass),
12176 * we need to allocate memory for the capturing parentheses
12180 if (!RExC_parens_buf_size) {
12181 /* first guess at number of parens we might encounter */
12182 RExC_parens_buf_size = 10;
12184 /* setup RExC_open_parens, which holds the address of each
12185 * OPEN tag, and to make things simpler for the 0 index the
12186 * start of the program - this is used later for offsets */
12187 Newxz(RExC_open_parens, RExC_parens_buf_size,
12189 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12191 /* setup RExC_close_parens, which holds the address of each
12192 * CLOSE tag, and to make things simpler for the 0 index
12193 * the end of the program - this is used later for offsets
12195 Newxz(RExC_close_parens, RExC_parens_buf_size,
12197 /* we dont know where end op starts yet, so we dont need to
12198 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12201 else if (RExC_npar > RExC_parens_buf_size) {
12202 I32 old_size = RExC_parens_buf_size;
12204 RExC_parens_buf_size *= 2;
12206 Renew(RExC_open_parens, RExC_parens_buf_size,
12208 Zero(RExC_open_parens + old_size,
12209 RExC_parens_buf_size - old_size, regnode_offset);
12211 Renew(RExC_close_parens, RExC_parens_buf_size,
12213 Zero(RExC_close_parens + old_size,
12214 RExC_parens_buf_size - old_size, regnode_offset);
12218 ret = reganode(pRExC_state, OPEN, parno);
12219 if (!RExC_nestroot)
12220 RExC_nestroot = parno;
12221 if (RExC_open_parens && !RExC_open_parens[parno])
12223 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12224 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12225 22, "| |", (int)(depth * 2 + 1), "",
12227 RExC_open_parens[parno]= ret;
12230 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12231 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12234 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12243 /* Pick up the branches, linking them together. */
12244 parse_start = RExC_parse; /* MJD */
12245 br = regbranch(pRExC_state, &flags, 1, depth+1);
12247 /* branch_len = (paren != 0); */
12250 RETURN_FAIL_ON_RESTART(flags, flagp);
12251 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12253 if (*RExC_parse == '|') {
12254 if (RExC_use_BRANCHJ) {
12255 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12258 reginsert(pRExC_state, BRANCH, br, depth+1);
12259 Set_Node_Length(REGNODE_p(br), paren != 0);
12260 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12264 else if (paren == ':') {
12265 *flagp |= flags&SIMPLE;
12267 if (is_open) { /* Starts with OPEN. */
12268 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12269 REQUIRE_BRANCHJ(flagp, 0);
12272 else if (paren != '?') /* Not Conditional */
12274 *flagp |= flags & (HASWIDTH | POSTPONED);
12276 while (*RExC_parse == '|') {
12277 if (RExC_use_BRANCHJ) {
12280 ender = reganode(pRExC_state, LONGJMP, 0);
12282 /* Append to the previous. */
12283 shut_gcc_up = REGTAIL(pRExC_state,
12284 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12286 PERL_UNUSED_VAR(shut_gcc_up);
12288 nextchar(pRExC_state);
12289 if (freeze_paren) {
12290 if (RExC_npar > after_freeze)
12291 after_freeze = RExC_npar;
12292 RExC_npar = freeze_paren;
12294 br = regbranch(pRExC_state, &flags, 0, depth+1);
12297 RETURN_FAIL_ON_RESTART(flags, flagp);
12298 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12300 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12301 REQUIRE_BRANCHJ(flagp, 0);
12304 *flagp |= flags & (HASWIDTH | POSTPONED);
12307 if (have_branch || paren != ':') {
12310 /* Make a closing node, and hook it on the end. */
12313 ender = reg_node(pRExC_state, TAIL);
12316 ender = reganode(pRExC_state, CLOSE, parno);
12317 if ( RExC_close_parens ) {
12318 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12319 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12320 22, "| |", (int)(depth * 2 + 1), "",
12321 (IV)parno, ender));
12322 RExC_close_parens[parno]= ender;
12323 if (RExC_nestroot == parno)
12326 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12327 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12330 ender = reg_node(pRExC_state, SRCLOSE);
12331 RExC_in_script_run = 0;
12341 *flagp &= ~HASWIDTH;
12343 case 't': /* aTomic */
12345 ender = reg_node(pRExC_state, SUCCEED);
12348 ender = reg_node(pRExC_state, END);
12349 assert(!RExC_end_op); /* there can only be one! */
12350 RExC_end_op = REGNODE_p(ender);
12351 if (RExC_close_parens) {
12352 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12353 "%*s%*s Setting close paren #0 (END) to %zu\n",
12354 22, "| |", (int)(depth * 2 + 1), "",
12357 RExC_close_parens[0]= ender;
12362 DEBUG_PARSE_MSG("lsbr");
12363 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12364 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12365 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12366 SvPV_nolen_const(RExC_mysv1),
12368 SvPV_nolen_const(RExC_mysv2),
12370 (IV)(ender - lastbr)
12373 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12374 REQUIRE_BRANCHJ(flagp, 0);
12378 char is_nothing= 1;
12380 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12382 /* Hook the tails of the branches to the closing node. */
12383 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12384 const U8 op = PL_regkind[OP(br)];
12385 if (op == BRANCH) {
12386 if (! REGTAIL_STUDY(pRExC_state,
12387 REGNODE_OFFSET(NEXTOPER(br)),
12390 REQUIRE_BRANCHJ(flagp, 0);
12392 if ( OP(NEXTOPER(br)) != NOTHING
12393 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12396 else if (op == BRANCHJ) {
12397 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12398 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12400 PERL_UNUSED_VAR(shut_gcc_up);
12401 /* for now we always disable this optimisation * /
12402 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12403 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12409 regnode * ret_as_regnode = REGNODE_p(ret);
12410 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12411 ? regnext(ret_as_regnode)
12414 DEBUG_PARSE_MSG("NADA");
12415 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12416 NULL, pRExC_state);
12417 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12418 NULL, pRExC_state);
12419 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12420 SvPV_nolen_const(RExC_mysv1),
12421 (IV)REG_NODE_NUM(ret_as_regnode),
12422 SvPV_nolen_const(RExC_mysv2),
12428 if (OP(REGNODE_p(ender)) == TAIL) {
12430 RExC_emit= REGNODE_OFFSET(br) + 1;
12433 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12434 OP(opt)= OPTIMIZED;
12435 NEXT_OFF(br)= REGNODE_p(ender) - br;
12443 /* Even/odd or x=don't care: 010101x10x */
12444 static const char parens[] = "=!aA<,>Bbt";
12445 /* flag below is set to 0 up through 'A'; 1 for larger */
12447 if (paren && (p = strchr(parens, paren))) {
12448 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12449 int flag = (p - parens) > 3;
12451 if (paren == '>' || paren == 't') {
12452 node = SUSPEND, flag = 0;
12455 reginsert(pRExC_state, node, ret, depth+1);
12456 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12457 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12458 FLAGS(REGNODE_p(ret)) = flag;
12459 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12461 REQUIRE_BRANCHJ(flagp, 0);
12466 /* Check for proper termination. */
12468 /* restore original flags, but keep (?p) and, if we've encountered
12469 * something in the parse that changes /d rules into /u, keep the /u */
12470 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12471 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12472 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12474 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12475 RExC_parse = oregcomp_parse;
12476 vFAIL("Unmatched (");
12478 nextchar(pRExC_state);
12480 else if (!paren && RExC_parse < RExC_end) {
12481 if (*RExC_parse == ')') {
12483 vFAIL("Unmatched )");
12486 FAIL("Junk on end of regexp"); /* "Can't happen". */
12487 NOT_REACHED; /* NOTREACHED */
12490 if (RExC_in_lookbehind) {
12491 RExC_in_lookbehind--;
12493 if (RExC_in_lookahead) {
12494 RExC_in_lookahead--;
12496 if (after_freeze > RExC_npar)
12497 RExC_npar = after_freeze;
12502 - regbranch - one alternative of an | operator
12504 * Implements the concatenation operator.
12506 * On success, returns the offset at which any next node should be placed into
12507 * the regex engine program being compiled.
12509 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12510 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12513 STATIC regnode_offset
12514 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12516 regnode_offset ret;
12517 regnode_offset chain = 0;
12518 regnode_offset latest;
12519 I32 flags = 0, c = 0;
12520 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12522 PERL_ARGS_ASSERT_REGBRANCH;
12524 DEBUG_PARSE("brnc");
12529 if (RExC_use_BRANCHJ)
12530 ret = reganode(pRExC_state, BRANCHJ, 0);
12532 ret = reg_node(pRExC_state, BRANCH);
12533 Set_Node_Length(REGNODE_p(ret), 1);
12537 *flagp = 0; /* Initialize. */
12539 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12540 FALSE /* Don't force to /x */ );
12541 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12542 flags &= ~TRYAGAIN;
12543 latest = regpiece(pRExC_state, &flags, depth+1);
12545 if (flags & TRYAGAIN)
12547 RETURN_FAIL_ON_RESTART(flags, flagp);
12548 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12552 *flagp |= flags&(HASWIDTH|POSTPONED);
12554 /* FIXME adding one for every branch after the first is probably
12555 * excessive now we have TRIE support. (hv) */
12557 if (! REGTAIL(pRExC_state, chain, latest)) {
12558 /* XXX We could just redo this branch, but figuring out what
12559 * bookkeeping needs to be reset is a pain, and it's likely
12560 * that other branches that goto END will also be too large */
12561 REQUIRE_BRANCHJ(flagp, 0);
12567 if (chain == 0) { /* Loop ran zero times. */
12568 chain = reg_node(pRExC_state, NOTHING);
12573 *flagp |= flags&SIMPLE;
12580 - regpiece - something followed by possible quantifier * + ? {n,m}
12582 * Note that the branching code sequences used for ? and the general cases
12583 * of * and + are somewhat optimized: they use the same NOTHING node as
12584 * both the endmarker for their branch list and the body of the last branch.
12585 * It might seem that this node could be dispensed with entirely, but the
12586 * endmarker role is not redundant.
12588 * On success, returns the offset at which any next node should be placed into
12589 * the regex engine program being compiled.
12591 * Returns 0 otherwise, with *flagp set to indicate why:
12592 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12593 * RESTART_PARSE if the parse needs to be restarted, or'd with
12594 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12596 STATIC regnode_offset
12597 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12599 regnode_offset ret;
12603 const char * const origparse = RExC_parse;
12605 I32 max = REG_INFTY;
12606 #ifdef RE_TRACK_PATTERN_OFFSETS
12609 const char *maxpos = NULL;
12612 /* Save the original in case we change the emitted regop to a FAIL. */
12613 const regnode_offset orig_emit = RExC_emit;
12615 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12617 PERL_ARGS_ASSERT_REGPIECE;
12619 DEBUG_PARSE("piec");
12621 ret = regatom(pRExC_state, &flags, depth+1);
12623 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12624 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12627 if (! ISMULT2(RExC_parse)) {
12632 /* Here we know the input is a legal quantifier, including {m,n} */
12636 #ifdef RE_TRACK_PATTERN_OFFSETS
12637 parse_start = RExC_parse;
12641 nextchar(pRExC_state);
12646 else if (op == '+') {
12649 else if (op == '?') {
12653 else { /* is '{' */
12654 const char* endptr;
12657 next = RExC_parse + 1;
12658 while (isDIGIT(*next) || *next == ',') {
12659 if (*next == ',') {
12668 assert(*next == '}');
12673 if (isDIGIT(*RExC_parse)) {
12675 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12676 vFAIL("Invalid quantifier in {,}");
12677 if (uv >= REG_INFTY)
12678 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12683 if (*maxpos == ',')
12686 maxpos = RExC_parse;
12687 if (isDIGIT(*maxpos)) {
12689 if (!grok_atoUV(maxpos, &uv, &endptr))
12690 vFAIL("Invalid quantifier in {,}");
12691 if (uv >= REG_INFTY)
12692 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12695 max = REG_INFTY; /* meaning "infinity" */
12698 nextchar(pRExC_state);
12699 if (max < min) { /* If can't match, warn and optimize to fail
12701 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12702 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12703 NEXT_OFF(REGNODE_p(orig_emit)) =
12704 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12707 else if (min == max && *RExC_parse == '?')
12709 ckWARN2reg(RExC_parse + 1,
12710 "Useless use of greediness modifier '%c'",
12715 /* Here we have a quantifier, and have calculated 'min' and 'max'.
12717 * Check and possibly adjust a zero width operand */
12718 if (! (flags & (HASWIDTH|POSTPONED))) {
12719 if (max > REG_INFTY/3) {
12720 if (origparse[0] == '\\' && origparse[1] == 'K') {
12722 "%" UTF8f " is forbidden - matches null string"
12724 UTF8fARG(UTF, (RExC_parse >= origparse
12725 ? RExC_parse - origparse
12729 ckWARN2reg(RExC_parse,
12730 "%" UTF8f " matches null string many times",
12731 UTF8fARG(UTF, (RExC_parse >= origparse
12732 ? RExC_parse - origparse
12738 /* There's no point in trying to match something 0 length more than
12739 * once except for extra side effects, which we don't have here since
12749 /* If this is a code block pass it up */
12750 *flagp |= (flags & POSTPONED);
12753 *flagp |= (flags & HASWIDTH);
12755 if ((flags&SIMPLE)) {
12756 if (min == 0 && max == REG_INFTY) {
12758 /* Going from 0..inf is currently forbidden in wildcard
12759 * subpatterns. The only reason is to make it harder to
12760 * write patterns that take a long long time to halt, and
12761 * because the use of this construct isn't necessary in
12762 * matching Unicode property values */
12763 if (RExC_pm_flags & PMf_WILDCARD) {
12765 /* diag_listed_as: Use of %s is not allowed in Unicode
12766 property wildcard subpatterns in regex; marked by
12767 <-- HERE in m/%s/ */
12768 vFAIL("Use of quantifier '*' is not allowed in"
12769 " Unicode property wildcard subpatterns");
12770 /* Note, don't need to worry about {0,}, as a '}' isn't
12771 * legal at all in wildcards, so wouldn't get this far
12774 reginsert(pRExC_state, STAR, ret, depth+1);
12776 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12779 if (min == 1 && max == REG_INFTY) {
12780 reginsert(pRExC_state, PLUS, ret, depth+1);
12782 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12785 MARK_NAUGHTY_EXP(2, 2);
12786 reginsert(pRExC_state, CURLY, ret, depth+1);
12787 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12788 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12791 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12793 FLAGS(REGNODE_p(w)) = 0;
12794 if (! REGTAIL(pRExC_state, ret, w)) {
12795 REQUIRE_BRANCHJ(flagp, 0);
12797 if (RExC_use_BRANCHJ) {
12798 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12799 reginsert(pRExC_state, NOTHING, ret, depth+1);
12800 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12802 reginsert(pRExC_state, CURLYX, ret, depth+1);
12804 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12805 Set_Node_Length(REGNODE_p(ret),
12806 op == '{' ? (RExC_parse - parse_start) : 1);
12808 if (RExC_use_BRANCHJ)
12809 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12811 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12814 REQUIRE_BRANCHJ(flagp, 0);
12816 RExC_whilem_seen++;
12817 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12819 FLAGS(REGNODE_p(ret)) = 0;
12821 ARG1_SET(REGNODE_p(ret), (U16)min);
12822 ARG2_SET(REGNODE_p(ret), (U16)max);
12823 if (max == REG_INFTY)
12824 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12828 if (*RExC_parse == '?') {
12829 nextchar(pRExC_state);
12830 reginsert(pRExC_state, MINMOD, ret, depth+1);
12831 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12832 REQUIRE_BRANCHJ(flagp, 0);
12835 else if (*RExC_parse == '+') {
12836 regnode_offset ender;
12837 nextchar(pRExC_state);
12838 ender = reg_node(pRExC_state, SUCCEED);
12839 if (! REGTAIL(pRExC_state, ret, ender)) {
12840 REQUIRE_BRANCHJ(flagp, 0);
12842 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12843 ender = reg_node(pRExC_state, TAIL);
12844 if (! REGTAIL(pRExC_state, ret, ender)) {
12845 REQUIRE_BRANCHJ(flagp, 0);
12849 if (ISMULT2(RExC_parse)) {
12851 vFAIL("Nested quantifiers");
12858 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12859 regnode_offset * node_p,
12867 /* This routine teases apart the various meanings of \N and returns
12868 * accordingly. The input parameters constrain which meaning(s) is/are valid
12869 * in the current context.
12871 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12873 * If <code_point_p> is not NULL, the context is expecting the result to be a
12874 * single code point. If this \N instance turns out to a single code point,
12875 * the function returns TRUE and sets *code_point_p to that code point.
12877 * If <node_p> is not NULL, the context is expecting the result to be one of
12878 * the things representable by a regnode. If this \N instance turns out to be
12879 * one such, the function generates the regnode, returns TRUE and sets *node_p
12880 * to point to the offset of that regnode into the regex engine program being
12883 * If this instance of \N isn't legal in any context, this function will
12884 * generate a fatal error and not return.
12886 * On input, RExC_parse should point to the first char following the \N at the
12887 * time of the call. On successful return, RExC_parse will have been updated
12888 * to point to just after the sequence identified by this routine. Also
12889 * *flagp has been updated as needed.
12891 * When there is some problem with the current context and this \N instance,
12892 * the function returns FALSE, without advancing RExC_parse, nor setting
12893 * *node_p, nor *code_point_p, nor *flagp.
12895 * If <cp_count> is not NULL, the caller wants to know the length (in code
12896 * points) that this \N sequence matches. This is set, and the input is
12897 * parsed for errors, even if the function returns FALSE, as detailed below.
12899 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12901 * Probably the most common case is for the \N to specify a single code point.
12902 * *cp_count will be set to 1, and *code_point_p will be set to that code
12905 * Another possibility is for the input to be an empty \N{}. This is no
12906 * longer accepted, and will generate a fatal error.
12908 * Another possibility is for a custom charnames handler to be in effect which
12909 * translates the input name to an empty string. *cp_count will be set to 0.
12910 * *node_p will be set to a generated NOTHING node.
12912 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12913 * set to 0. *node_p will be set to a generated REG_ANY node.
12915 * The fifth possibility is that \N resolves to a sequence of more than one
12916 * code points. *cp_count will be set to the number of code points in the
12917 * sequence. *node_p will be set to a generated node returned by this
12918 * function calling S_reg().
12920 * The final possibility is that it is premature to be calling this function;
12921 * the parse needs to be restarted. This can happen when this changes from
12922 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12923 * latter occurs only when the fifth possibility would otherwise be in
12924 * effect, and is because one of those code points requires the pattern to be
12925 * recompiled as UTF-8. The function returns FALSE, and sets the
12926 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12927 * happens, the caller needs to desist from continuing parsing, and return
12928 * this information to its caller. This is not set for when there is only one
12929 * code point, as this can be called as part of an ANYOF node, and they can
12930 * store above-Latin1 code points without the pattern having to be in UTF-8.
12932 * For non-single-quoted regexes, the tokenizer has resolved character and
12933 * sequence names inside \N{...} into their Unicode values, normalizing the
12934 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12935 * hex-represented code points in the sequence. This is done there because
12936 * the names can vary based on what charnames pragma is in scope at the time,
12937 * so we need a way to take a snapshot of what they resolve to at the time of
12938 * the original parse. [perl #56444].
12940 * That parsing is skipped for single-quoted regexes, so here we may get
12941 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12942 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12943 * the native character set for non-ASCII platforms. The other possibilities
12944 * are already native, so no translation is done. */
12946 char * endbrace; /* points to '}' following the name */
12947 char* p = RExC_parse; /* Temporary */
12949 SV * substitute_parse = NULL;
12954 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12956 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12958 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12959 assert(! (node_p && cp_count)); /* At most 1 should be set */
12961 if (cp_count) { /* Initialize return for the most common case */
12965 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12966 * modifier. The other meanings do not, so use a temporary until we find
12967 * out which we are being called with */
12968 skip_to_be_ignored_text(pRExC_state, &p,
12969 FALSE /* Don't force to /x */ );
12971 /* Disambiguate between \N meaning a named character versus \N meaning
12972 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12973 * quantifier, or if there is no '{' at all */
12974 if (*p != '{' || regcurly(p)) {
12984 *node_p = reg_node(pRExC_state, REG_ANY);
12985 *flagp |= HASWIDTH|SIMPLE;
12987 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12991 /* The test above made sure that the next real character is a '{', but
12992 * under the /x modifier, it could be separated by space (or a comment and
12993 * \n) and this is not allowed (for consistency with \x{...} and the
12994 * tokenizer handling of \N{NAME}). */
12995 if (*RExC_parse != '{') {
12996 vFAIL("Missing braces on \\N{}");
12999 RExC_parse++; /* Skip past the '{' */
13001 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13002 if (! endbrace) { /* no trailing brace */
13003 vFAIL2("Missing right brace on \\%c{}", 'N');
13006 /* Here, we have decided it should be a named character or sequence. These
13007 * imply Unicode semantics */
13008 REQUIRE_UNI_RULES(flagp, FALSE);
13010 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13011 * nothing at all (not allowed under strict) */
13012 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13013 RExC_parse = endbrace;
13015 RExC_parse++; /* Position after the "}" */
13016 vFAIL("Zero length \\N{}");
13022 nextchar(pRExC_state);
13027 *node_p = reg_node(pRExC_state, NOTHING);
13031 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13033 /* Here, the name isn't of the form U+.... This can happen if the
13034 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13035 * is the time to find out what the name means */
13037 const STRLEN name_len = endbrace - RExC_parse;
13038 SV * value_sv; /* What does this name evaluate to */
13040 const U8 * value; /* string of name's value */
13041 STRLEN value_len; /* and its length */
13043 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13044 * toke.c, and their values. Make sure is initialized */
13045 if (! RExC_unlexed_names) {
13046 RExC_unlexed_names = newHV();
13049 /* If we have already seen this name in this pattern, use that. This
13050 * allows us to only call the charnames handler once per name per
13051 * pattern. A broken or malicious handler could return something
13052 * different each time, which could cause the results to vary depending
13053 * on if something gets added or subtracted from the pattern that
13054 * causes the number of passes to change, for example */
13055 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13058 value_sv = *value_svp;
13060 else { /* Otherwise we have to go out and get the name */
13061 const char * error_msg = NULL;
13062 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13066 RExC_parse = endbrace;
13070 /* If no error message, should have gotten a valid return */
13073 /* Save the name's meaning for later use */
13074 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13077 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13081 /* Here, we have the value the name evaluates to in 'value_sv' */
13082 value = (U8 *) SvPV(value_sv, value_len);
13084 /* See if the result is one code point vs 0 or multiple */
13085 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13089 /* Here, exactly one code point. If that isn't what is wanted,
13091 if (! code_point_p) {
13096 /* Convert from string to numeric code point */
13097 *code_point_p = (SvUTF8(value_sv))
13098 ? valid_utf8_to_uvchr(value, NULL)
13101 /* Have parsed this entire single code point \N{...}. *cp_count
13102 * has already been set to 1, so don't do it again. */
13103 RExC_parse = endbrace;
13104 nextchar(pRExC_state);
13106 } /* End of is a single code point */
13108 /* Count the code points, if caller desires. The API says to do this
13109 * even if we will later return FALSE */
13113 *cp_count = (SvUTF8(value_sv))
13114 ? utf8_length(value, value + value_len)
13118 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13119 * But don't back the pointer up if the caller wants to know how many
13120 * code points there are (they need to handle it themselves in this
13129 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13130 * reg recursively to parse it. That way, it retains its atomicness,
13131 * while not having to worry about any special handling that some code
13132 * points may have. */
13134 substitute_parse = newSVpvs("?:");
13135 sv_catsv(substitute_parse, value_sv);
13136 sv_catpv(substitute_parse, ")");
13138 /* The value should already be native, so no need to convert on EBCDIC
13140 assert(! RExC_recode_x_to_native);
13143 else { /* \N{U+...} */
13144 Size_t count = 0; /* code point count kept internally */
13146 /* We can get to here when the input is \N{U+...} or when toke.c has
13147 * converted a name to the \N{U+...} form. This include changing a
13148 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13150 RExC_parse += 2; /* Skip past the 'U+' */
13152 /* Code points are separated by dots. The '}' terminates the whole
13155 do { /* Loop until the ending brace */
13156 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13157 | PERL_SCAN_SILENT_ILLDIGIT
13158 | PERL_SCAN_NOTIFY_ILLDIGIT
13159 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13160 | PERL_SCAN_DISALLOW_PREFIX;
13161 STRLEN len = endbrace - RExC_parse;
13163 char * start_digit = RExC_parse;
13164 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13169 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13174 if (cp > MAX_LEGAL_CP) {
13175 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13178 if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13183 /* Here, is a single code point; fail if doesn't want that */
13184 if (! code_point_p) {
13189 /* A single code point is easy to handle; just return it */
13190 *code_point_p = UNI_TO_NATIVE(cp);
13191 RExC_parse = endbrace;
13192 nextchar(pRExC_state);
13196 /* Here, the parse stopped bfore the ending brace. This is legal
13197 * only if that character is a dot separating code points, like a
13198 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13199 * So the next character must be a dot (and the one after that
13200 * can't be the endbrace, or we'd have something like \N{U+100.} )
13202 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13203 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13204 ? UTF8SKIP(RExC_parse)
13206 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13211 /* Here, looks like its really a multiple character sequence. Fail
13212 * if that's not what the caller wants. But continue with counting
13213 * and error checking if they still want a count */
13214 if (! node_p && ! cp_count) {
13218 /* What is done here is to convert this to a sub-pattern of the
13219 * form \x{char1}\x{char2}... and then call reg recursively to
13220 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13221 * atomicness, while not having to worry about special handling
13222 * that some code points may have. We don't create a subpattern,
13223 * but go through the motions of code point counting and error
13224 * checking, if the caller doesn't want a node returned. */
13226 if (node_p && ! substitute_parse) {
13227 substitute_parse = newSVpvs("?:");
13233 /* Convert to notation the rest of the code understands */
13234 sv_catpvs(substitute_parse, "\\x{");
13235 sv_catpvn(substitute_parse, start_digit,
13236 RExC_parse - start_digit);
13237 sv_catpvs(substitute_parse, "}");
13240 /* Move to after the dot (or ending brace the final time through.)
13245 } while (RExC_parse < endbrace);
13247 if (! node_p) { /* Doesn't want the node */
13254 sv_catpvs(substitute_parse, ")");
13256 /* The values are Unicode, and therefore have to be converted to native
13257 * on a non-Unicode (meaning non-ASCII) platform. */
13258 SET_recode_x_to_native(1);
13261 /* Here, we have the string the name evaluates to, ready to be parsed,
13262 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13263 * constructs. This can be called from within a substitute parse already.
13264 * The error reporting mechanism doesn't work for 2 levels of this, but the
13265 * code above has validated this new construct, so there should be no
13266 * errors generated by the below. And this isn' an exact copy, so the
13267 * mechanism to seamlessly deal with this won't work, so turn off warnings
13269 save_start = RExC_start;
13270 orig_end = RExC_end;
13272 RExC_parse = RExC_start = SvPVX(substitute_parse);
13273 RExC_end = RExC_parse + SvCUR(substitute_parse);
13274 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13276 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13278 /* Restore the saved values */
13280 RExC_start = save_start;
13281 RExC_parse = endbrace;
13282 RExC_end = orig_end;
13283 SET_recode_x_to_native(0);
13285 SvREFCNT_dec_NN(substitute_parse);
13288 RETURN_FAIL_ON_RESTART(flags, flagp);
13289 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13292 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13294 nextchar(pRExC_state);
13301 S_compute_EXACTish(RExC_state_t *pRExC_state)
13305 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13313 op = get_regex_charset(RExC_flags);
13314 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13315 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13316 been, so there is no hole */
13319 return op + EXACTF;
13323 S_new_regcurly(const char *s, const char *e)
13325 /* This is a temporary function designed to match the most lenient form of
13326 * a {m,n} quantifier we ever envision, with either number omitted, and
13327 * spaces anywhere between/before/after them.
13329 * If this function fails, then the string it matches is very unlikely to
13330 * ever be considered a valid quantifier, so we can allow the '{' that
13331 * begins it to be considered as a literal */
13333 bool has_min = FALSE;
13334 bool has_max = FALSE;
13336 PERL_ARGS_ASSERT_NEW_REGCURLY;
13338 if (s >= e || *s++ != '{')
13341 while (s < e && isSPACE(*s)) {
13344 while (s < e && isDIGIT(*s)) {
13348 while (s < e && isSPACE(*s)) {
13354 while (s < e && isSPACE(*s)) {
13357 while (s < e && isDIGIT(*s)) {
13361 while (s < e && isSPACE(*s)) {
13366 return s < e && *s == '}' && (has_min || has_max);
13369 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13370 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13373 S_backref_value(char *p, char *e)
13375 const char* endptr = e;
13377 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13384 - regatom - the lowest level
13386 Try to identify anything special at the start of the current parse position.
13387 If there is, then handle it as required. This may involve generating a
13388 single regop, such as for an assertion; or it may involve recursing, such as
13389 to handle a () structure.
13391 If the string doesn't start with something special then we gobble up
13392 as much literal text as we can. If we encounter a quantifier, we have to
13393 back off the final literal character, as that quantifier applies to just it
13394 and not to the whole string of literals.
13396 Once we have been able to handle whatever type of thing started the
13397 sequence, we return the offset into the regex engine program being compiled
13398 at which any next regnode should be placed.
13400 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13401 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13402 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13403 Otherwise does not return 0.
13405 Note: we have to be careful with escapes, as they can be both literal
13406 and special, and in the case of \10 and friends, context determines which.
13408 A summary of the code structure is:
13410 switch (first_byte) {
13411 cases for each special:
13412 handle this special;
13415 switch (2nd byte) {
13416 cases for each unambiguous special:
13417 handle this special;
13419 cases for each ambigous special/literal:
13421 if (special) handle here
13423 default: // unambiguously literal:
13426 default: // is a literal char
13429 create EXACTish node for literal;
13430 while (more input and node isn't full) {
13431 switch (input_byte) {
13432 cases for each special;
13433 make sure parse pointer is set so that the next call to
13434 regatom will see this special first
13435 goto loopdone; // EXACTish node terminated by prev. char
13437 append char to EXACTISH node;
13439 get next input byte;
13443 return the generated node;
13445 Specifically there are two separate switches for handling
13446 escape sequences, with the one for handling literal escapes requiring
13447 a dummy entry for all of the special escapes that are actually handled
13452 STATIC regnode_offset
13453 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13455 regnode_offset ret = 0;
13461 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13463 *flagp = 0; /* Initialize. */
13465 DEBUG_PARSE("atom");
13467 PERL_ARGS_ASSERT_REGATOM;
13470 parse_start = RExC_parse;
13471 assert(RExC_parse < RExC_end);
13472 switch ((U8)*RExC_parse) {
13474 RExC_seen_zerolen++;
13475 nextchar(pRExC_state);
13476 if (RExC_flags & RXf_PMf_MULTILINE)
13477 ret = reg_node(pRExC_state, MBOL);
13479 ret = reg_node(pRExC_state, SBOL);
13480 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13483 nextchar(pRExC_state);
13485 RExC_seen_zerolen++;
13486 if (RExC_flags & RXf_PMf_MULTILINE)
13487 ret = reg_node(pRExC_state, MEOL);
13489 ret = reg_node(pRExC_state, SEOL);
13490 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13493 nextchar(pRExC_state);
13494 if (RExC_flags & RXf_PMf_SINGLELINE)
13495 ret = reg_node(pRExC_state, SANY);
13497 ret = reg_node(pRExC_state, REG_ANY);
13498 *flagp |= HASWIDTH|SIMPLE;
13500 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13504 char * const oregcomp_parse = ++RExC_parse;
13505 ret = regclass(pRExC_state, flagp, depth+1,
13506 FALSE, /* means parse the whole char class */
13507 TRUE, /* allow multi-char folds */
13508 FALSE, /* don't silence non-portable warnings. */
13509 (bool) RExC_strict,
13510 TRUE, /* Allow an optimized regnode result */
13513 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13514 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13517 if (*RExC_parse != ']') {
13518 RExC_parse = oregcomp_parse;
13519 vFAIL("Unmatched [");
13521 nextchar(pRExC_state);
13522 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13526 nextchar(pRExC_state);
13527 ret = reg(pRExC_state, 2, &flags, depth+1);
13529 if (flags & TRYAGAIN) {
13530 if (RExC_parse >= RExC_end) {
13531 /* Make parent create an empty node if needed. */
13532 *flagp |= TRYAGAIN;
13537 RETURN_FAIL_ON_RESTART(flags, flagp);
13538 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13541 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13545 if (flags & TRYAGAIN) {
13546 *flagp |= TRYAGAIN;
13549 vFAIL("Internal urp");
13550 /* Supposed to be caught earlier. */
13556 vFAIL("Quantifier follows nothing");
13561 This switch handles escape sequences that resolve to some kind
13562 of special regop and not to literal text. Escape sequences that
13563 resolve to literal text are handled below in the switch marked
13566 Every entry in this switch *must* have a corresponding entry
13567 in the literal escape switch. However, the opposite is not
13568 required, as the default for this switch is to jump to the
13569 literal text handling code.
13572 switch ((U8)*RExC_parse) {
13573 /* Special Escapes */
13575 RExC_seen_zerolen++;
13576 /* Under wildcards, this is changed to match \n; should be
13577 * invisible to the user, as they have to compile under /m */
13578 if (RExC_pm_flags & PMf_WILDCARD) {
13579 ret = reg_node(pRExC_state, MBOL);
13582 ret = reg_node(pRExC_state, SBOL);
13583 /* SBOL is shared with /^/ so we set the flags so we can tell
13584 * /\A/ from /^/ in split. */
13585 FLAGS(REGNODE_p(ret)) = 1;
13586 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13588 goto finish_meta_pat;
13590 if (RExC_pm_flags & PMf_WILDCARD) {
13592 /* diag_listed_as: Use of %s is not allowed in Unicode property
13593 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13595 vFAIL("Use of '\\G' is not allowed in Unicode property"
13596 " wildcard subpatterns");
13598 ret = reg_node(pRExC_state, GPOS);
13599 RExC_seen |= REG_GPOS_SEEN;
13600 goto finish_meta_pat;
13602 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13603 RExC_seen_zerolen++;
13604 ret = reg_node(pRExC_state, KEEPS);
13605 /* XXX:dmq : disabling in-place substitution seems to
13606 * be necessary here to avoid cases of memory corruption, as
13607 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13609 RExC_seen |= REG_LOOKBEHIND_SEEN;
13610 goto finish_meta_pat;
13613 ++RExC_parse; /* advance past the 'K' */
13614 vFAIL("\\K not permitted in lookahead/lookbehind");
13617 if (RExC_pm_flags & PMf_WILDCARD) {
13618 /* See comment under \A above */
13619 ret = reg_node(pRExC_state, MEOL);
13622 ret = reg_node(pRExC_state, SEOL);
13623 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13625 RExC_seen_zerolen++; /* Do not optimize RE away */
13626 goto finish_meta_pat;
13628 if (RExC_pm_flags & PMf_WILDCARD) {
13629 /* See comment under \A above */
13630 ret = reg_node(pRExC_state, MEOL);
13633 ret = reg_node(pRExC_state, EOS);
13634 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13636 RExC_seen_zerolen++; /* Do not optimize RE away */
13637 goto finish_meta_pat;
13639 vFAIL("\\C no longer supported");
13641 ret = reg_node(pRExC_state, CLUMP);
13642 *flagp |= HASWIDTH;
13643 goto finish_meta_pat;
13651 regex_charset charset = get_regex_charset(RExC_flags);
13653 RExC_seen_zerolen++;
13654 RExC_seen |= REG_LOOKBEHIND_SEEN;
13655 op = BOUND + charset;
13657 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13658 flags = TRADITIONAL_BOUND;
13659 if (op > BOUNDA) { /* /aa is same as /a */
13665 char name = *RExC_parse;
13666 char * endbrace = NULL;
13668 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13671 vFAIL2("Missing right brace on \\%c{}", name);
13673 /* XXX Need to decide whether to take spaces or not. Should be
13674 * consistent with \p{}, but that currently is SPACE, which
13675 * means vertical too, which seems wrong
13676 * while (isBLANK(*RExC_parse)) {
13679 if (endbrace == RExC_parse) {
13680 RExC_parse++; /* After the '}' */
13681 vFAIL2("Empty \\%c{}", name);
13683 length = endbrace - RExC_parse;
13684 /*while (isBLANK(*(RExC_parse + length - 1))) {
13687 switch (*RExC_parse) {
13690 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13692 goto bad_bound_type;
13697 if (length != 2 || *(RExC_parse + 1) != 'b') {
13698 goto bad_bound_type;
13703 if (length != 2 || *(RExC_parse + 1) != 'b') {
13704 goto bad_bound_type;
13709 if (length != 2 || *(RExC_parse + 1) != 'b') {
13710 goto bad_bound_type;
13716 RExC_parse = endbrace;
13718 "'%" UTF8f "' is an unknown bound type",
13719 UTF8fARG(UTF, length, endbrace - length));
13720 NOT_REACHED; /*NOTREACHED*/
13722 RExC_parse = endbrace;
13723 REQUIRE_UNI_RULES(flagp, 0);
13728 else if (op >= BOUNDA) { /* /aa is same as /a */
13732 /* Don't have to worry about UTF-8, in this message because
13733 * to get here the contents of the \b must be ASCII */
13734 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13735 "Using /u for '%.*s' instead of /%s",
13737 endbrace - length + 1,
13738 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13739 ? ASCII_RESTRICT_PAT_MODS
13740 : ASCII_MORE_RESTRICT_PAT_MODS);
13745 RExC_seen_d_op = TRUE;
13747 else if (op == BOUNDL) {
13748 RExC_contains_locale = 1;
13752 op += NBOUND - BOUND;
13755 ret = reg_node(pRExC_state, op);
13756 FLAGS(REGNODE_p(ret)) = flags;
13758 goto finish_meta_pat;
13762 ret = reg_node(pRExC_state, LNBREAK);
13763 *flagp |= HASWIDTH|SIMPLE;
13764 goto finish_meta_pat;
13778 /* These all have the same meaning inside [brackets], and it knows
13779 * how to do the best optimizations for them. So, pretend we found
13780 * these within brackets, and let it do the work */
13783 ret = regclass(pRExC_state, flagp, depth+1,
13784 TRUE, /* means just parse this element */
13785 FALSE, /* don't allow multi-char folds */
13786 FALSE, /* don't silence non-portable warnings. It
13787 would be a bug if these returned
13789 (bool) RExC_strict,
13790 TRUE, /* Allow an optimized regnode result */
13792 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13793 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13794 * multi-char folds are allowed. */
13796 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13799 RExC_parse--; /* regclass() leaves this one too far ahead */
13802 /* The escapes above that don't take a parameter can't be
13803 * followed by a '{'. But 'pX', 'p{foo}' and
13804 * correspondingly 'P' can be */
13805 if ( RExC_parse - parse_start == 1
13806 && UCHARAT(RExC_parse + 1) == '{'
13807 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13810 vFAIL("Unescaped left brace in regex is illegal here");
13812 Set_Node_Offset(REGNODE_p(ret), parse_start);
13813 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13814 nextchar(pRExC_state);
13817 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13818 * \N{...} evaluates to a sequence of more than one code points).
13819 * The function call below returns a regnode, which is our result.
13820 * The parameters cause it to fail if the \N{} evaluates to a
13821 * single code point; we handle those like any other literal. The
13822 * reason that the multicharacter case is handled here and not as
13823 * part of the EXACtish code is because of quantifiers. In
13824 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13825 * this way makes that Just Happen. dmq.
13826 * join_exact() will join this up with adjacent EXACTish nodes
13827 * later on, if appropriate. */
13829 if (grok_bslash_N(pRExC_state,
13830 &ret, /* Want a regnode returned */
13831 NULL, /* Fail if evaluates to a single code
13833 NULL, /* Don't need a count of how many code
13842 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13844 /* Here, evaluates to a single code point. Go get that */
13845 RExC_parse = parse_start;
13848 case 'k': /* Handle \k<NAME> and \k'NAME' */
13852 if ( RExC_parse >= RExC_end - 1
13853 || (( ch = RExC_parse[1]) != '<'
13858 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13859 vFAIL2("Sequence %.2s... not terminated", parse_start);
13862 ret = handle_named_backref(pRExC_state,
13874 case '1': case '2': case '3': case '4':
13875 case '5': case '6': case '7': case '8': case '9':
13880 if (*RExC_parse == 'g') {
13884 if (*RExC_parse == '{') {
13888 if (*RExC_parse == '-') {
13892 if (hasbrace && !isDIGIT(*RExC_parse)) {
13893 if (isrel) RExC_parse--;
13895 goto parse_named_seq;
13898 if (RExC_parse >= RExC_end) {
13899 goto unterminated_g;
13901 num = S_backref_value(RExC_parse, RExC_end);
13903 vFAIL("Reference to invalid group 0");
13904 else if (num == I32_MAX) {
13905 if (isDIGIT(*RExC_parse))
13906 vFAIL("Reference to nonexistent group");
13909 vFAIL("Unterminated \\g... pattern");
13913 num = RExC_npar - num;
13915 vFAIL("Reference to nonexistent or unclosed group");
13919 num = S_backref_value(RExC_parse, RExC_end);
13920 /* bare \NNN might be backref or octal - if it is larger
13921 * than or equal RExC_npar then it is assumed to be an
13922 * octal escape. Note RExC_npar is +1 from the actual
13923 * number of parens. */
13924 /* Note we do NOT check if num == I32_MAX here, as that is
13925 * handled by the RExC_npar check */
13928 /* any numeric escape < 10 is always a backref */
13930 /* any numeric escape < RExC_npar is a backref */
13931 && num >= RExC_npar
13932 /* cannot be an octal escape if it starts with [89] */
13933 && ! inRANGE(*RExC_parse, '8', '9')
13935 /* Probably not meant to be a backref, instead likely
13936 * to be an octal character escape, e.g. \35 or \777.
13937 * The above logic should make it obvious why using
13938 * octal escapes in patterns is problematic. - Yves */
13939 RExC_parse = parse_start;
13944 /* At this point RExC_parse points at a numeric escape like
13945 * \12 or \88 or something similar, which we should NOT treat
13946 * as an octal escape. It may or may not be a valid backref
13947 * escape. For instance \88888888 is unlikely to be a valid
13949 while (isDIGIT(*RExC_parse))
13952 if (*RExC_parse != '}')
13953 vFAIL("Unterminated \\g{...} pattern");
13956 if (num >= (I32)RExC_npar) {
13958 /* It might be a forward reference; we can't fail until we
13959 * know, by completing the parse to get all the groups, and
13960 * then reparsing */
13961 if (ALL_PARENS_COUNTED) {
13962 if (num >= RExC_total_parens) {
13963 vFAIL("Reference to nonexistent group");
13967 REQUIRE_PARENS_PASS;
13971 ret = reganode(pRExC_state,
13974 : (ASCII_FOLD_RESTRICTED)
13976 : (AT_LEAST_UNI_SEMANTICS)
13982 if (OP(REGNODE_p(ret)) == REFF) {
13983 RExC_seen_d_op = TRUE;
13985 *flagp |= HASWIDTH;
13987 /* override incorrect value set in reganode MJD */
13988 Set_Node_Offset(REGNODE_p(ret), parse_start);
13989 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13990 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13991 FALSE /* Don't force to /x */ );
13995 if (RExC_parse >= RExC_end)
13996 FAIL("Trailing \\");
13999 /* Do not generate "unrecognized" warnings here, we fall
14000 back into the quick-grab loop below */
14001 RExC_parse = parse_start;
14003 } /* end of switch on a \foo sequence */
14008 /* '#' comments should have been spaced over before this function was
14010 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14012 if (RExC_flags & RXf_PMf_EXTENDED) {
14013 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14014 if (RExC_parse < RExC_end)
14024 /* Here, we have determined that the next thing is probably a
14025 * literal character. RExC_parse points to the first byte of its
14026 * definition. (It still may be an escape sequence that evaluates
14027 * to a single character) */
14032 char *s, *old_s = NULL, *old_old_s = NULL;
14034 U32 max_string_len = 255;
14036 /* We may have to reparse the node, artificially stopping filling
14037 * it early, based on info gleaned in the first parse. This
14038 * variable gives where we stop. Make it above the normal stopping
14039 * place first time through; otherwise it would stop too early */
14040 U32 upper_fill = max_string_len + 1;
14042 /* We start out as an EXACT node, even if under /i, until we find a
14043 * character which is in a fold. The algorithm now segregates into
14044 * separate nodes, characters that fold from those that don't under
14045 * /i. (This hopefully will create nodes that are fixed strings
14046 * even under /i, giving the optimizer something to grab on to.)
14047 * So, if a node has something in it and the next character is in
14048 * the opposite category, that node is closed up, and the function
14049 * returns. Then regatom is called again, and a new node is
14050 * created for the new category. */
14051 U8 node_type = EXACT;
14053 /* Assume the node will be fully used; the excess is given back at
14054 * the end. Under /i, we may need to temporarily add the fold of
14055 * an extra character or two at the end to check for splitting
14056 * multi-char folds, so allocate extra space for that. We can't
14057 * make any other length assumptions, as a byte input sequence
14058 * could shrink down. */
14059 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14063 ? UTF8_MAXBYTES_CASE
14064 /* Max non-UTF-8 expansion is 2 */ : 2)));
14066 bool next_is_quantifier;
14067 char * oldp = NULL;
14069 /* We can convert EXACTF nodes to EXACTFU if they contain only
14070 * characters that match identically regardless of the target
14071 * string's UTF8ness. The reason to do this is that EXACTF is not
14072 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14075 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14076 * contain only above-Latin1 characters (hence must be in UTF8),
14077 * which don't participate in folds with Latin1-range characters,
14078 * as the latter's folds aren't known until runtime. */
14079 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14081 /* Single-character EXACTish nodes are almost always SIMPLE. This
14082 * allows us to override this as encountered */
14083 U8 maybe_SIMPLE = SIMPLE;
14085 /* Does this node contain something that can't match unless the
14086 * target string is (also) in UTF-8 */
14087 bool requires_utf8_target = FALSE;
14089 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14090 bool has_ss = FALSE;
14092 /* So is the MICRO SIGN */
14093 bool has_micro_sign = FALSE;
14095 /* Set when we fill up the current node and there is still more
14096 * text to process */
14099 /* Allocate an EXACT node. The node_type may change below to
14100 * another EXACTish node, but since the size of the node doesn't
14101 * change, it works */
14102 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14104 FILL_NODE(ret, node_type);
14107 s = STRING(REGNODE_p(ret));
14118 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14119 maybe_SIMPLE = SIMPLE;
14120 requires_utf8_target = FALSE;
14122 has_micro_sign = FALSE;
14126 /* This breaks under rare circumstances. If folding, we do not
14127 * want to split a node at a character that is a non-final in a
14128 * multi-char fold, as an input string could just happen to want to
14129 * match across the node boundary. The code at the end of the loop
14130 * looks for this, and backs off until it finds not such a
14131 * character, but it is possible (though extremely, extremely
14132 * unlikely) for all characters in the node to be non-final fold
14133 * ones, in which case we just leave the node fully filled, and
14134 * hope that it doesn't match the string in just the wrong place */
14136 assert( ! UTF /* Is at the beginning of a character */
14137 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14138 || UTF8_IS_START(UCHARAT(RExC_parse)));
14140 overflowed = FALSE;
14142 /* Here, we have a literal character. Find the maximal string of
14143 * them in the input that we can fit into a single EXACTish node.
14144 * We quit at the first non-literal or when the node gets full, or
14145 * under /i the categorization of folding/non-folding character
14147 while (p < RExC_end && len < upper_fill) {
14149 /* In most cases each iteration adds one byte to the output.
14150 * The exceptions override this */
14151 Size_t added_len = 1;
14157 /* White space has already been ignored */
14158 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14159 || ! is_PATWS_safe((p), RExC_end, UTF));
14162 const char* message;
14175 /* Literal Escapes Switch
14177 This switch is meant to handle escape sequences that
14178 resolve to a literal character.
14180 Every escape sequence that represents something
14181 else, like an assertion or a char class, is handled
14182 in the switch marked 'Special Escapes' above in this
14183 routine, but also has an entry here as anything that
14184 isn't explicitly mentioned here will be treated as
14185 an unescaped equivalent literal.
14188 switch ((U8)*++p) {
14190 /* These are all the special escapes. */
14191 case 'A': /* Start assertion */
14192 case 'b': case 'B': /* Word-boundary assertion*/
14193 case 'C': /* Single char !DANGEROUS! */
14194 case 'd': case 'D': /* digit class */
14195 case 'g': case 'G': /* generic-backref, pos assertion */
14196 case 'h': case 'H': /* HORIZWS */
14197 case 'k': case 'K': /* named backref, keep marker */
14198 case 'p': case 'P': /* Unicode property */
14199 case 'R': /* LNBREAK */
14200 case 's': case 'S': /* space class */
14201 case 'v': case 'V': /* VERTWS */
14202 case 'w': case 'W': /* word class */
14203 case 'X': /* eXtended Unicode "combining
14204 character sequence" */
14205 case 'z': case 'Z': /* End of line/string assertion */
14209 /* Anything after here is an escape that resolves to a
14210 literal. (Except digits, which may or may not)
14216 case 'N': /* Handle a single-code point named character. */
14217 RExC_parse = p + 1;
14218 if (! grok_bslash_N(pRExC_state,
14219 NULL, /* Fail if evaluates to
14220 anything other than a
14221 single code point */
14222 &ender, /* The returned single code
14224 NULL, /* Don't need a count of
14225 how many code points */
14230 if (*flagp & NEED_UTF8)
14231 FAIL("panic: grok_bslash_N set NEED_UTF8");
14232 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14234 /* Here, it wasn't a single code point. Go close
14235 * up this EXACTish node. The switch() prior to
14236 * this switch handles the other cases */
14237 RExC_parse = p = oldp;
14241 RExC_parse = parse_start;
14243 /* The \N{} means the pattern, if previously /d,
14244 * becomes /u. That means it can't be an EXACTF node,
14245 * but an EXACTFU */
14246 if (node_type == EXACTF) {
14247 node_type = EXACTFU;
14249 /* If the node already contains something that
14250 * differs between EXACTF and EXACTFU, reparse it
14252 if (! maybe_exactfu) {
14273 ender = ESC_NATIVE;
14281 if (! grok_bslash_o(&p,
14286 (bool) RExC_strict,
14287 FALSE, /* No illegal cp's */
14290 RExC_parse = p; /* going to die anyway; point to
14291 exact spot of failure */
14295 if (message && TO_OUTPUT_WARNINGS(p)) {
14296 warn_non_literal_string(p, packed_warn, message);
14300 if (! grok_bslash_x(&p,
14305 (bool) RExC_strict,
14306 FALSE, /* No illegal cp's */
14309 RExC_parse = p; /* going to die anyway; point
14310 to exact spot of failure */
14314 if (message && TO_OUTPUT_WARNINGS(p)) {
14315 warn_non_literal_string(p, packed_warn, message);
14319 if (ender < 0x100) {
14320 if (RExC_recode_x_to_native) {
14321 ender = LATIN1_TO_NATIVE(ender);
14328 if (! grok_bslash_c(*p, &grok_c_char,
14329 &message, &packed_warn))
14331 /* going to die anyway; point to exact spot of
14333 RExC_parse = p + ((UTF)
14334 ? UTF8_SAFE_SKIP(p, RExC_end)
14339 ender = grok_c_char;
14341 if (message && TO_OUTPUT_WARNINGS(p)) {
14342 warn_non_literal_string(p, packed_warn, message);
14346 case '8': case '9': /* must be a backreference */
14348 /* we have an escape like \8 which cannot be an octal escape
14349 * so we exit the loop, and let the outer loop handle this
14350 * escape which may or may not be a legitimate backref. */
14352 case '1': case '2': case '3':case '4':
14353 case '5': case '6': case '7':
14354 /* When we parse backslash escapes there is ambiguity
14355 * between backreferences and octal escapes. Any escape
14356 * from \1 - \9 is a backreference, any multi-digit
14357 * escape which does not start with 0 and which when
14358 * evaluated as decimal could refer to an already
14359 * parsed capture buffer is a back reference. Anything
14362 * Note this implies that \118 could be interpreted as
14363 * 118 OR as "\11" . "8" depending on whether there
14364 * were 118 capture buffers defined already in the
14367 /* NOTE, RExC_npar is 1 more than the actual number of
14368 * parens we have seen so far, hence the "<" as opposed
14370 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14371 { /* Not to be treated as an octal constant, go
14379 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14380 | PERL_SCAN_NOTIFY_ILLDIGIT;
14382 ender = grok_oct(p, &numlen, &flags, NULL);
14384 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14385 && isDIGIT(*p) /* like \08, \178 */
14386 && ckWARN(WARN_REGEXP))
14388 reg_warn_non_literal_string(
14390 form_alien_digit_msg(8, numlen, p,
14391 RExC_end, UTF, FALSE));
14397 FAIL("Trailing \\");
14400 if (isALPHANUMERIC(*p)) {
14401 /* An alpha followed by '{' is going to fail next
14402 * iteration, so don't output this warning in that
14404 if (! isALPHA(*p) || *(p + 1) != '{') {
14405 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14406 " passed through", p);
14409 goto normal_default;
14410 } /* End of switch on '\' */
14413 /* Trying to gain new uses for '{' without breaking too
14414 * much existing code is hard. The solution currently
14416 * 1) If there is no ambiguity that a '{' should always
14417 * be taken literally, at the start of a construct, we
14419 * 2) If the literal '{' conflicts with our desired use
14420 * of it as a metacharacter, we die. The deprecation
14421 * cycles for this have come and gone.
14422 * 3) If there is ambiguity, we raise a simple warning.
14423 * This could happen, for example, if the user
14424 * intended it to introduce a quantifier, but slightly
14425 * misspelled the quantifier. Without this warning,
14426 * the quantifier would silently be taken as a literal
14427 * string of characters instead of a meta construct */
14428 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14430 || ( p > parse_start + 1
14431 && isALPHA_A(*(p - 1))
14432 && *(p - 2) == '\\')
14433 || new_regcurly(p, RExC_end))
14435 RExC_parse = p + 1;
14436 vFAIL("Unescaped left brace in regex is "
14439 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14440 " passed through");
14442 goto normal_default;
14445 if (p > RExC_parse && RExC_strict) {
14446 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14449 default: /* A literal character */
14451 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14453 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14454 &numlen, UTF8_ALLOW_DEFAULT);
14460 } /* End of switch on the literal */
14462 /* Here, have looked at the literal character, and <ender>
14463 * contains its ordinal; <p> points to the character after it.
14467 REQUIRE_UTF8(flagp);
14468 if ( UNICODE_IS_PERL_EXTENDED(ender)
14469 && TO_OUTPUT_WARNINGS(p))
14471 ckWARN2_non_literal_string(p,
14472 packWARN(WARN_PORTABLE),
14473 PL_extended_cp_format,
14478 /* We need to check if the next non-ignored thing is a
14479 * quantifier. Move <p> to after anything that should be
14480 * ignored, which, as a side effect, positions <p> for the next
14481 * loop iteration */
14482 skip_to_be_ignored_text(pRExC_state, &p,
14483 FALSE /* Don't force to /x */ );
14485 /* If the next thing is a quantifier, it applies to this
14486 * character only, which means that this character has to be in
14487 * its own node and can't just be appended to the string in an
14488 * existing node, so if there are already other characters in
14489 * the node, close the node with just them, and set up to do
14490 * this character again next time through, when it will be the
14491 * only thing in its new node */
14493 next_is_quantifier = LIKELY(p < RExC_end)
14494 && UNLIKELY(ISMULT2(p));
14496 if (next_is_quantifier && LIKELY(len)) {
14501 /* Ready to add 'ender' to the node */
14503 if (! FOLD) { /* The simple case, just append the literal */
14506 /* Don't output if it would overflow */
14507 if (UNLIKELY(len > max_string_len - ((UTF)
14508 ? UVCHR_SKIP(ender)
14515 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14516 *(s++) = (char) ender;
14519 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14520 added_len = (char *) new_s - s;
14521 s = (char *) new_s;
14524 requires_utf8_target = TRUE;
14528 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14530 /* Here are folding under /l, and the code point is
14531 * problematic. If this is the first character in the
14532 * node, change the node type to folding. Otherwise, if
14533 * this is the first problematic character, close up the
14534 * existing node, so can start a new node with this one */
14536 node_type = EXACTFL;
14537 RExC_contains_locale = 1;
14539 else if (node_type == EXACT) {
14544 /* This problematic code point means we can't simplify
14546 maybe_exactfu = FALSE;
14548 /* Here, we are adding a problematic fold character.
14549 * "Problematic" in this context means that its fold isn't
14550 * known until runtime. (The non-problematic code points
14551 * are the above-Latin1 ones that fold to also all
14552 * above-Latin1. Their folds don't vary no matter what the
14553 * locale is.) But here we have characters whose fold
14554 * depends on the locale. We just add in the unfolded
14555 * character, and wait until runtime to fold it */
14556 goto not_fold_common;
14558 else /* regular fold; see if actually is in a fold */
14559 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14561 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14563 /* Here, folding, but the character isn't in a fold.
14565 * Start a new node if previous characters in the node were
14567 if (len && node_type != EXACT) {
14572 /* Here, continuing a node with non-folded characters. Add
14574 goto not_fold_common;
14576 else { /* Here, does participate in some fold */
14578 /* If this is the first character in the node, change its
14579 * type to folding. Otherwise, if this is the first
14580 * folding character in the node, close up the existing
14581 * node, so can start a new node with this one. */
14583 node_type = compute_EXACTish(pRExC_state);
14585 else if (node_type == EXACT) {
14590 if (UTF) { /* Alway use the folded value for UTF-8
14592 if (UVCHR_IS_INVARIANT(ender)) {
14593 if (UNLIKELY(len + 1 > max_string_len)) {
14598 *(s)++ = (U8) toFOLD(ender);
14601 UV folded = _to_uni_fold_flags(
14603 (U8 *) s, /* We have allocated extra space
14604 in 's' so can't run off the
14607 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14608 ? FOLD_FLAGS_NOMIX_ASCII
14610 if (UNLIKELY(len + added_len > max_string_len)) {
14618 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14620 /* U+B5 folds to the MU, so its possible for a
14621 * non-UTF-8 target to match it */
14622 requires_utf8_target = TRUE;
14626 else { /* Here is non-UTF8. */
14628 /* The fold will be one or (rarely) two characters.
14629 * Check that there's room for at least a single one
14630 * before setting any flags, etc. Because otherwise an
14631 * overflowing character could cause a flag to be set
14632 * even though it doesn't end up in this node. (For
14633 * the two character fold, we check again, before
14634 * setting any flags) */
14635 if (UNLIKELY(len + 1 > max_string_len)) {
14640 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14641 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14642 || UNICODE_DOT_DOT_VERSION > 0)
14644 /* On non-ancient Unicodes, check for the only possible
14645 * multi-char fold */
14646 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14648 /* This potential multi-char fold means the node
14649 * can't be simple (because it could match more
14650 * than a single char). And in some cases it will
14651 * match 'ss', so set that flag */
14655 /* It can't change to be an EXACTFU (unless already
14656 * is one). We fold it iff under /u rules. */
14657 if (node_type != EXACTFU) {
14658 maybe_exactfu = FALSE;
14661 if (UNLIKELY(len + 2 > max_string_len)) {
14670 goto done_with_this_char;
14673 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14675 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14677 /* Also, the sequence 'ss' is special when not
14678 * under /u. If the target string is UTF-8, it
14679 * should match SHARP S; otherwise it won't. So,
14680 * here we have to exclude the possibility of this
14681 * node moving to /u.*/
14683 maybe_exactfu = FALSE;
14686 /* Here, the fold will be a single character */
14688 if (UNLIKELY(ender == MICRO_SIGN)) {
14689 has_micro_sign = TRUE;
14691 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14693 /* If the character's fold differs between /d and
14694 * /u, this can't change to be an EXACTFU node */
14695 maybe_exactfu = FALSE;
14698 *(s++) = (DEPENDS_SEMANTICS)
14699 ? (char) toFOLD(ender)
14701 /* Under /u, the fold of any character in
14702 * the 0-255 range happens to be its
14703 * lowercase equivalent, except for LATIN
14704 * SMALL LETTER SHARP S, which was handled
14705 * above, and the MICRO SIGN, whose fold
14706 * requires UTF-8 to represent. */
14707 : (char) toLOWER_L1(ender);
14709 } /* End of adding current character to the node */
14711 done_with_this_char:
14715 if (next_is_quantifier) {
14717 /* Here, the next input is a quantifier, and to get here,
14718 * the current character is the only one in the node. */
14722 } /* End of loop through literal characters */
14724 /* Here we have either exhausted the input or run out of room in
14725 * the node. If the former, we are done. (If we encountered a
14726 * character that can't be in the node, transfer is made directly
14727 * to <loopdone>, and so we wouldn't have fallen off the end of the
14729 if (LIKELY(! overflowed)) {
14733 /* Here we have run out of room. We can grow plain EXACT and
14734 * LEXACT nodes. If the pattern is gigantic enough, though,
14735 * eventually we'll have to artificially chunk the pattern into
14736 * multiple nodes. */
14737 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14738 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14739 Size_t overhead_expansion = 0;
14741 Size_t max_nodes_for_string;
14745 /* Here we couldn't fit the final character in the current
14746 * node, so it will have to be reparsed, no matter what else we
14750 /* If would have overflowed a regular EXACT node, switch
14751 * instead to an LEXACT. The code below is structured so that
14752 * the actual growing code is common to changing from an EXACT
14753 * or just increasing the LEXACT size. This means that we have
14754 * to save the string in the EXACT case before growing, and
14755 * then copy it afterwards to its new location */
14756 if (node_type == EXACT) {
14757 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14758 RExC_emit += overhead_expansion;
14759 Copy(s0, temp, len, char);
14762 /* Ready to grow. If it was a plain EXACT, the string was
14763 * saved, and the first few bytes of it overwritten by adding
14764 * an argument field. We assume, as we do elsewhere in this
14765 * file, that one byte of remaining input will translate into
14766 * one byte of output, and if that's too small, we grow again,
14767 * if too large the excess memory is freed at the end */
14769 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14770 achievable = MIN(max_nodes_for_string,
14771 current_string_nodes + STR_SZ(RExC_end - p));
14772 delta = achievable - current_string_nodes;
14774 /* If there is just no more room, go finish up this chunk of
14780 change_engine_size(pRExC_state, delta + overhead_expansion);
14781 current_string_nodes += delta;
14783 = sizeof(struct regnode) * current_string_nodes;
14784 upper_fill = max_string_len + 1;
14786 /* If the length was small, we know this was originally an
14787 * EXACT node now converted to LEXACT, and the string has to be
14788 * restored. Otherwise the string was untouched. 260 is just
14789 * a number safely above 255 so don't have to worry about
14790 * getting it precise */
14792 node_type = LEXACT;
14793 FILL_NODE(ret, node_type);
14794 s0 = STRING(REGNODE_p(ret));
14795 Copy(temp, s0, len, char);
14799 goto continue_parse;
14802 bool splittable = FALSE;
14803 bool backed_up = FALSE;
14804 char * e; /* should this be U8? */
14805 char * s_start; /* should this be U8? */
14807 /* Here is /i. Running out of room creates a problem if we are
14808 * folding, and the split happens in the middle of a
14809 * multi-character fold, as a match that should have occurred,
14810 * won't, due to the way nodes are matched, and our artificial
14811 * boundary. So back off until we aren't splitting such a
14812 * fold. If there is no such place to back off to, we end up
14813 * taking the entire node as-is. This can happen if the node
14814 * consists entirely of 'f' or entirely of 's' characters (or
14815 * things that fold to them) as 'ff' and 'ss' are
14816 * multi-character folds.
14818 * The Unicode standard says that multi character folds consist
14819 * of either two or three characters. That means we would be
14820 * splitting one if the final character in the node is at the
14821 * beginning of either type, or is the second of a three
14825 * ender is the code point of the character that won't fit
14827 * s points to just beyond the final byte in the node.
14828 * It's where we would place ender if there were
14829 * room, and where in fact we do place ender's fold
14830 * in the code below, as we've over-allocated space
14831 * for s0 (hence s) to allow for this
14832 * e starts at 's' and advances as we append things.
14833 * old_s is the same as 's'. (If ender had fit, 's' would
14834 * have been advanced to beyond it).
14835 * old_old_s points to the beginning byte of the final
14836 * character in the node
14837 * p points to the beginning byte in the input of the
14838 * character beyond 'ender'.
14839 * oldp points to the beginning byte in the input of
14842 * In the case of /il, we haven't folded anything that could be
14843 * affected by the locale. That means only above-Latin1
14844 * characters that fold to other above-latin1 characters get
14845 * folded at compile time. To check where a good place to
14846 * split nodes is, everything in it will have to be folded.
14847 * The boolean 'maybe_exactfu' keeps track in /il if there are
14848 * any unfolded characters in the node. */
14849 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14851 /* If we do need to fold the node, we need a place to store the
14852 * folded copy, and a way to map back to the unfolded original
14854 char * locfold_buf = NULL;
14855 Size_t * loc_correspondence = NULL;
14857 if (! need_to_fold_loc) { /* The normal case. Just
14858 initialize to the actual node */
14861 s = old_old_s; /* Point to the beginning of the final char
14862 that fits in the node */
14866 /* Here, we have filled a /il node, and there are unfolded
14867 * characters in it. If the runtime locale turns out to be
14868 * UTF-8, there are possible multi-character folds, just
14869 * like when not under /l. The node hence can't terminate
14870 * in the middle of such a fold. To determine this, we
14871 * have to create a folded copy of this node. That means
14872 * reparsing the node, folding everything assuming a UTF-8
14873 * locale. (If at runtime it isn't such a locale, the
14874 * actions here wouldn't have been necessary, but we have
14875 * to assume the worst case.) If we find we need to back
14876 * off the folded string, we do so, and then map that
14877 * position back to the original unfolded node, which then
14878 * gets output, truncated at that spot */
14880 char * redo_p = RExC_parse;
14884 /* Allow enough space assuming a single byte input folds to
14885 * a single byte output, plus assume that the two unparsed
14886 * characters (that we may need) fold to the largest number
14887 * of bytes possible, plus extra for one more worst case
14888 * scenario. In the loop below, if we start eating into
14889 * that final spare space, we enlarge this initial space */
14890 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14892 Newxz(locfold_buf, size, char);
14893 Newxz(loc_correspondence, size, Size_t);
14895 /* Redo this node's parse, folding into 'locfold_buf' */
14896 redo_p = RExC_parse;
14897 old_redo_e = redo_e = locfold_buf;
14898 while (redo_p <= oldp) {
14900 old_redo_e = redo_e;
14901 loc_correspondence[redo_e - locfold_buf]
14902 = redo_p - RExC_parse;
14907 (void) _to_utf8_fold_flags((U8 *) redo_p,
14912 redo_e += added_len;
14913 redo_p += UTF8SKIP(redo_p);
14917 /* Note that if this code is run on some ancient
14918 * Unicode versions, SHARP S doesn't fold to 'ss',
14919 * but rather than clutter the code with #ifdef's,
14920 * as is done above, we ignore that possibility.
14921 * This is ok because this code doesn't affect what
14922 * gets matched, but merely where the node gets
14924 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14925 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14935 /* If we're getting so close to the end that a
14936 * worst-case fold in the next character would cause us
14937 * to overflow, increase, assuming one byte output byte
14938 * per one byte input one, plus room for another worst
14940 if ( redo_p <= oldp
14941 && redo_e > locfold_buf + size
14942 - (UTF8_MAXBYTES_CASE + 1))
14944 Size_t new_size = size
14946 + UTF8_MAXBYTES_CASE + 1;
14947 Ptrdiff_t e_offset = redo_e - locfold_buf;
14949 Renew(locfold_buf, new_size, char);
14950 Renew(loc_correspondence, new_size, Size_t);
14953 redo_e = locfold_buf + e_offset;
14957 /* Set so that things are in terms of the folded, temporary
14960 s_start = locfold_buf;
14965 /* Here, we have 's', 's_start' and 'e' set up to point to the
14966 * input that goes into the node, folded.
14968 * If the final character of the node and the fold of ender
14969 * form the first two characters of a three character fold, we
14970 * need to peek ahead at the next (unparsed) character in the
14971 * input to determine if the three actually do form such a
14972 * fold. Just looking at that character is not generally
14973 * sufficient, as it could be, for example, an escape sequence
14974 * that evaluates to something else, and it needs to be folded.
14976 * khw originally thought to just go through the parse loop one
14977 * extra time, but that doesn't work easily as that iteration
14978 * could cause things to think that the parse is over and to
14979 * goto loopdone. The character could be a '$' for example, or
14980 * the character beyond could be a quantifier, and other
14981 * glitches as well.
14983 * The solution used here for peeking ahead is to look at that
14984 * next character. If it isn't ASCII punctuation, then it will
14985 * be something that continues in an EXACTish node if there
14986 * were space. We append the fold of it to s, having reserved
14987 * enough room in s0 for the purpose. If we can't reasonably
14988 * peek ahead, we instead assume the worst case: that it is
14989 * something that would form the completion of a multi-char
14992 * If we can't split between s and ender, we work backwards
14993 * character-by-character down to s0. At each current point
14994 * see if we are at the beginning of a multi-char fold. If so,
14995 * that means we would be splitting the fold across nodes, and
14996 * so we back up one and try again.
14998 * If we're not at the beginning, we still could be at the
14999 * final two characters of a (rare) three character fold. We
15000 * check if the sequence starting at the character before the
15001 * current position (and including the current and next
15002 * characters) is a three character fold. If not, the node can
15003 * be split here. If it is, we have to backup two characters
15006 * Otherwise, the node can be split at the current position.
15008 * The same logic is used for UTF-8 patterns and not */
15012 /* Append the fold of ender */
15013 (void) _to_uni_fold_flags(
15017 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15018 ? FOLD_FLAGS_NOMIX_ASCII
15022 /* 's' and the character folded to by ender may be the
15023 * first two of a three-character fold, in which case the
15024 * node should not be split here. That may mean examining
15025 * the so-far unparsed character starting at 'p'. But if
15026 * ender folded to more than one character, we already have
15027 * three characters to look at. Also, we first check if
15028 * the sequence consisting of s and the next character form
15029 * the first two of some three character fold. If not,
15030 * there's no need to peek ahead. */
15031 if ( added_len <= UTF8SKIP(e - added_len)
15032 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15034 /* Here, the two do form the beginning of a potential
15035 * three character fold. The unexamined character may
15036 * or may not complete it. Peek at it. It might be
15037 * something that ends the node or an escape sequence,
15038 * in which case we don't know without a lot of work
15039 * what it evaluates to, so we have to assume the worst
15040 * case: that it does complete the fold, and so we
15041 * can't split here. All such instances will have
15042 * that character be an ASCII punctuation character,
15043 * like a backslash. So, for that case, backup one and
15044 * drop down to try at that position */
15046 s = (char *) utf8_hop_back((U8 *) s, -1,
15051 /* Here, since it's not punctuation, it must be a
15052 * real character, and we can append its fold to
15053 * 'e' (having deliberately reserved enough space
15054 * for this eventuality) and drop down to check if
15055 * the three actually do form a folded sequence */
15056 (void) _to_utf8_fold_flags(
15057 (U8 *) p, (U8 *) RExC_end,
15060 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15061 ? FOLD_FLAGS_NOMIX_ASCII
15067 /* Here, we either have three characters available in
15068 * sequence starting at 's', or we have two characters and
15069 * know that the following one can't possibly be part of a
15070 * three character fold. We go through the node backwards
15071 * until we find a place where we can split it without
15072 * breaking apart a multi-character fold. At any given
15073 * point we have to worry about if such a fold begins at
15074 * the current 's', and also if a three-character fold
15075 * begins at s-1, (containing s and s+1). Splitting in
15076 * either case would break apart a fold */
15078 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15081 /* If is a multi-char fold, can't split here. Backup
15082 * one char and try again */
15083 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15089 /* If the two characters beginning at 's' are part of a
15090 * three character fold starting at the character
15091 * before s, we can't split either before or after s.
15092 * Backup two chars and try again */
15093 if ( LIKELY(s > s_start)
15094 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15097 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15102 /* Here there's no multi-char fold between s and the
15103 * next character following it. We can split */
15107 } while (s > s_start); /* End of loops backing up through the node */
15109 /* Here we either couldn't find a place to split the node,
15110 * or else we broke out of the loop setting 'splittable' to
15111 * true. In the latter case, the place to split is between
15112 * the first and second characters in the sequence starting
15118 else { /* Pattern not UTF-8 */
15119 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15120 || ASCII_FOLD_RESTRICTED)
15122 assert( toLOWER_L1(ender) < 256 );
15123 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15131 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15138 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15139 || ASCII_FOLD_RESTRICTED)
15141 assert( toLOWER_L1(ender) < 256 );
15142 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15152 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15158 if ( LIKELY(s > s_start)
15159 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15169 } while (s > s_start);
15176 /* Here, we are done backing up. If we didn't backup at all
15177 * (the likely case), just proceed */
15180 /* If we did find a place to split, reparse the entire node
15181 * stopping where we have calculated. */
15184 /* If we created a temporary folded string under /l, we
15185 * have to map that back to the original */
15186 if (need_to_fold_loc) {
15187 upper_fill = loc_correspondence[s - s_start];
15188 if (upper_fill == 0) {
15189 FAIL2("panic: loc_correspondence[%d] is 0",
15190 (int) (s - s_start));
15192 Safefree(locfold_buf);
15193 Safefree(loc_correspondence);
15196 upper_fill = s - s0;
15201 /* Here the node consists entirely of non-final multi-char
15202 * folds. (Likely it is all 'f's or all 's's.) There's no
15203 * decent place to split it, so give up and just take the
15208 if (need_to_fold_loc) {
15209 Safefree(locfold_buf);
15210 Safefree(loc_correspondence);
15212 } /* End of verifying node ends with an appropriate char */
15214 /* We need to start the next node at the character that didn't fit
15218 loopdone: /* Jumped to when encounters something that shouldn't be
15221 /* Free up any over-allocated space; cast is to silence bogus
15222 * warning in MS VC */
15223 change_engine_size(pRExC_state,
15224 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15226 /* I (khw) don't know if you can get here with zero length, but the
15227 * old code handled this situation by creating a zero-length EXACT
15228 * node. Might as well be NOTHING instead */
15230 OP(REGNODE_p(ret)) = NOTHING;
15234 /* If the node type is EXACT here, check to see if it
15235 * should be EXACTL, or EXACT_REQ8. */
15236 if (node_type == EXACT) {
15238 node_type = EXACTL;
15240 else if (requires_utf8_target) {
15241 node_type = EXACT_REQ8;
15244 else if (node_type == LEXACT) {
15245 if (requires_utf8_target) {
15246 node_type = LEXACT_REQ8;
15250 if ( UNLIKELY(has_micro_sign || has_ss)
15251 && (node_type == EXACTFU || ( node_type == EXACTF
15252 && maybe_exactfu)))
15253 { /* These two conditions are problematic in non-UTF-8
15256 node_type = EXACTFUP;
15258 else if (node_type == EXACTFL) {
15260 /* 'maybe_exactfu' is deliberately set above to
15261 * indicate this node type, where all code points in it
15263 if (maybe_exactfu) {
15264 node_type = EXACTFLU8;
15267 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15269 /* A character that folds to more than one will
15270 * match multiple characters, so can't be SIMPLE.
15271 * We don't have to worry about this with EXACTFLU8
15272 * nodes just above, as they have already been
15273 * folded (since the fold doesn't vary at run
15274 * time). Here, if the final character in the node
15275 * folds to multiple, it can't be simple. (This
15276 * only has an effect if the node has only a single
15277 * character, hence the final one, as elsewhere we
15278 * turn off simple for nodes whose length > 1 */
15282 else if (node_type == EXACTF) { /* Means is /di */
15284 /* This intermediate variable is needed solely because
15285 * the asserts in the macro where used exceed Win32's
15286 * literal string capacity */
15287 char first_char = * STRING(REGNODE_p(ret));
15289 /* If 'maybe_exactfu' is clear, then we need to stay
15290 * /di. If it is set, it means there are no code
15291 * points that match differently depending on UTF8ness
15292 * of the target string, so it can become an EXACTFU
15294 if (! maybe_exactfu) {
15295 RExC_seen_d_op = TRUE;
15297 else if ( isALPHA_FOLD_EQ(first_char, 's')
15298 || isALPHA_FOLD_EQ(ender, 's'))
15300 /* But, if the node begins or ends in an 's' we
15301 * have to defer changing it into an EXACTFU, as
15302 * the node could later get joined with another one
15303 * that ends or begins with 's' creating an 'ss'
15304 * sequence which would then wrongly match the
15305 * sharp s without the target being UTF-8. We
15306 * create a special node that we resolve later when
15307 * we join nodes together */
15309 node_type = EXACTFU_S_EDGE;
15312 node_type = EXACTFU;
15316 if (requires_utf8_target && node_type == EXACTFU) {
15317 node_type = EXACTFU_REQ8;
15321 OP(REGNODE_p(ret)) = node_type;
15322 setSTR_LEN(REGNODE_p(ret), len);
15323 RExC_emit += STR_SZ(len);
15325 /* If the node isn't a single character, it can't be SIMPLE */
15326 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15330 *flagp |= HASWIDTH | maybe_SIMPLE;
15333 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15337 /* len is STRLEN which is unsigned, need to copy to signed */
15340 vFAIL("Internal disaster");
15343 } /* End of label 'defchar:' */
15345 } /* End of giant switch on input character */
15347 /* Position parse to next real character */
15348 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15349 FALSE /* Don't force to /x */ );
15350 if ( *RExC_parse == '{'
15351 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15353 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15355 vFAIL("Unescaped left brace in regex is illegal here");
15357 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15358 " passed through");
15366 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15368 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15369 * sets up the bitmap and any flags, removing those code points from the
15370 * inversion list, setting it to NULL should it become completely empty */
15373 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15374 assert(PL_regkind[OP(node)] == ANYOF);
15376 /* There is no bitmap for this node type */
15377 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15381 ANYOF_BITMAP_ZERO(node);
15382 if (*invlist_ptr) {
15384 /* This gets set if we actually need to modify things */
15385 bool change_invlist = FALSE;
15389 /* Start looking through *invlist_ptr */
15390 invlist_iterinit(*invlist_ptr);
15391 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15395 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15396 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15399 /* Quit if are above what we should change */
15400 if (start >= NUM_ANYOF_CODE_POINTS) {
15404 change_invlist = TRUE;
15406 /* Set all the bits in the range, up to the max that we are doing */
15407 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15409 : NUM_ANYOF_CODE_POINTS - 1;
15410 for (i = start; i <= (int) high; i++) {
15411 ANYOF_BITMAP_SET(node, i);
15414 invlist_iterfinish(*invlist_ptr);
15416 /* Done with loop; remove any code points that are in the bitmap from
15417 * *invlist_ptr; similarly for code points above the bitmap if we have
15418 * a flag to match all of them anyways */
15419 if (change_invlist) {
15420 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15422 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15423 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15426 /* If have completely emptied it, remove it completely */
15427 if (_invlist_len(*invlist_ptr) == 0) {
15428 SvREFCNT_dec_NN(*invlist_ptr);
15429 *invlist_ptr = NULL;
15434 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15435 Character classes ([:foo:]) can also be negated ([:^foo:]).
15436 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15437 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15438 but trigger failures because they are currently unimplemented. */
15440 #define POSIXCC_DONE(c) ((c) == ':')
15441 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15442 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15443 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15445 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15446 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15447 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15449 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15451 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15453 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15454 if (posix_warnings) { \
15455 if (! RExC_warn_text ) RExC_warn_text = \
15456 (AV *) sv_2mortal((SV *) newAV()); \
15457 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15461 REPORT_LOCATION_ARGS(p))); \
15464 #define CLEAR_POSIX_WARNINGS() \
15466 if (posix_warnings && RExC_warn_text) \
15467 av_clear(RExC_warn_text); \
15470 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15472 CLEAR_POSIX_WARNINGS(); \
15477 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15479 const char * const s, /* Where the putative posix class begins.
15480 Normally, this is one past the '['. This
15481 parameter exists so it can be somewhere
15482 besides RExC_parse. */
15483 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15485 AV ** posix_warnings, /* Where to place any generated warnings, or
15487 const bool check_only /* Don't die if error */
15490 /* This parses what the caller thinks may be one of the three POSIX
15492 * 1) a character class, like [:blank:]
15493 * 2) a collating symbol, like [. .]
15494 * 3) an equivalence class, like [= =]
15495 * In the latter two cases, it croaks if it finds a syntactically legal
15496 * one, as these are not handled by Perl.
15498 * The main purpose is to look for a POSIX character class. It returns:
15499 * a) the class number
15500 * if it is a completely syntactically and semantically legal class.
15501 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15502 * closing ']' of the class
15503 * b) OOB_NAMEDCLASS
15504 * if it appears that one of the three POSIX constructs was meant, but
15505 * its specification was somehow defective. 'updated_parse_ptr', if
15506 * not NULL, is set to point to the character just after the end
15507 * character of the class. See below for handling of warnings.
15508 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15509 * if it doesn't appear that a POSIX construct was intended.
15510 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15513 * In b) there may be errors or warnings generated. If 'check_only' is
15514 * TRUE, then any errors are discarded. Warnings are returned to the
15515 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15516 * instead it is NULL, warnings are suppressed.
15518 * The reason for this function, and its complexity is that a bracketed
15519 * character class can contain just about anything. But it's easy to
15520 * mistype the very specific posix class syntax but yielding a valid
15521 * regular bracketed class, so it silently gets compiled into something
15522 * quite unintended.
15524 * The solution adopted here maintains backward compatibility except that
15525 * it adds a warning if it looks like a posix class was intended but
15526 * improperly specified. The warning is not raised unless what is input
15527 * very closely resembles one of the 14 legal posix classes. To do this,
15528 * it uses fuzzy parsing. It calculates how many single-character edits it
15529 * would take to transform what was input into a legal posix class. Only
15530 * if that number is quite small does it think that the intention was a
15531 * posix class. Obviously these are heuristics, and there will be cases
15532 * where it errs on one side or another, and they can be tweaked as
15533 * experience informs.
15535 * The syntax for a legal posix class is:
15537 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15539 * What this routine considers syntactically to be an intended posix class
15540 * is this (the comments indicate some restrictions that the pattern
15543 * qr/(?x: \[? # The left bracket, possibly
15545 * \h* # possibly followed by blanks
15546 * (?: \^ \h* )? # possibly a misplaced caret
15547 * [:;]? # The opening class character,
15548 * # possibly omitted. A typo
15549 * # semi-colon can also be used.
15551 * \^? # possibly a correctly placed
15552 * # caret, but not if there was also
15553 * # a misplaced one
15555 * .{3,15} # The class name. If there are
15556 * # deviations from the legal syntax,
15557 * # its edit distance must be close
15558 * # to a real class name in order
15559 * # for it to be considered to be
15560 * # an intended posix class.
15562 * [[:punct:]]? # The closing class character,
15563 * # possibly omitted. If not a colon
15564 * # nor semi colon, the class name
15565 * # must be even closer to a valid
15568 * \]? # The right bracket, possibly
15572 * In the above, \h must be ASCII-only.
15574 * These are heuristics, and can be tweaked as field experience dictates.
15575 * There will be cases when someone didn't intend to specify a posix class
15576 * that this warns as being so. The goal is to minimize these, while
15577 * maximizing the catching of things intended to be a posix class that
15578 * aren't parsed as such.
15582 const char * const e = RExC_end;
15583 unsigned complement = 0; /* If to complement the class */
15584 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15585 bool has_opening_bracket = FALSE;
15586 bool has_opening_colon = FALSE;
15587 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15589 const char * possible_end = NULL; /* used for a 2nd parse pass */
15590 const char* name_start; /* ptr to class name first char */
15592 /* If the number of single-character typos the input name is away from a
15593 * legal name is no more than this number, it is considered to have meant
15594 * the legal name */
15595 int max_distance = 2;
15597 /* to store the name. The size determines the maximum length before we
15598 * decide that no posix class was intended. Should be at least
15599 * sizeof("alphanumeric") */
15601 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15603 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15605 CLEAR_POSIX_WARNINGS();
15608 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15611 if (*(p - 1) != '[') {
15612 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15613 found_problem = TRUE;
15616 has_opening_bracket = TRUE;
15619 /* They could be confused and think you can put spaces between the
15622 found_problem = TRUE;
15626 } while (p < e && isBLANK(*p));
15628 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15631 /* For [. .] and [= =]. These are quite different internally from [: :],
15632 * so they are handled separately. */
15633 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15634 and 1 for at least one char in it
15637 const char open_char = *p;
15638 const char * temp_ptr = p + 1;
15640 /* These two constructs are not handled by perl, and if we find a
15641 * syntactically valid one, we croak. khw, who wrote this code, finds
15642 * this explanation of them very unclear:
15643 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15644 * And searching the rest of the internet wasn't very helpful either.
15645 * It looks like just about any byte can be in these constructs,
15646 * depending on the locale. But unless the pattern is being compiled
15647 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15648 * In that case, it looks like [= =] isn't allowed at all, and that
15649 * [. .] could be any single code point, but for longer strings the
15650 * constituent characters would have to be the ASCII alphabetics plus
15651 * the minus-hyphen. Any sensible locale definition would limit itself
15652 * to these. And any portable one definitely should. Trying to parse
15653 * the general case is a nightmare (see [perl #127604]). So, this code
15654 * looks only for interiors of these constructs that match:
15656 * Using \w relaxes the apparent rules a little, without adding much
15657 * danger of mistaking something else for one of these constructs.
15659 * [. .] in some implementations described on the internet is usable to
15660 * escape a character that otherwise is special in bracketed character
15661 * classes. For example [.].] means a literal right bracket instead of
15662 * the ending of the class
15664 * [= =] can legitimately contain a [. .] construct, but we don't
15665 * handle this case, as that [. .] construct will later get parsed
15666 * itself and croak then. And [= =] is checked for even when not under
15667 * /l, as Perl has long done so.
15669 * The code below relies on there being a trailing NUL, so it doesn't
15670 * have to keep checking if the parse ptr < e.
15672 if (temp_ptr[1] == open_char) {
15675 else while ( temp_ptr < e
15676 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15681 if (*temp_ptr == open_char) {
15683 if (*temp_ptr == ']') {
15685 if (! found_problem && ! check_only) {
15686 RExC_parse = (char *) temp_ptr;
15687 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15688 "extensions", open_char, open_char);
15691 /* Here, the syntax wasn't completely valid, or else the call
15692 * is to check-only */
15693 if (updated_parse_ptr) {
15694 *updated_parse_ptr = (char *) temp_ptr;
15697 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15701 /* If we find something that started out to look like one of these
15702 * constructs, but isn't, we continue below so that it can be checked
15703 * for being a class name with a typo of '.' or '=' instead of a colon.
15707 /* Here, we think there is a possibility that a [: :] class was meant, and
15708 * we have the first real character. It could be they think the '^' comes
15711 found_problem = TRUE;
15712 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15717 found_problem = TRUE;
15721 } while (p < e && isBLANK(*p));
15723 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15727 /* But the first character should be a colon, which they could have easily
15728 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15729 * distinguish from a colon, so treat that as a colon). */
15732 has_opening_colon = TRUE;
15734 else if (*p == ';') {
15735 found_problem = TRUE;
15737 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15738 has_opening_colon = TRUE;
15741 found_problem = TRUE;
15742 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15744 /* Consider an initial punctuation (not one of the recognized ones) to
15745 * be a left terminator */
15746 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15751 /* They may think that you can put spaces between the components */
15753 found_problem = TRUE;
15757 } while (p < e && isBLANK(*p));
15759 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15764 /* We consider something like [^:^alnum:]] to not have been intended to
15765 * be a posix class, but XXX maybe we should */
15767 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15774 /* Again, they may think that you can put spaces between the components */
15776 found_problem = TRUE;
15780 } while (p < e && isBLANK(*p));
15782 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15787 /* XXX This ']' may be a typo, and something else was meant. But
15788 * treating it as such creates enough complications, that that
15789 * possibility isn't currently considered here. So we assume that the
15790 * ']' is what is intended, and if we've already found an initial '[',
15791 * this leaves this construct looking like [:] or [:^], which almost
15792 * certainly weren't intended to be posix classes */
15793 if (has_opening_bracket) {
15794 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15797 /* But this function can be called when we parse the colon for
15798 * something like qr/[alpha:]]/, so we back up to look for the
15803 found_problem = TRUE;
15804 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15806 else if (*p != ':') {
15808 /* XXX We are currently very restrictive here, so this code doesn't
15809 * consider the possibility that, say, /[alpha.]]/ was intended to
15810 * be a posix class. */
15811 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15814 /* Here we have something like 'foo:]'. There was no initial colon,
15815 * and we back up over 'foo. XXX Unlike the going forward case, we
15816 * don't handle typos of non-word chars in the middle */
15817 has_opening_colon = FALSE;
15820 while (p > RExC_start && isWORDCHAR(*p)) {
15825 /* Here, we have positioned ourselves to where we think the first
15826 * character in the potential class is */
15829 /* Now the interior really starts. There are certain key characters that
15830 * can end the interior, or these could just be typos. To catch both
15831 * cases, we may have to do two passes. In the first pass, we keep on
15832 * going unless we come to a sequence that matches
15833 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15834 * This means it takes a sequence to end the pass, so two typos in a row if
15835 * that wasn't what was intended. If the class is perfectly formed, just
15836 * this one pass is needed. We also stop if there are too many characters
15837 * being accumulated, but this number is deliberately set higher than any
15838 * real class. It is set high enough so that someone who thinks that
15839 * 'alphanumeric' is a correct name would get warned that it wasn't.
15840 * While doing the pass, we keep track of where the key characters were in
15841 * it. If we don't find an end to the class, and one of the key characters
15842 * was found, we redo the pass, but stop when we get to that character.
15843 * Thus the key character was considered a typo in the first pass, but a
15844 * terminator in the second. If two key characters are found, we stop at
15845 * the second one in the first pass. Again this can miss two typos, but
15846 * catches a single one
15848 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15849 * point to the first key character. For the second pass, it starts as -1.
15855 bool has_blank = FALSE;
15856 bool has_upper = FALSE;
15857 bool has_terminating_colon = FALSE;
15858 bool has_terminating_bracket = FALSE;
15859 bool has_semi_colon = FALSE;
15860 unsigned int name_len = 0;
15861 int punct_count = 0;
15865 /* Squeeze out blanks when looking up the class name below */
15866 if (isBLANK(*p) ) {
15868 found_problem = TRUE;
15873 /* The name will end with a punctuation */
15875 const char * peek = p + 1;
15877 /* Treat any non-']' punctuation followed by a ']' (possibly
15878 * with intervening blanks) as trying to terminate the class.
15879 * ']]' is very likely to mean a class was intended (but
15880 * missing the colon), but the warning message that gets
15881 * generated shows the error position better if we exit the
15882 * loop at the bottom (eventually), so skip it here. */
15884 if (peek < e && isBLANK(*peek)) {
15886 found_problem = TRUE;
15889 } while (peek < e && isBLANK(*peek));
15892 if (peek < e && *peek == ']') {
15893 has_terminating_bracket = TRUE;
15895 has_terminating_colon = TRUE;
15897 else if (*p == ';') {
15898 has_semi_colon = TRUE;
15899 has_terminating_colon = TRUE;
15902 found_problem = TRUE;
15909 /* Here we have punctuation we thought didn't end the class.
15910 * Keep track of the position of the key characters that are
15911 * more likely to have been class-enders */
15912 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15914 /* Allow just one such possible class-ender not actually
15915 * ending the class. */
15916 if (possible_end) {
15922 /* If we have too many punctuation characters, no use in
15924 if (++punct_count > max_distance) {
15928 /* Treat the punctuation as a typo. */
15929 input_text[name_len++] = *p;
15932 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15933 input_text[name_len++] = toLOWER(*p);
15935 found_problem = TRUE;
15937 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15938 input_text[name_len++] = *p;
15942 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15946 /* The declaration of 'input_text' is how long we allow a potential
15947 * class name to be, before saying they didn't mean a class name at
15949 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15954 /* We get to here when the possible class name hasn't been properly
15955 * terminated before:
15956 * 1) we ran off the end of the pattern; or
15957 * 2) found two characters, each of which might have been intended to
15958 * be the name's terminator
15959 * 3) found so many punctuation characters in the purported name,
15960 * that the edit distance to a valid one is exceeded
15961 * 4) we decided it was more characters than anyone could have
15962 * intended to be one. */
15964 found_problem = TRUE;
15966 /* In the final two cases, we know that looking up what we've
15967 * accumulated won't lead to a match, even a fuzzy one. */
15968 if ( name_len >= C_ARRAY_LENGTH(input_text)
15969 || punct_count > max_distance)
15971 /* If there was an intermediate key character that could have been
15972 * an intended end, redo the parse, but stop there */
15973 if (possible_end && possible_end != (char *) -1) {
15974 possible_end = (char *) -1; /* Special signal value to say
15975 we've done a first pass */
15980 /* Otherwise, it can't have meant to have been a class */
15981 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15984 /* If we ran off the end, and the final character was a punctuation
15985 * one, back up one, to look at that final one just below. Later, we
15986 * will restore the parse pointer if appropriate */
15987 if (name_len && p == e && isPUNCT(*(p-1))) {
15992 if (p < e && isPUNCT(*p)) {
15994 has_terminating_bracket = TRUE;
15996 /* If this is a 2nd ']', and the first one is just below this
15997 * one, consider that to be the real terminator. This gives a
15998 * uniform and better positioning for the warning message */
16000 && possible_end != (char *) -1
16001 && *possible_end == ']'
16002 && name_len && input_text[name_len - 1] == ']')
16007 /* And this is actually equivalent to having done the 2nd
16008 * pass now, so set it to not try again */
16009 possible_end = (char *) -1;
16014 has_terminating_colon = TRUE;
16016 else if (*p == ';') {
16017 has_semi_colon = TRUE;
16018 has_terminating_colon = TRUE;
16026 /* Here, we have a class name to look up. We can short circuit the
16027 * stuff below for short names that can't possibly be meant to be a
16028 * class name. (We can do this on the first pass, as any second pass
16029 * will yield an even shorter name) */
16030 if (name_len < 3) {
16031 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16034 /* Find which class it is. Initially switch on the length of the name.
16036 switch (name_len) {
16038 if (memEQs(name_start, 4, "word")) {
16039 /* this is not POSIX, this is the Perl \w */
16040 class_number = ANYOF_WORDCHAR;
16044 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16045 * graph lower print punct space upper
16046 * Offset 4 gives the best switch position. */
16047 switch (name_start[4]) {
16049 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16050 class_number = ANYOF_ALPHA;
16053 if (memBEGINs(name_start, 5, "spac")) /* space */
16054 class_number = ANYOF_SPACE;
16057 if (memBEGINs(name_start, 5, "grap")) /* graph */
16058 class_number = ANYOF_GRAPH;
16061 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16062 class_number = ANYOF_ASCII;
16065 if (memBEGINs(name_start, 5, "blan")) /* blank */
16066 class_number = ANYOF_BLANK;
16069 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16070 class_number = ANYOF_CNTRL;
16073 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16074 class_number = ANYOF_ALPHANUMERIC;
16077 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16078 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16079 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16080 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16083 if (memBEGINs(name_start, 5, "digi")) /* digit */
16084 class_number = ANYOF_DIGIT;
16085 else if (memBEGINs(name_start, 5, "prin")) /* print */
16086 class_number = ANYOF_PRINT;
16087 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16088 class_number = ANYOF_PUNCT;
16093 if (memEQs(name_start, 6, "xdigit"))
16094 class_number = ANYOF_XDIGIT;
16098 /* If the name exactly matches a posix class name the class number will
16099 * here be set to it, and the input almost certainly was meant to be a
16100 * posix class, so we can skip further checking. If instead the syntax
16101 * is exactly correct, but the name isn't one of the legal ones, we
16102 * will return that as an error below. But if neither of these apply,
16103 * it could be that no posix class was intended at all, or that one
16104 * was, but there was a typo. We tease these apart by doing fuzzy
16105 * matching on the name */
16106 if (class_number == OOB_NAMEDCLASS && found_problem) {
16107 const UV posix_names[][6] = {
16108 { 'a', 'l', 'n', 'u', 'm' },
16109 { 'a', 'l', 'p', 'h', 'a' },
16110 { 'a', 's', 'c', 'i', 'i' },
16111 { 'b', 'l', 'a', 'n', 'k' },
16112 { 'c', 'n', 't', 'r', 'l' },
16113 { 'd', 'i', 'g', 'i', 't' },
16114 { 'g', 'r', 'a', 'p', 'h' },
16115 { 'l', 'o', 'w', 'e', 'r' },
16116 { 'p', 'r', 'i', 'n', 't' },
16117 { 'p', 'u', 'n', 'c', 't' },
16118 { 's', 'p', 'a', 'c', 'e' },
16119 { 'u', 'p', 'p', 'e', 'r' },
16120 { 'w', 'o', 'r', 'd' },
16121 { 'x', 'd', 'i', 'g', 'i', 't' }
16123 /* The names of the above all have added NULs to make them the same
16124 * size, so we need to also have the real lengths */
16125 const UV posix_name_lengths[] = {
16126 sizeof("alnum") - 1,
16127 sizeof("alpha") - 1,
16128 sizeof("ascii") - 1,
16129 sizeof("blank") - 1,
16130 sizeof("cntrl") - 1,
16131 sizeof("digit") - 1,
16132 sizeof("graph") - 1,
16133 sizeof("lower") - 1,
16134 sizeof("print") - 1,
16135 sizeof("punct") - 1,
16136 sizeof("space") - 1,
16137 sizeof("upper") - 1,
16138 sizeof("word") - 1,
16139 sizeof("xdigit")- 1
16142 int temp_max = max_distance; /* Use a temporary, so if we
16143 reparse, we haven't changed the
16146 /* Use a smaller max edit distance if we are missing one of the
16148 if ( has_opening_bracket + has_opening_colon < 2
16149 || has_terminating_bracket + has_terminating_colon < 2)
16154 /* See if the input name is close to a legal one */
16155 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16157 /* Short circuit call if the lengths are too far apart to be
16159 if (abs( (int) (name_len - posix_name_lengths[i]))
16165 if (edit_distance(input_text,
16168 posix_name_lengths[i],
16172 { /* If it is close, it probably was intended to be a class */
16173 goto probably_meant_to_be;
16177 /* Here the input name is not close enough to a valid class name
16178 * for us to consider it to be intended to be a posix class. If
16179 * we haven't already done so, and the parse found a character that
16180 * could have been terminators for the name, but which we absorbed
16181 * as typos during the first pass, repeat the parse, signalling it
16182 * to stop at that character */
16183 if (possible_end && possible_end != (char *) -1) {
16184 possible_end = (char *) -1;
16189 /* Here neither pass found a close-enough class name */
16190 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16193 probably_meant_to_be:
16195 /* Here we think that a posix specification was intended. Update any
16197 if (updated_parse_ptr) {
16198 *updated_parse_ptr = (char *) p;
16201 /* If a posix class name was intended but incorrectly specified, we
16202 * output or return the warnings */
16203 if (found_problem) {
16205 /* We set flags for these issues in the parse loop above instead of
16206 * adding them to the list of warnings, because we can parse it
16207 * twice, and we only want one warning instance */
16209 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16212 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16214 if (has_semi_colon) {
16215 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16217 else if (! has_terminating_colon) {
16218 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16220 if (! has_terminating_bracket) {
16221 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16224 if ( posix_warnings
16226 && av_count(RExC_warn_text) > 0)
16228 *posix_warnings = RExC_warn_text;
16231 else if (class_number != OOB_NAMEDCLASS) {
16232 /* If it is a known class, return the class. The class number
16233 * #defines are structured so each complement is +1 to the normal
16235 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16237 else if (! check_only) {
16239 /* Here, it is an unrecognized class. This is an error (unless the
16240 * call is to check only, which we've already handled above) */
16241 const char * const complement_string = (complement)
16244 RExC_parse = (char *) p;
16245 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16247 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16251 return OOB_NAMEDCLASS;
16253 #undef ADD_POSIX_WARNING
16255 STATIC unsigned int
16256 S_regex_set_precedence(const U8 my_operator) {
16258 /* Returns the precedence in the (?[...]) construct of the input operator,
16259 * specified by its character representation. The precedence follows
16260 * general Perl rules, but it extends this so that ')' and ']' have (low)
16261 * precedence even though they aren't really operators */
16263 switch (my_operator) {
16279 NOT_REACHED; /* NOTREACHED */
16280 return 0; /* Silence compiler warning */
16283 STATIC regnode_offset
16284 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16285 I32 *flagp, U32 depth,
16286 char * const oregcomp_parse)
16288 /* Handle the (?[...]) construct to do set operations */
16290 U8 curchar; /* Current character being parsed */
16291 UV start, end; /* End points of code point ranges */
16292 SV* final = NULL; /* The end result inversion list */
16293 SV* result_string; /* 'final' stringified */
16294 AV* stack; /* stack of operators and operands not yet
16296 AV* fence_stack = NULL; /* A stack containing the positions in
16297 'stack' of where the undealt-with left
16298 parens would be if they were actually
16300 /* The 'volatile' is a workaround for an optimiser bug
16301 * in Solaris Studio 12.3. See RT #127455 */
16302 volatile IV fence = 0; /* Position of where most recent undealt-
16303 with left paren in stack is; -1 if none.
16305 STRLEN len; /* Temporary */
16306 regnode_offset node; /* Temporary, and final regnode returned by
16308 const bool save_fold = FOLD; /* Temporary */
16309 char *save_end, *save_parse; /* Temporaries */
16310 const bool in_locale = LOC; /* we turn off /l during processing */
16312 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16314 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16315 PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16317 DEBUG_PARSE("xcls");
16320 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16323 /* The use of this operator implies /u. This is required so that the
16324 * compile time values are valid in all runtime cases */
16325 REQUIRE_UNI_RULES(flagp, 0);
16327 ckWARNexperimental(RExC_parse,
16328 WARN_EXPERIMENTAL__REGEX_SETS,
16329 "The regex_sets feature is experimental");
16331 /* Everything in this construct is a metacharacter. Operands begin with
16332 * either a '\' (for an escape sequence), or a '[' for a bracketed
16333 * character class. Any other character should be an operator, or
16334 * parenthesis for grouping. Both types of operands are handled by calling
16335 * regclass() to parse them. It is called with a parameter to indicate to
16336 * return the computed inversion list. The parsing here is implemented via
16337 * a stack. Each entry on the stack is a single character representing one
16338 * of the operators; or else a pointer to an operand inversion list. */
16340 #define IS_OPERATOR(a) SvIOK(a)
16341 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16343 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16344 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16345 * with pronouncing it called it Reverse Polish instead, but now that YOU
16346 * know how to pronounce it you can use the correct term, thus giving due
16347 * credit to the person who invented it, and impressing your geek friends.
16348 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16349 * it is now more like an English initial W (as in wonk) than an L.)
16351 * This means that, for example, 'a | b & c' is stored on the stack as
16359 * where the numbers in brackets give the stack [array] element number.
16360 * In this implementation, parentheses are not stored on the stack.
16361 * Instead a '(' creates a "fence" so that the part of the stack below the
16362 * fence is invisible except to the corresponding ')' (this allows us to
16363 * replace testing for parens, by using instead subtraction of the fence
16364 * position). As new operands are processed they are pushed onto the stack
16365 * (except as noted in the next paragraph). New operators of higher
16366 * precedence than the current final one are inserted on the stack before
16367 * the lhs operand (so that when the rhs is pushed next, everything will be
16368 * in the correct positions shown above. When an operator of equal or
16369 * lower precedence is encountered in parsing, all the stacked operations
16370 * of equal or higher precedence are evaluated, leaving the result as the
16371 * top entry on the stack. This makes higher precedence operations
16372 * evaluate before lower precedence ones, and causes operations of equal
16373 * precedence to left associate.
16375 * The only unary operator '!' is immediately pushed onto the stack when
16376 * encountered. When an operand is encountered, if the top of the stack is
16377 * a '!", the complement is immediately performed, and the '!' popped. The
16378 * resulting value is treated as a new operand, and the logic in the
16379 * previous paragraph is executed. Thus in the expression
16381 * the stack looks like
16387 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16394 * A ')' is treated as an operator with lower precedence than all the
16395 * aforementioned ones, which causes all operations on the stack above the
16396 * corresponding '(' to be evaluated down to a single resultant operand.
16397 * Then the fence for the '(' is removed, and the operand goes through the
16398 * algorithm above, without the fence.
16400 * A separate stack is kept of the fence positions, so that the position of
16401 * the latest so-far unbalanced '(' is at the top of it.
16403 * The ']' ending the construct is treated as the lowest operator of all,
16404 * so that everything gets evaluated down to a single operand, which is the
16407 sv_2mortal((SV *)(stack = newAV()));
16408 sv_2mortal((SV *)(fence_stack = newAV()));
16410 while (RExC_parse < RExC_end) {
16411 I32 top_index; /* Index of top-most element in 'stack' */
16412 SV** top_ptr; /* Pointer to top 'stack' element */
16413 SV* current = NULL; /* To contain the current inversion list
16415 SV* only_to_avoid_leaks;
16417 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16418 TRUE /* Force /x */ );
16419 if (RExC_parse >= RExC_end) { /* Fail */
16423 curchar = UCHARAT(RExC_parse);
16427 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16428 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16429 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16430 stack, fence, fence_stack));
16433 top_index = av_tindex_skip_len_mg(stack);
16436 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16437 char stacked_operator; /* The topmost operator on the 'stack'. */
16438 SV* lhs; /* Operand to the left of the operator */
16439 SV* rhs; /* Operand to the right of the operator */
16440 SV* fence_ptr; /* Pointer to top element of the fence
16444 if ( RExC_parse < RExC_end - 2
16445 && UCHARAT(RExC_parse + 1) == '?'
16446 && UCHARAT(RExC_parse + 2) == '^')
16448 const regnode_offset orig_emit = RExC_emit;
16449 SV * resultant_invlist;
16451 /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16452 * This happens when we have some thing like
16454 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16456 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16458 * Here we would be handling the interpolated
16459 * '$thai_or_lao'. We handle this by a recursive call to
16460 * reg which returns the inversion list the
16461 * interpolated expression evaluates to. Actually, the
16462 * return is a special regnode containing a pointer to that
16463 * inversion list. If the return isn't that regnode alone,
16464 * we know that this wasn't such an interpolation, which is
16465 * an error: we need to get a single inversion list back
16466 * from the recursion */
16471 node = reg(pRExC_state, 2, flagp, depth+1);
16472 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16474 if ( OP(REGNODE_p(node)) != REGEX_SET
16475 /* If more than a single node returned, the nested
16476 * parens evaluated to more than just a (?[...]),
16477 * which isn't legal */
16478 || RExC_emit != orig_emit
16479 + NODE_STEP_REGNODE
16480 + regarglen[REGEX_SET])
16482 vFAIL("Expecting interpolated extended charclass");
16484 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16485 current = invlist_clone(resultant_invlist, NULL);
16486 SvREFCNT_dec(resultant_invlist);
16489 RExC_emit = orig_emit;
16490 goto handle_operand;
16493 /* A regular '('. Look behind for illegal syntax */
16494 if (top_index - fence >= 0) {
16495 /* If the top entry on the stack is an operator, it had
16496 * better be a '!', otherwise the entry below the top
16497 * operand should be an operator */
16498 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16499 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16500 || ( IS_OPERAND(*top_ptr)
16501 && ( top_index - fence < 1
16502 || ! (stacked_ptr = av_fetch(stack,
16505 || ! IS_OPERATOR(*stacked_ptr))))
16508 vFAIL("Unexpected '(' with no preceding operator");
16512 /* Stack the position of this undealt-with left paren */
16513 av_push(fence_stack, newSViv(fence));
16514 fence = top_index + 1;
16518 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16519 * multi-char folds are allowed. */
16520 if (!regclass(pRExC_state, flagp, depth+1,
16521 TRUE, /* means parse just the next thing */
16522 FALSE, /* don't allow multi-char folds */
16523 FALSE, /* don't silence non-portable warnings. */
16525 FALSE, /* Require return to be an ANYOF */
16528 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16529 goto regclass_failed;
16534 /* regclass() will return with parsing just the \ sequence,
16535 * leaving the parse pointer at the next thing to parse */
16537 goto handle_operand;
16539 case '[': /* Is a bracketed character class */
16541 /* See if this is a [:posix:] class. */
16542 bool is_posix_class = (OOB_NAMEDCLASS
16543 < handle_possible_posix(pRExC_state,
16547 TRUE /* checking only */));
16548 /* If it is a posix class, leave the parse pointer at the '['
16549 * to fool regclass() into thinking it is part of a
16550 * '[[:posix:]]'. */
16551 if (! is_posix_class) {
16555 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16556 * multi-char folds are allowed. */
16557 if (!regclass(pRExC_state, flagp, depth+1,
16558 is_posix_class, /* parse the whole char
16559 class only if not a
16561 FALSE, /* don't allow multi-char folds */
16562 TRUE, /* silence non-portable warnings. */
16564 FALSE, /* Require return to be an ANYOF */
16567 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16568 goto regclass_failed;
16573 /* function call leaves parse pointing to the ']', except if we
16575 if (is_posix_class) {
16579 goto handle_operand;
16583 if (top_index >= 1) {
16584 goto join_operators;
16587 /* Only a single operand on the stack: are done */
16591 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16592 if (UCHARAT(RExC_parse - 1) == ']') {
16596 vFAIL("Unexpected ')'");
16599 /* If nothing after the fence, is missing an operand */
16600 if (top_index - fence < 0) {
16604 /* If at least two things on the stack, treat this as an
16606 if (top_index - fence >= 1) {
16607 goto join_operators;
16610 /* Here only a single thing on the fenced stack, and there is a
16611 * fence. Get rid of it */
16612 fence_ptr = av_pop(fence_stack);
16614 fence = SvIV(fence_ptr);
16615 SvREFCNT_dec_NN(fence_ptr);
16622 /* Having gotten rid of the fence, we pop the operand at the
16623 * stack top and process it as a newly encountered operand */
16624 current = av_pop(stack);
16625 if (IS_OPERAND(current)) {
16626 goto handle_operand;
16638 /* These binary operators should have a left operand already
16640 if ( top_index - fence < 0
16641 || top_index - fence == 1
16642 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16643 || ! IS_OPERAND(*top_ptr))
16645 goto unexpected_binary;
16648 /* If only the one operand is on the part of the stack visible
16649 * to us, we just place this operator in the proper position */
16650 if (top_index - fence < 2) {
16652 /* Place the operator before the operand */
16654 SV* lhs = av_pop(stack);
16655 av_push(stack, newSVuv(curchar));
16656 av_push(stack, lhs);
16660 /* But if there is something else on the stack, we need to
16661 * process it before this new operator if and only if the
16662 * stacked operation has equal or higher precedence than the
16667 /* The operator on the stack is supposed to be below both its
16669 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16670 || IS_OPERAND(*stacked_ptr))
16672 /* But if not, it's legal and indicates we are completely
16673 * done if and only if we're currently processing a ']',
16674 * which should be the final thing in the expression */
16675 if (curchar == ']') {
16681 vFAIL2("Unexpected binary operator '%c' with no "
16682 "preceding operand", curchar);
16684 stacked_operator = (char) SvUV(*stacked_ptr);
16686 if (regex_set_precedence(curchar)
16687 > regex_set_precedence(stacked_operator))
16689 /* Here, the new operator has higher precedence than the
16690 * stacked one. This means we need to add the new one to
16691 * the stack to await its rhs operand (and maybe more
16692 * stuff). We put it before the lhs operand, leaving
16693 * untouched the stacked operator and everything below it
16695 lhs = av_pop(stack);
16696 assert(IS_OPERAND(lhs));
16698 av_push(stack, newSVuv(curchar));
16699 av_push(stack, lhs);
16703 /* Here, the new operator has equal or lower precedence than
16704 * what's already there. This means the operation already
16705 * there should be performed now, before the new one. */
16707 rhs = av_pop(stack);
16708 if (! IS_OPERAND(rhs)) {
16710 /* This can happen when a ! is not followed by an operand,
16711 * like in /(?[\t &!])/ */
16715 lhs = av_pop(stack);
16717 if (! IS_OPERAND(lhs)) {
16719 /* This can happen when there is an empty (), like in
16720 * /(?[[0]+()+])/ */
16724 switch (stacked_operator) {
16726 _invlist_intersection(lhs, rhs, &rhs);
16731 _invlist_union(lhs, rhs, &rhs);
16735 _invlist_subtract(lhs, rhs, &rhs);
16738 case '^': /* The union minus the intersection */
16743 _invlist_union(lhs, rhs, &u);
16744 _invlist_intersection(lhs, rhs, &i);
16745 _invlist_subtract(u, i, &rhs);
16746 SvREFCNT_dec_NN(i);
16747 SvREFCNT_dec_NN(u);
16753 /* Here, the higher precedence operation has been done, and the
16754 * result is in 'rhs'. We overwrite the stacked operator with
16755 * the result. Then we redo this code to either push the new
16756 * operator onto the stack or perform any higher precedence
16757 * stacked operation */
16758 only_to_avoid_leaks = av_pop(stack);
16759 SvREFCNT_dec(only_to_avoid_leaks);
16760 av_push(stack, rhs);
16763 case '!': /* Highest priority, right associative */
16765 /* If what's already at the top of the stack is another '!",
16766 * they just cancel each other out */
16767 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16768 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16770 only_to_avoid_leaks = av_pop(stack);
16771 SvREFCNT_dec(only_to_avoid_leaks);
16773 else { /* Otherwise, since it's right associative, just push
16775 av_push(stack, newSVuv(curchar));
16780 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16781 if (RExC_parse >= RExC_end) {
16784 vFAIL("Unexpected character");
16788 /* Here 'current' is the operand. If something is already on the
16789 * stack, we have to check if it is a !. But first, the code above
16790 * may have altered the stack in the time since we earlier set
16793 top_index = av_tindex_skip_len_mg(stack);
16794 if (top_index - fence >= 0) {
16795 /* If the top entry on the stack is an operator, it had better
16796 * be a '!', otherwise the entry below the top operand should
16797 * be an operator */
16798 top_ptr = av_fetch(stack, top_index, FALSE);
16800 if (IS_OPERATOR(*top_ptr)) {
16802 /* The only permissible operator at the top of the stack is
16803 * '!', which is applied immediately to this operand. */
16804 curchar = (char) SvUV(*top_ptr);
16805 if (curchar != '!') {
16806 SvREFCNT_dec(current);
16807 vFAIL2("Unexpected binary operator '%c' with no "
16808 "preceding operand", curchar);
16811 _invlist_invert(current);
16813 only_to_avoid_leaks = av_pop(stack);
16814 SvREFCNT_dec(only_to_avoid_leaks);
16816 /* And we redo with the inverted operand. This allows
16817 * handling multiple ! in a row */
16818 goto handle_operand;
16820 /* Single operand is ok only for the non-binary ')'
16822 else if ((top_index - fence == 0 && curchar != ')')
16823 || (top_index - fence > 0
16824 && (! (stacked_ptr = av_fetch(stack,
16827 || IS_OPERAND(*stacked_ptr))))
16829 SvREFCNT_dec(current);
16830 vFAIL("Operand with no preceding operator");
16834 /* Here there was nothing on the stack or the top element was
16835 * another operand. Just add this new one */
16836 av_push(stack, current);
16838 } /* End of switch on next parse token */
16840 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16841 } /* End of loop parsing through the construct */
16843 vFAIL("Syntax error in (?[...])");
16847 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16848 if (RExC_parse < RExC_end) {
16852 vFAIL("Unexpected ']' with no following ')' in (?[...");
16855 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16856 vFAIL("Unmatched (");
16859 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16860 || ((final = av_pop(stack)) == NULL)
16861 || ! IS_OPERAND(final)
16862 || ! is_invlist(final)
16863 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16866 SvREFCNT_dec(final);
16867 vFAIL("Incomplete expression within '(?[ ])'");
16870 /* Here, 'final' is the resultant inversion list from evaluating the
16871 * expression. Return it if so requested */
16872 if (return_invlist) {
16873 *return_invlist = final;
16877 if (RExC_sets_depth) { /* If within a recursive call, return in a special
16880 node = regpnode(pRExC_state, REGEX_SET, final);
16884 /* Otherwise generate a resultant node, based on 'final'. regclass()
16885 * is expecting a string of ranges and individual code points */
16886 invlist_iterinit(final);
16887 result_string = newSVpvs("");
16888 while (invlist_iternext(final, &start, &end)) {
16889 if (start == end) {
16890 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16893 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16894 UVXf "}", start, end);
16898 /* About to generate an ANYOF (or similar) node from the inversion list
16899 * we have calculated */
16900 save_parse = RExC_parse;
16901 RExC_parse = SvPV(result_string, len);
16902 save_end = RExC_end;
16903 RExC_end = RExC_parse + len;
16904 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16906 /* We turn off folding around the call, as the class we have
16907 * constructed already has all folding taken into consideration, and we
16908 * don't want regclass() to add to that */
16909 RExC_flags &= ~RXf_PMf_FOLD;
16910 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16911 * folds are allowed. */
16912 node = regclass(pRExC_state, flagp, depth+1,
16913 FALSE, /* means parse the whole char class */
16914 FALSE, /* don't allow multi-char folds */
16915 TRUE, /* silence non-portable warnings. The above may
16916 very well have generated non-portable code
16917 points, but they're valid on this machine */
16918 FALSE, /* similarly, no need for strict */
16920 /* We can optimize into something besides an ANYOF,
16921 * except under /l, which needs to be ANYOF because of
16922 * runtime checks for locale sanity, etc */
16928 RExC_parse = save_parse + 1;
16929 RExC_end = save_end;
16930 SvREFCNT_dec_NN(final);
16931 SvREFCNT_dec_NN(result_string);
16934 RExC_flags |= RXf_PMf_FOLD;
16938 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16939 goto regclass_failed;
16942 /* Fix up the node type if we are in locale. (We have pretended we are
16943 * under /u for the purposes of regclass(), as this construct will only
16944 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
16945 * (so as to cause any warnings about bad locales to be output in
16946 * regexec.c), and add the flag that indicates to check if not in a
16947 * UTF-8 locale. The reason we above forbid optimization into
16948 * something other than an ANYOF node is simply to minimize the number
16949 * of code changes in regexec.c. Otherwise we would have to create new
16950 * EXACTish node types and deal with them. This decision could be
16951 * revisited should this construct become popular.
16953 * (One might think we could look at the resulting ANYOF node and
16954 * suppress the flag if everything is above 255, as those would be
16955 * UTF-8 only, but this isn't true, as the components that led to that
16956 * result could have been locale-affected, and just happen to cancel
16957 * each other out under UTF-8 locales.) */
16959 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16961 assert(OP(REGNODE_p(node)) == ANYOF);
16963 OP(REGNODE_p(node)) = ANYOFL;
16964 ANYOF_FLAGS(REGNODE_p(node))
16965 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16969 nextchar(pRExC_state);
16970 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16974 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16978 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16981 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16982 AV * stack, const IV fence, AV * fence_stack)
16983 { /* Dumps the stacks in handle_regex_sets() */
16985 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16986 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16989 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16991 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16993 if (stack_top < 0) {
16994 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16997 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16998 for (i = stack_top; i >= 0; i--) {
16999 SV ** element_ptr = av_fetch(stack, i, FALSE);
17000 if (! element_ptr) {
17003 if (IS_OPERATOR(*element_ptr)) {
17004 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17005 (int) i, (int) SvIV(*element_ptr));
17008 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17009 sv_dump(*element_ptr);
17014 if (fence_stack_top < 0) {
17015 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17018 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17019 for (i = fence_stack_top; i >= 0; i--) {
17020 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17021 if (! element_ptr) {
17024 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17025 (int) i, (int) SvIV(*element_ptr));
17036 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17038 /* This adds the Latin1/above-Latin1 folding rules.
17040 * This should be called only for a Latin1-range code points, cp, which is
17041 * known to be involved in a simple fold with other code points above
17042 * Latin1. It would give false results if /aa has been specified.
17043 * Multi-char folds are outside the scope of this, and must be handled
17046 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17048 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17050 /* The rules that are valid for all Unicode versions are hard-coded in */
17055 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17059 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17062 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17063 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17065 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17066 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17067 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17069 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17070 *invlist = add_cp_to_invlist(*invlist,
17071 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17074 default: /* Other code points are checked against the data for the
17075 current Unicode version */
17077 Size_t folds_count;
17079 const U32 * remaining_folds;
17083 folded_cp = toFOLD(cp);
17086 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17088 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17091 if (folded_cp > 255) {
17092 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17095 folds_count = _inverse_folds(folded_cp, &first_fold,
17097 if (folds_count == 0) {
17099 /* Use deprecated warning to increase the chances of this being
17101 ckWARN2reg_d(RExC_parse,
17102 "Perl folding rules are not up-to-date for 0x%02X;"
17103 " please use the perlbug utility to report;", cp);
17108 if (first_fold > 255) {
17109 *invlist = add_cp_to_invlist(*invlist, first_fold);
17111 for (i = 0; i < folds_count - 1; i++) {
17112 if (remaining_folds[i] > 255) {
17113 *invlist = add_cp_to_invlist(*invlist,
17114 remaining_folds[i]);
17124 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17126 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17130 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17132 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17134 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17135 CLEAR_POSIX_WARNINGS();
17139 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17140 if (first_is_fatal) { /* Avoid leaking this */
17141 av_undef(posix_warnings); /* This isn't necessary if the
17142 array is mortal, but is a
17144 (void) sv_2mortal(msg);
17147 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17148 SvREFCNT_dec_NN(msg);
17151 UPDATE_WARNINGS_LOC(RExC_parse);
17154 PERL_STATIC_INLINE Size_t
17155 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17157 const U8 * const start = s1;
17158 const U8 * const send = start + max;
17160 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17162 while (s1 < send && *s1 == *s2) {
17171 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17173 /* This adds the string scalar <multi_string> to the array
17174 * <multi_char_matches>. <multi_string> is known to have exactly
17175 * <cp_count> code points in it. This is used when constructing a
17176 * bracketed character class and we find something that needs to match more
17177 * than a single character.
17179 * <multi_char_matches> is actually an array of arrays. Each top-level
17180 * element is an array that contains all the strings known so far that are
17181 * the same length. And that length (in number of code points) is the same
17182 * as the index of the top-level array. Hence, the [2] element is an
17183 * array, each element thereof is a string containing TWO code points;
17184 * while element [3] is for strings of THREE characters, and so on. Since
17185 * this is for multi-char strings there can never be a [0] nor [1] element.
17187 * When we rewrite the character class below, we will do so such that the
17188 * longest strings are written first, so that it prefers the longest
17189 * matching strings first. This is done even if it turns out that any
17190 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17191 * Christiansen has agreed that this is ok. This makes the test for the
17192 * ligature 'ffi' come before the test for 'ff', for example */
17195 AV** this_array_ptr;
17197 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17199 if (! multi_char_matches) {
17200 multi_char_matches = newAV();
17203 if (av_exists(multi_char_matches, cp_count)) {
17204 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17205 this_array = *this_array_ptr;
17208 this_array = newAV();
17209 av_store(multi_char_matches, cp_count,
17212 av_push(this_array, multi_string);
17214 return multi_char_matches;
17217 /* The names of properties whose definitions are not known at compile time are
17218 * stored in this SV, after a constant heading. So if the length has been
17219 * changed since initialization, then there is a run-time definition. */
17220 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17221 (SvCUR(listsv) != initial_listsv_len)
17223 /* There is a restricted set of white space characters that are legal when
17224 * ignoring white space in a bracketed character class. This generates the
17225 * code to skip them.
17227 * There is a line below that uses the same white space criteria but is outside
17228 * this macro. Both here and there must use the same definition */
17229 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
17232 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
17239 STATIC regnode_offset
17240 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17241 const bool stop_at_1, /* Just parse the next thing, don't
17242 look for a full character class */
17243 bool allow_mutiple_chars,
17244 const bool silence_non_portable, /* Don't output warnings
17248 bool optimizable, /* ? Allow a non-ANYOF return
17250 SV** ret_invlist /* Return an inversion list, not a node */
17253 /* parse a bracketed class specification. Most of these will produce an
17254 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17255 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17256 * under /i with multi-character folds: it will be rewritten following the
17257 * paradigm of this example, where the <multi-fold>s are characters which
17258 * fold to multiple character sequences:
17259 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17260 * gets effectively rewritten as:
17261 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17262 * reg() gets called (recursively) on the rewritten version, and this
17263 * function will return what it constructs. (Actually the <multi-fold>s
17264 * aren't physically removed from the [abcdefghi], it's just that they are
17265 * ignored in the recursion by means of a flag:
17266 * <RExC_in_multi_char_class>.)
17268 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17269 * characters, with the corresponding bit set if that character is in the
17270 * list. For characters above this, an inversion list is used. There
17271 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17272 * determinable at compile time
17274 * On success, returns the offset at which any next node should be placed
17275 * into the regex engine program being compiled.
17277 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17278 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17282 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17284 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17285 regnode_offset ret = -1; /* Initialized to an illegal value */
17287 int namedclass = OOB_NAMEDCLASS;
17288 char *rangebegin = NULL;
17289 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17290 aren't available at the time this was called */
17291 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17292 than just initialized. */
17293 SV* properties = NULL; /* Code points that match \p{} \P{} */
17294 SV* posixes = NULL; /* Code points that match classes like [:word:],
17295 extended beyond the Latin1 range. These have to
17296 be kept separate from other code points for much
17297 of this function because their handling is
17298 different under /i, and for most classes under
17300 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17301 separate for a while from the non-complemented
17302 versions because of complications with /d
17304 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17305 treated more simply than the general case,
17306 leading to less compilation and execution
17308 UV element_count = 0; /* Number of distinct elements in the class.
17309 Optimizations may be possible if this is tiny */
17310 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17311 character; used under /i */
17313 char * stop_ptr = RExC_end; /* where to stop parsing */
17315 /* ignore unescaped whitespace? */
17316 const bool skip_white = cBOOL( ret_invlist
17317 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17319 /* inversion list of code points this node matches only when the target
17320 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17322 SV* upper_latin1_only_utf8_matches = NULL;
17324 /* Inversion list of code points this node matches regardless of things
17325 * like locale, folding, utf8ness of the target string */
17326 SV* cp_list = NULL;
17328 /* Like cp_list, but code points on this list need to be checked for things
17329 * that fold to/from them under /i */
17330 SV* cp_foldable_list = NULL;
17332 /* Like cp_list, but code points on this list are valid only when the
17333 * runtime locale is UTF-8 */
17334 SV* only_utf8_locale_list = NULL;
17336 /* In a range, if one of the endpoints is non-character-set portable,
17337 * meaning that it hard-codes a code point that may mean a different
17338 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17339 * mnemonic '\t' which each mean the same character no matter which
17340 * character set the platform is on. */
17341 unsigned int non_portable_endpoint = 0;
17343 /* Is the range unicode? which means on a platform that isn't 1-1 native
17344 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17345 * to be a Unicode value. */
17346 bool unicode_range = FALSE;
17347 bool invert = FALSE; /* Is this class to be complemented */
17349 bool warn_super = ALWAYS_WARN_SUPER;
17351 const char * orig_parse = RExC_parse;
17353 /* This variable is used to mark where the end in the input is of something
17354 * that looks like a POSIX construct but isn't. During the parse, when
17355 * something looks like it could be such a construct is encountered, it is
17356 * checked for being one, but not if we've already checked this area of the
17357 * input. Only after this position is reached do we check again */
17358 char *not_posix_region_end = RExC_parse - 1;
17360 AV* posix_warnings = NULL;
17361 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17362 U8 op = END; /* The returned node-type, initialized to an impossible
17364 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17365 U32 posixl = 0; /* bit field of posix classes matched under /l */
17368 /* Flags as to what things aren't knowable until runtime. (Note that these are
17369 * mutually exclusive.) */
17370 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17371 haven't been defined as of yet */
17372 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17374 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17375 what gets folded */
17376 U32 has_runtime_dependency = 0; /* OR of the above flags */
17378 DECLARE_AND_GET_RE_DEBUG_FLAGS;
17380 PERL_ARGS_ASSERT_REGCLASS;
17382 PERL_UNUSED_ARG(depth);
17385 assert(! (ret_invlist && allow_mutiple_chars));
17387 /* If wants an inversion list returned, we can't optimize to something
17390 optimizable = FALSE;
17393 DEBUG_PARSE("clas");
17395 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17396 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17397 && UNICODE_DOT_DOT_VERSION == 0)
17398 allow_mutiple_chars = FALSE;
17401 /* We include the /i status at the beginning of this so that we can
17402 * know it at runtime */
17403 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17404 initial_listsv_len = SvCUR(listsv);
17405 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17407 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17409 assert(RExC_parse <= RExC_end);
17411 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17414 allow_mutiple_chars = FALSE;
17416 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17419 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17420 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17421 int maybe_class = handle_possible_posix(pRExC_state,
17423 ¬_posix_region_end,
17425 TRUE /* checking only */);
17426 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17427 ckWARN4reg(not_posix_region_end,
17428 "POSIX syntax [%c %c] belongs inside character classes%s",
17429 *RExC_parse, *RExC_parse,
17430 (maybe_class == OOB_NAMEDCLASS)
17431 ? ((POSIXCC_NOTYET(*RExC_parse))
17432 ? " (but this one isn't implemented)"
17433 : " (but this one isn't fully valid)")
17439 /* If the caller wants us to just parse a single element, accomplish this
17440 * by faking the loop ending condition */
17441 if (stop_at_1 && RExC_end > RExC_parse) {
17442 stop_ptr = RExC_parse + 1;
17445 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17446 if (UCHARAT(RExC_parse) == ']')
17447 goto charclassloop;
17451 if ( posix_warnings
17452 && av_tindex_skip_len_mg(posix_warnings) >= 0
17453 && RExC_parse > not_posix_region_end)
17455 /* Warnings about posix class issues are considered tentative until
17456 * we are far enough along in the parse that we can no longer
17457 * change our mind, at which point we output them. This is done
17458 * each time through the loop so that a later class won't zap them
17459 * before they have been dealt with. */
17460 output_posix_warnings(pRExC_state, posix_warnings);
17463 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17465 if (RExC_parse >= stop_ptr) {
17469 if (UCHARAT(RExC_parse) == ']') {
17475 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17476 save_value = value;
17477 save_prevvalue = prevvalue;
17480 rangebegin = RExC_parse;
17482 non_portable_endpoint = 0;
17484 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17485 value = utf8n_to_uvchr((U8*)RExC_parse,
17486 RExC_end - RExC_parse,
17487 &numlen, UTF8_ALLOW_DEFAULT);
17488 RExC_parse += numlen;
17491 value = UCHARAT(RExC_parse++);
17493 if (value == '[') {
17494 char * posix_class_end;
17495 namedclass = handle_possible_posix(pRExC_state,
17498 do_posix_warnings ? &posix_warnings : NULL,
17499 FALSE /* die if error */);
17500 if (namedclass > OOB_NAMEDCLASS) {
17502 /* If there was an earlier attempt to parse this particular
17503 * posix class, and it failed, it was a false alarm, as this
17504 * successful one proves */
17505 if ( posix_warnings
17506 && av_tindex_skip_len_mg(posix_warnings) >= 0
17507 && not_posix_region_end >= RExC_parse
17508 && not_posix_region_end <= posix_class_end)
17510 av_undef(posix_warnings);
17513 RExC_parse = posix_class_end;
17515 else if (namedclass == OOB_NAMEDCLASS) {
17516 not_posix_region_end = posix_class_end;
17519 namedclass = OOB_NAMEDCLASS;
17522 else if ( RExC_parse - 1 > not_posix_region_end
17523 && MAYBE_POSIXCC(value))
17525 (void) handle_possible_posix(
17527 RExC_parse - 1, /* -1 because parse has already been
17529 ¬_posix_region_end,
17530 do_posix_warnings ? &posix_warnings : NULL,
17531 TRUE /* checking only */);
17533 else if ( strict && ! skip_white
17534 && ( _generic_isCC(value, _CC_VERTSPACE)
17535 || is_VERTWS_cp_high(value)))
17537 vFAIL("Literal vertical space in [] is illegal except under /x");
17539 else if (value == '\\') {
17540 /* Is a backslash; get the code point of the char after it */
17542 if (RExC_parse >= RExC_end) {
17543 vFAIL("Unmatched [");
17546 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17547 value = utf8n_to_uvchr((U8*)RExC_parse,
17548 RExC_end - RExC_parse,
17549 &numlen, UTF8_ALLOW_DEFAULT);
17550 RExC_parse += numlen;
17553 value = UCHARAT(RExC_parse++);
17555 /* Some compilers cannot handle switching on 64-bit integer
17556 * values, therefore value cannot be an UV. Yes, this will
17557 * be a problem later if we want switch on Unicode.
17558 * A similar issue a little bit later when switching on
17559 * namedclass. --jhi */
17561 /* If the \ is escaping white space when white space is being
17562 * skipped, it means that that white space is wanted literally, and
17563 * is already in 'value'. Otherwise, need to translate the escape
17564 * into what it signifies. */
17565 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17566 const char * message;
17570 case 'w': namedclass = ANYOF_WORDCHAR; break;
17571 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17572 case 's': namedclass = ANYOF_SPACE; break;
17573 case 'S': namedclass = ANYOF_NSPACE; break;
17574 case 'd': namedclass = ANYOF_DIGIT; break;
17575 case 'D': namedclass = ANYOF_NDIGIT; break;
17576 case 'v': namedclass = ANYOF_VERTWS; break;
17577 case 'V': namedclass = ANYOF_NVERTWS; break;
17578 case 'h': namedclass = ANYOF_HORIZWS; break;
17579 case 'H': namedclass = ANYOF_NHORIZWS; break;
17580 case 'N': /* Handle \N{NAME} in class */
17582 const char * const backslash_N_beg = RExC_parse - 2;
17585 if (! grok_bslash_N(pRExC_state,
17586 NULL, /* No regnode */
17587 &value, /* Yes single value */
17588 &cp_count, /* Multiple code pt count */
17594 if (*flagp & NEED_UTF8)
17595 FAIL("panic: grok_bslash_N set NEED_UTF8");
17597 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17599 if (cp_count < 0) {
17600 vFAIL("\\N in a character class must be a named character: \\N{...}");
17602 else if (cp_count == 0) {
17603 ckWARNreg(RExC_parse,
17604 "Ignoring zero length \\N{} in character class");
17606 else { /* cp_count > 1 */
17607 assert(cp_count > 1);
17608 if (! RExC_in_multi_char_class) {
17609 if ( ! allow_mutiple_chars
17612 || *RExC_parse == '-')
17616 vFAIL("\\N{} here is restricted to one character");
17618 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17619 break; /* <value> contains the first code
17620 point. Drop out of the switch to
17624 SV * multi_char_N = newSVpvn(backslash_N_beg,
17625 RExC_parse - backslash_N_beg);
17627 = add_multi_match(multi_char_matches,
17632 } /* End of cp_count != 1 */
17634 /* This element should not be processed further in this
17637 value = save_value;
17638 prevvalue = save_prevvalue;
17639 continue; /* Back to top of loop to get next char */
17642 /* Here, is a single code point, and <value> contains it */
17643 unicode_range = TRUE; /* \N{} are Unicode */
17651 if (RExC_pm_flags & PMf_WILDCARD) {
17653 /* diag_listed_as: Use of %s is not allowed in Unicode
17654 property wildcard subpatterns in regex; marked by <--
17656 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17657 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17660 /* \p means they want Unicode semantics */
17661 REQUIRE_UNI_RULES(flagp, 0);
17663 if (RExC_parse >= RExC_end)
17664 vFAIL2("Empty \\%c", (U8)value);
17665 if (*RExC_parse == '{') {
17666 const U8 c = (U8)value;
17667 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17670 vFAIL2("Missing right brace on \\%c{}", c);
17675 /* White space is allowed adjacent to the braces and after
17676 * any '^', even when not under /x */
17677 while (isSPACE(*RExC_parse)) {
17681 if (UCHARAT(RExC_parse) == '^') {
17683 /* toggle. (The rhs xor gets the single bit that
17684 * differs between P and p; the other xor inverts just
17686 value ^= 'P' ^ 'p';
17689 while (isSPACE(*RExC_parse)) {
17694 if (e == RExC_parse)
17695 vFAIL2("Empty \\%c{}", c);
17697 n = e - RExC_parse;
17698 while (isSPACE(*(RExC_parse + n - 1)))
17701 } /* The \p isn't immediately followed by a '{' */
17702 else if (! isALPHA(*RExC_parse)) {
17703 RExC_parse += (UTF)
17704 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17706 vFAIL2("Character following \\%c must be '{' or a "
17707 "single-character Unicode property name",
17715 char* name = RExC_parse;
17717 /* Any message returned about expanding the definition */
17718 SV* msg = newSVpvs_flags("", SVs_TEMP);
17720 /* If set TRUE, the property is user-defined as opposed to
17721 * official Unicode */
17722 bool user_defined = FALSE;
17723 AV * strings = NULL;
17725 SV * prop_definition = parse_uniprop_string(
17726 name, n, UTF, FOLD,
17727 FALSE, /* This is compile-time */
17729 /* We can't defer this defn when
17730 * the full result is required in
17732 ! cBOOL(ret_invlist),
17739 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17740 assert(prop_definition == NULL);
17741 RExC_parse = e + 1;
17742 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17743 thing so, or else the display is
17747 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17748 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17749 SvCUR(msg), SvPVX(msg)));
17752 assert(prop_definition || strings);
17756 if (! prop_definition) {
17757 RExC_parse = e + 1;
17758 vFAIL("Unicode string properties are not implemented in (?[...])");
17762 "Using just the single character results"
17763 " returned by \\p{} in (?[...])");
17766 else if (! RExC_in_multi_char_class) {
17767 if (invert ^ (value == 'P')) {
17768 RExC_parse = e + 1;
17769 vFAIL("Inverting a character class which contains"
17770 " a multi-character sequence is illegal");
17773 /* For each multi-character string ... */
17774 while (av_count(strings) > 0) {
17775 /* ... Each entry is itself an array of code
17777 AV * this_string = (AV *) av_shift( strings);
17778 STRLEN cp_count = av_count(this_string);
17779 SV * final = newSV(cp_count * 4);
17782 /* Create another string of sequences of \x{...} */
17783 while (av_count(this_string) > 0) {
17784 SV * character = av_shift(this_string);
17785 UV cp = SvUV(character);
17788 REQUIRE_UTF8(flagp);
17790 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17792 SvREFCNT_dec_NN(character);
17794 SvREFCNT_dec_NN(this_string);
17796 /* And add that to the list of such things */
17798 = add_multi_match(multi_char_matches,
17803 SvREFCNT_dec_NN(strings);
17806 if (! prop_definition) { /* If we got only a string,
17807 this iteration didn't really
17808 find a character */
17811 else if (! is_invlist(prop_definition)) {
17813 /* Here, the definition isn't known, so we have gotten
17814 * returned a string that will be evaluated if and when
17815 * encountered at runtime. We add it to the list of
17816 * such properties, along with whether it should be
17817 * complemented or not */
17818 if (value == 'P') {
17819 sv_catpvs(listsv, "!");
17822 sv_catpvs(listsv, "+");
17824 sv_catsv(listsv, prop_definition);
17826 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17828 /* We don't know yet what this matches, so have to flag
17830 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17833 assert (prop_definition && is_invlist(prop_definition));
17835 /* Here we do have the complete property definition
17837 * Temporary workaround for [perl #133136]. For this
17838 * precise input that is in the .t that is failing,
17839 * load utf8.pm, which is what the test wants, so that
17840 * that .t passes */
17841 if ( memEQs(RExC_start, e + 1 - RExC_start,
17843 && ! hv_common(GvHVn(PL_incgv),
17845 "utf8.pm", sizeof("utf8.pm") - 1,
17846 0, HV_FETCH_ISEXISTS, NULL, 0))
17848 require_pv("utf8.pm");
17851 if (! user_defined &&
17852 /* We warn on matching an above-Unicode code point
17853 * if the match would return true, except don't
17854 * warn for \p{All}, which has exactly one element
17856 (_invlist_contains_cp(prop_definition, 0x110000)
17857 && (! (_invlist_len(prop_definition) == 1
17858 && *invlist_array(prop_definition) == 0))))
17863 /* Invert if asking for the complement */
17864 if (value == 'P') {
17865 _invlist_union_complement_2nd(properties,
17870 _invlist_union(properties, prop_definition, &properties);
17875 RExC_parse = e + 1;
17876 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17880 case 'n': value = '\n'; break;
17881 case 'r': value = '\r'; break;
17882 case 't': value = '\t'; break;
17883 case 'f': value = '\f'; break;
17884 case 'b': value = '\b'; break;
17885 case 'e': value = ESC_NATIVE; break;
17886 case 'a': value = '\a'; break;
17888 RExC_parse--; /* function expects to be pointed at the 'o' */
17889 if (! grok_bslash_o(&RExC_parse,
17895 cBOOL(range), /* MAX_UV allowed for range
17901 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17902 warn_non_literal_string(RExC_parse, packed_warn, message);
17906 non_portable_endpoint++;
17910 RExC_parse--; /* function expects to be pointed at the 'x' */
17911 if (! grok_bslash_x(&RExC_parse,
17917 cBOOL(range), /* MAX_UV allowed for range
17923 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17924 warn_non_literal_string(RExC_parse, packed_warn, message);
17928 non_portable_endpoint++;
17932 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17935 /* going to die anyway; point to exact spot of
17937 RExC_parse += (UTF)
17938 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17943 value = grok_c_char;
17945 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17946 warn_non_literal_string(RExC_parse, packed_warn, message);
17949 non_portable_endpoint++;
17951 case '0': case '1': case '2': case '3': case '4':
17952 case '5': case '6': case '7':
17954 /* Take 1-3 octal digits */
17955 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17956 | PERL_SCAN_NOTIFY_ILLDIGIT;
17957 numlen = (strict) ? 4 : 3;
17958 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17959 RExC_parse += numlen;
17962 RExC_parse += (UTF)
17963 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17965 vFAIL("Need exactly 3 octal digits");
17967 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17968 && RExC_parse < RExC_end
17969 && isDIGIT(*RExC_parse)
17970 && ckWARN(WARN_REGEXP))
17972 reg_warn_non_literal_string(
17974 form_alien_digit_msg(8, numlen, RExC_parse,
17975 RExC_end, UTF, FALSE));
17979 non_portable_endpoint++;
17984 /* Allow \_ to not give an error */
17985 if (isWORDCHAR(value) && value != '_') {
17987 vFAIL2("Unrecognized escape \\%c in character class",
17991 ckWARN2reg(RExC_parse,
17992 "Unrecognized escape \\%c in character class passed through",
17997 } /* End of switch on char following backslash */
17998 } /* end of handling backslash escape sequences */
18000 /* Here, we have the current token in 'value' */
18002 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18005 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
18006 * literal, as is the character that began the false range, i.e.
18007 * the 'a' in the examples */
18009 const int w = (RExC_parse >= rangebegin)
18010 ? RExC_parse - rangebegin
18014 "False [] range \"%" UTF8f "\"",
18015 UTF8fARG(UTF, w, rangebegin));
18018 ckWARN2reg(RExC_parse,
18019 "False [] range \"%" UTF8f "\"",
18020 UTF8fARG(UTF, w, rangebegin));
18021 cp_list = add_cp_to_invlist(cp_list, '-');
18022 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18026 range = 0; /* this was not a true range */
18027 element_count += 2; /* So counts for three values */
18030 classnum = namedclass_to_classnum(namedclass);
18032 if (LOC && namedclass < ANYOF_POSIXL_MAX
18033 #ifndef HAS_ISASCII
18034 && classnum != _CC_ASCII
18037 SV* scratch_list = NULL;
18039 /* What the Posix classes (like \w, [:space:]) match isn't
18040 * generally knowable under locale until actual match time. A
18041 * special node is used for these which has extra space for a
18042 * bitmap, with a bit reserved for each named class that is to
18043 * be matched against. (This isn't needed for \p{} and
18044 * pseudo-classes, as they are not affected by locale, and
18045 * hence are dealt with separately.) However, if a named class
18046 * and its complement are both present, then it matches
18047 * everything, and there is no runtime dependency. Odd numbers
18048 * are the complements of the next lower number, so xor works.
18049 * (Note that something like [\w\D] should match everything,
18050 * because \d should be a proper subset of \w. But rather than
18051 * trust that the locale is well behaved, we leave this to
18052 * runtime to sort out) */
18053 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18054 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18055 POSIXL_ZERO(posixl);
18056 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18057 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18058 continue; /* We could ignore the rest of the class, but
18059 best to parse it for any errors */
18061 else { /* Here, isn't the complement of any already parsed
18063 POSIXL_SET(posixl, namedclass);
18064 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18065 anyof_flags |= ANYOF_MATCHES_POSIXL;
18067 /* The above-Latin1 characters are not subject to locale
18068 * rules. Just add them to the unconditionally-matched
18071 /* Get the list of the above-Latin1 code points this
18073 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18074 PL_XPosix_ptrs[classnum],
18076 /* Odd numbers are complements,
18077 * like NDIGIT, NASCII, ... */
18078 namedclass % 2 != 0,
18080 /* Checking if 'cp_list' is NULL first saves an extra
18081 * clone. Its reference count will be decremented at the
18082 * next union, etc, or if this is the only instance, at the
18083 * end of the routine */
18085 cp_list = scratch_list;
18088 _invlist_union(cp_list, scratch_list, &cp_list);
18089 SvREFCNT_dec_NN(scratch_list);
18091 continue; /* Go get next character */
18096 /* Here, is not /l, or is a POSIX class for which /l doesn't
18097 * matter (or is a Unicode property, which is skipped here). */
18098 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18099 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18101 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18102 * nor /l make a difference in what these match,
18103 * therefore we just add what they match to cp_list. */
18104 if (classnum != _CC_VERTSPACE) {
18105 assert( namedclass == ANYOF_HORIZWS
18106 || namedclass == ANYOF_NHORIZWS);
18108 /* It turns out that \h is just a synonym for
18110 classnum = _CC_BLANK;
18113 _invlist_union_maybe_complement_2nd(
18115 PL_XPosix_ptrs[classnum],
18116 namedclass % 2 != 0, /* Complement if odd
18117 (NHORIZWS, NVERTWS)
18122 else if ( AT_LEAST_UNI_SEMANTICS
18123 || classnum == _CC_ASCII
18124 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
18125 || classnum == _CC_XDIGIT)))
18127 /* We usually have to worry about /d affecting what POSIX
18128 * classes match, with special code needed because we won't
18129 * know until runtime what all matches. But there is no
18130 * extra work needed under /u and /a; and [:ascii:] is
18131 * unaffected by /d; and :digit: and :xdigit: don't have
18132 * runtime differences under /d. So we can special case
18133 * these, and avoid some extra work below, and at runtime.
18135 _invlist_union_maybe_complement_2nd(
18137 ((AT_LEAST_ASCII_RESTRICTED)
18138 ? PL_Posix_ptrs[classnum]
18139 : PL_XPosix_ptrs[classnum]),
18140 namedclass % 2 != 0,
18143 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18144 complement and use nposixes */
18145 SV** posixes_ptr = namedclass % 2 == 0
18148 _invlist_union_maybe_complement_2nd(
18150 PL_XPosix_ptrs[classnum],
18151 namedclass % 2 != 0,
18155 } /* end of namedclass \blah */
18157 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18159 /* If 'range' is set, 'value' is the ending of a range--check its
18160 * validity. (If value isn't a single code point in the case of a
18161 * range, we should have figured that out above in the code that
18162 * catches false ranges). Later, we will handle each individual code
18163 * point in the range. If 'range' isn't set, this could be the
18164 * beginning of a range, so check for that by looking ahead to see if
18165 * the next real character to be processed is the range indicator--the
18170 /* For unicode ranges, we have to test that the Unicode as opposed
18171 * to the native values are not decreasing. (Above 255, there is
18172 * no difference between native and Unicode) */
18173 if (unicode_range && prevvalue < 255 && value < 255) {
18174 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18175 goto backwards_range;
18180 if (prevvalue > value) /* b-a */ {
18185 w = RExC_parse - rangebegin;
18187 "Invalid [] range \"%" UTF8f "\"",
18188 UTF8fARG(UTF, w, rangebegin));
18189 NOT_REACHED; /* NOTREACHED */
18193 prevvalue = value; /* save the beginning of the potential range */
18194 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18195 && *RExC_parse == '-')
18197 char* next_char_ptr = RExC_parse + 1;
18199 /* Get the next real char after the '-' */
18200 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18202 /* If the '-' is at the end of the class (just before the ']',
18203 * it is a literal minus; otherwise it is a range */
18204 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18205 RExC_parse = next_char_ptr;
18207 /* a bad range like \w-, [:word:]- ? */
18208 if (namedclass > OOB_NAMEDCLASS) {
18209 if (strict || ckWARN(WARN_REGEXP)) {
18210 const int w = RExC_parse >= rangebegin
18211 ? RExC_parse - rangebegin
18214 vFAIL4("False [] range \"%*.*s\"",
18219 "False [] range \"%*.*s\"",
18223 cp_list = add_cp_to_invlist(cp_list, '-');
18226 range = 1; /* yeah, it's a range! */
18227 continue; /* but do it the next time */
18232 if (namedclass > OOB_NAMEDCLASS) {
18236 /* Here, we have a single value this time through the loop, and
18237 * <prevvalue> is the beginning of the range, if any; or <value> if
18240 /* non-Latin1 code point implies unicode semantics. */
18242 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18243 || prevvalue > MAX_LEGAL_CP))
18245 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18247 REQUIRE_UNI_RULES(flagp, 0);
18248 if ( ! silence_non_portable
18249 && UNICODE_IS_PERL_EXTENDED(value)
18250 && TO_OUTPUT_WARNINGS(RExC_parse))
18252 ckWARN2_non_literal_string(RExC_parse,
18253 packWARN(WARN_PORTABLE),
18254 PL_extended_cp_format,
18259 /* Ready to process either the single value, or the completed range.
18260 * For single-valued non-inverted ranges, we consider the possibility
18261 * of multi-char folds. (We made a conscious decision to not do this
18262 * for the other cases because it can often lead to non-intuitive
18263 * results. For example, you have the peculiar case that:
18264 * "s s" =~ /^[^\xDF]+$/i => Y
18265 * "ss" =~ /^[^\xDF]+$/i => N
18267 * See [perl #89750] */
18268 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18269 if ( value == LATIN_SMALL_LETTER_SHARP_S
18270 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18273 /* Here <value> is indeed a multi-char fold. Get what it is */
18275 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18278 UV folded = _to_uni_fold_flags(
18282 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18283 ? FOLD_FLAGS_NOMIX_ASCII
18287 /* Here, <folded> should be the first character of the
18288 * multi-char fold of <value>, with <foldbuf> containing the
18289 * whole thing. But, if this fold is not allowed (because of
18290 * the flags), <fold> will be the same as <value>, and should
18291 * be processed like any other character, so skip the special
18293 if (folded != value) {
18295 /* Skip if we are recursed, currently parsing the class
18296 * again. Otherwise add this character to the list of
18297 * multi-char folds. */
18298 if (! RExC_in_multi_char_class) {
18299 STRLEN cp_count = utf8_length(foldbuf,
18300 foldbuf + foldlen);
18301 SV* multi_fold = sv_2mortal(newSVpvs(""));
18303 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18306 = add_multi_match(multi_char_matches,
18312 /* This element should not be processed further in this
18315 value = save_value;
18316 prevvalue = save_prevvalue;
18322 if (strict && ckWARN(WARN_REGEXP)) {
18325 /* If the range starts above 255, everything is portable and
18326 * likely to be so for any forseeable character set, so don't
18328 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18329 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18331 else if (prevvalue != value) {
18333 /* Under strict, ranges that stop and/or end in an ASCII
18334 * printable should have each end point be a portable value
18335 * for it (preferably like 'A', but we don't warn if it is
18336 * a (portable) Unicode name or code point), and the range
18337 * must be all digits or all letters of the same case.
18338 * Otherwise, the range is non-portable and unclear as to
18339 * what it contains */
18340 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18341 && ( non_portable_endpoint
18342 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18343 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18344 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18346 vWARN(RExC_parse, "Ranges of ASCII printables should"
18347 " be some subset of \"0-9\","
18348 " \"A-Z\", or \"a-z\"");
18350 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18351 SSize_t index_start;
18352 SSize_t index_final;
18354 /* But the nature of Unicode and languages mean we
18355 * can't do the same checks for above-ASCII ranges,
18356 * except in the case of digit ones. These should
18357 * contain only digits from the same group of 10. The
18358 * ASCII case is handled just above. Hence here, the
18359 * range could be a range of digits. First some
18360 * unlikely special cases. Grandfather in that a range
18361 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18362 * if its starting value is one of the 10 digits prior
18363 * to it. This is because it is an alternate way of
18364 * writing 19D1, and some people may expect it to be in
18365 * that group. But it is bad, because it won't give
18366 * the expected results. In Unicode 5.2 it was
18367 * considered to be in that group (of 11, hence), but
18368 * this was fixed in the next version */
18370 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18371 goto warn_bad_digit_range;
18373 else if (UNLIKELY( prevvalue >= 0x1D7CE
18374 && value <= 0x1D7FF))
18376 /* This is the only other case currently in Unicode
18377 * where the algorithm below fails. The code
18378 * points just above are the end points of a single
18379 * range containing only decimal digits. It is 5
18380 * different series of 0-9. All other ranges of
18381 * digits currently in Unicode are just a single
18382 * series. (And mktables will notify us if a later
18383 * Unicode version breaks this.)
18385 * If the range being checked is at most 9 long,
18386 * and the digit values represented are in
18387 * numerical order, they are from the same series.
18389 if ( value - prevvalue > 9
18390 || ((( value - 0x1D7CE) % 10)
18391 <= (prevvalue - 0x1D7CE) % 10))
18393 goto warn_bad_digit_range;
18398 /* For all other ranges of digits in Unicode, the
18399 * algorithm is just to check if both end points
18400 * are in the same series, which is the same range.
18402 index_start = _invlist_search(
18403 PL_XPosix_ptrs[_CC_DIGIT],
18406 /* Warn if the range starts and ends with a digit,
18407 * and they are not in the same group of 10. */
18408 if ( index_start >= 0
18409 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18411 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18412 value)) != index_start
18413 && index_final >= 0
18414 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18416 warn_bad_digit_range:
18417 vWARN(RExC_parse, "Ranges of digits should be"
18418 " from the same group of"
18425 if ((! range || prevvalue == value) && non_portable_endpoint) {
18426 if (isPRINT_A(value)) {
18429 if (isBACKSLASHED_PUNCT(value)) {
18430 literal[d++] = '\\';
18432 literal[d++] = (char) value;
18433 literal[d++] = '\0';
18436 "\"%.*s\" is more clearly written simply as \"%s\"",
18437 (int) (RExC_parse - rangebegin),
18442 else if (isMNEMONIC_CNTRL(value)) {
18444 "\"%.*s\" is more clearly written simply as \"%s\"",
18445 (int) (RExC_parse - rangebegin),
18447 cntrl_to_mnemonic((U8) value)
18453 /* Deal with this element of the class */
18456 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18459 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18460 * that don't require special handling, we can just add the range like
18461 * we do for ASCII platforms */
18462 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18463 || ! (prevvalue < 256
18465 || (! non_portable_endpoint
18466 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18467 || (isUPPER_A(prevvalue)
18468 && isUPPER_A(value)))))))
18470 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18474 /* Here, requires special handling. This can be because it is a
18475 * range whose code points are considered to be Unicode, and so
18476 * must be individually translated into native, or because its a
18477 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18478 * EBCDIC, but we have defined them to include only the "expected"
18479 * upper or lower case ASCII alphabetics. Subranges above 255 are
18480 * the same in native and Unicode, so can be added as a range */
18481 U8 start = NATIVE_TO_LATIN1(prevvalue);
18483 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18484 for (j = start; j <= end; j++) {
18485 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18488 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18494 range = 0; /* this range (if it was one) is done now */
18495 } /* End of loop through all the text within the brackets */
18497 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18498 output_posix_warnings(pRExC_state, posix_warnings);
18501 /* If anything in the class expands to more than one character, we have to
18502 * deal with them by building up a substitute parse string, and recursively
18503 * calling reg() on it, instead of proceeding */
18504 if (multi_char_matches) {
18505 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18508 char *save_end = RExC_end;
18509 char *save_parse = RExC_parse;
18510 char *save_start = RExC_start;
18511 Size_t constructed_prefix_len = 0; /* This gives the length of the
18512 constructed portion of the
18513 substitute parse. */
18514 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18519 /* Only one level of recursion allowed */
18520 assert(RExC_copy_start_in_constructed == RExC_precomp);
18522 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18523 because too confusing */
18525 sv_catpvs(substitute_parse, "(?:");
18529 /* Look at the longest strings first */
18530 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18535 if (av_exists(multi_char_matches, cp_count)) {
18536 AV** this_array_ptr;
18539 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18541 while ((this_sequence = av_pop(*this_array_ptr)) !=
18544 if (! first_time) {
18545 sv_catpvs(substitute_parse, "|");
18547 first_time = FALSE;
18549 sv_catpv(substitute_parse, SvPVX(this_sequence));
18554 /* If the character class contains anything else besides these
18555 * multi-character strings, have to include it in recursive parsing */
18556 if (element_count) {
18557 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18559 sv_catpvs(substitute_parse, "|");
18560 if (has_l_bracket) { /* Add an [ if the original had one */
18561 sv_catpvs(substitute_parse, "[");
18563 constructed_prefix_len = SvCUR(substitute_parse);
18564 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18566 /* Put in a closing ']' to match any opening one, but not if going
18567 * off the end, as otherwise we are adding something that really
18569 if (has_l_bracket && RExC_parse < RExC_end) {
18570 sv_catpvs(substitute_parse, "]");
18574 sv_catpvs(substitute_parse, ")");
18577 /* This is a way to get the parse to skip forward a whole named
18578 * sequence instead of matching the 2nd character when it fails the
18580 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18584 /* Set up the data structure so that any errors will be properly
18585 * reported. See the comments at the definition of
18586 * REPORT_LOCATION_ARGS for details */
18587 RExC_copy_start_in_input = (char *) orig_parse;
18588 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18589 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18590 RExC_end = RExC_parse + len;
18591 RExC_in_multi_char_class = 1;
18593 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18595 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18597 /* And restore so can parse the rest of the pattern */
18598 RExC_parse = save_parse;
18599 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18600 RExC_end = save_end;
18601 RExC_in_multi_char_class = 0;
18602 SvREFCNT_dec_NN(multi_char_matches);
18606 /* If folding, we calculate all characters that could fold to or from the
18607 * ones already on the list */
18608 if (cp_foldable_list) {
18610 UV start, end; /* End points of code point ranges */
18612 SV* fold_intersection = NULL;
18615 /* Our calculated list will be for Unicode rules. For locale
18616 * matching, we have to keep a separate list that is consulted at
18617 * runtime only when the locale indicates Unicode rules (and we
18618 * don't include potential matches in the ASCII/Latin1 range, as
18619 * any code point could fold to any other, based on the run-time
18620 * locale). For non-locale, we just use the general list */
18622 use_list = &only_utf8_locale_list;
18625 use_list = &cp_list;
18628 /* Only the characters in this class that participate in folds need
18629 * be checked. Get the intersection of this class and all the
18630 * possible characters that are foldable. This can quickly narrow
18631 * down a large class */
18632 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18633 &fold_intersection);
18635 /* Now look at the foldable characters in this class individually */
18636 invlist_iterinit(fold_intersection);
18637 while (invlist_iternext(fold_intersection, &start, &end)) {
18641 /* Look at every character in the range */
18642 for (j = start; j <= end; j++) {
18643 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18646 Size_t folds_count;
18648 const U32 * remaining_folds;
18652 /* Under /l, we don't know what code points below 256
18653 * fold to, except we do know the MICRO SIGN folds to
18654 * an above-255 character if the locale is UTF-8, so we
18655 * add it to the special list (in *use_list) Otherwise
18656 * we know now what things can match, though some folds
18657 * are valid under /d only if the target is UTF-8.
18658 * Those go in a separate list */
18659 if ( IS_IN_SOME_FOLD_L1(j)
18660 && ! (LOC && j != MICRO_SIGN))
18663 /* ASCII is always matched; non-ASCII is matched
18664 * only under Unicode rules (which could happen
18665 * under /l if the locale is a UTF-8 one */
18666 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18667 *use_list = add_cp_to_invlist(*use_list,
18668 PL_fold_latin1[j]);
18670 else if (j != PL_fold_latin1[j]) {
18671 upper_latin1_only_utf8_matches
18672 = add_cp_to_invlist(
18673 upper_latin1_only_utf8_matches,
18674 PL_fold_latin1[j]);
18678 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18679 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18681 add_above_Latin1_folds(pRExC_state,
18688 /* Here is an above Latin1 character. We don't have the
18689 * rules hard-coded for it. First, get its fold. This is
18690 * the simple fold, as the multi-character folds have been
18691 * handled earlier and separated out */
18692 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18693 (ASCII_FOLD_RESTRICTED)
18694 ? FOLD_FLAGS_NOMIX_ASCII
18697 /* Single character fold of above Latin1. Add everything
18698 * in its fold closure to the list that this node should
18700 folds_count = _inverse_folds(folded, &first_fold,
18702 for (k = 0; k <= folds_count; k++) {
18703 UV c = (k == 0) /* First time through use itself */
18705 : (k == 1) /* 2nd time use, the first fold */
18708 /* Then the remaining ones */
18709 : remaining_folds[k-2];
18711 /* /aa doesn't allow folds between ASCII and non- */
18712 if (( ASCII_FOLD_RESTRICTED
18713 && (isASCII(c) != isASCII(j))))
18718 /* Folds under /l which cross the 255/256 boundary are
18719 * added to a separate list. (These are valid only
18720 * when the locale is UTF-8.) */
18721 if (c < 256 && LOC) {
18722 *use_list = add_cp_to_invlist(*use_list, c);
18726 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18728 cp_list = add_cp_to_invlist(cp_list, c);
18731 /* Similarly folds involving non-ascii Latin1
18732 * characters under /d are added to their list */
18733 upper_latin1_only_utf8_matches
18734 = add_cp_to_invlist(
18735 upper_latin1_only_utf8_matches,
18741 SvREFCNT_dec_NN(fold_intersection);
18744 /* Now that we have finished adding all the folds, there is no reason
18745 * to keep the foldable list separate */
18746 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18747 SvREFCNT_dec_NN(cp_foldable_list);
18750 /* And combine the result (if any) with any inversion lists from posix
18751 * classes. The lists are kept separate up to now because we don't want to
18752 * fold the classes */
18753 if (simple_posixes) { /* These are the classes known to be unaffected by
18756 _invlist_union(cp_list, simple_posixes, &cp_list);
18757 SvREFCNT_dec_NN(simple_posixes);
18760 cp_list = simple_posixes;
18763 if (posixes || nposixes) {
18764 if (! DEPENDS_SEMANTICS) {
18766 /* For everything but /d, we can just add the current 'posixes' and
18767 * 'nposixes' to the main list */
18770 _invlist_union(cp_list, posixes, &cp_list);
18771 SvREFCNT_dec_NN(posixes);
18779 _invlist_union(cp_list, nposixes, &cp_list);
18780 SvREFCNT_dec_NN(nposixes);
18783 cp_list = nposixes;
18788 /* Under /d, things like \w match upper Latin1 characters only if
18789 * the target string is in UTF-8. But things like \W match all the
18790 * upper Latin1 characters if the target string is not in UTF-8.
18792 * Handle the case with something like \W separately */
18794 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18796 /* A complemented posix class matches all upper Latin1
18797 * characters if not in UTF-8. And it matches just certain
18798 * ones when in UTF-8. That means those certain ones are
18799 * matched regardless, so can just be added to the
18800 * unconditional list */
18802 _invlist_union(cp_list, nposixes, &cp_list);
18803 SvREFCNT_dec_NN(nposixes);
18807 cp_list = nposixes;
18810 /* Likewise for 'posixes' */
18811 _invlist_union(posixes, cp_list, &cp_list);
18812 SvREFCNT_dec(posixes);
18814 /* Likewise for anything else in the range that matched only
18816 if (upper_latin1_only_utf8_matches) {
18817 _invlist_union(cp_list,
18818 upper_latin1_only_utf8_matches,
18820 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18821 upper_latin1_only_utf8_matches = NULL;
18824 /* If we don't match all the upper Latin1 characters regardless
18825 * of UTF-8ness, we have to set a flag to match the rest when
18827 _invlist_subtract(only_non_utf8_list, cp_list,
18828 &only_non_utf8_list);
18829 if (_invlist_len(only_non_utf8_list) != 0) {
18830 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18832 SvREFCNT_dec_NN(only_non_utf8_list);
18835 /* Here there were no complemented posix classes. That means
18836 * the upper Latin1 characters in 'posixes' match only when the
18837 * target string is in UTF-8. So we have to add them to the
18838 * list of those types of code points, while adding the
18839 * remainder to the unconditional list.
18841 * First calculate what they are */
18842 SV* nonascii_but_latin1_properties = NULL;
18843 _invlist_intersection(posixes, PL_UpperLatin1,
18844 &nonascii_but_latin1_properties);
18846 /* And add them to the final list of such characters. */
18847 _invlist_union(upper_latin1_only_utf8_matches,
18848 nonascii_but_latin1_properties,
18849 &upper_latin1_only_utf8_matches);
18851 /* Remove them from what now becomes the unconditional list */
18852 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18855 /* And add those unconditional ones to the final list */
18857 _invlist_union(cp_list, posixes, &cp_list);
18858 SvREFCNT_dec_NN(posixes);
18865 SvREFCNT_dec(nonascii_but_latin1_properties);
18867 /* Get rid of any characters from the conditional list that we
18868 * now know are matched unconditionally, which may make that
18870 _invlist_subtract(upper_latin1_only_utf8_matches,
18872 &upper_latin1_only_utf8_matches);
18873 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18874 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18875 upper_latin1_only_utf8_matches = NULL;
18881 /* And combine the result (if any) with any inversion list from properties.
18882 * The lists are kept separate up to now so that we can distinguish the two
18883 * in regards to matching above-Unicode. A run-time warning is generated
18884 * if a Unicode property is matched against a non-Unicode code point. But,
18885 * we allow user-defined properties to match anything, without any warning,
18886 * and we also suppress the warning if there is a portion of the character
18887 * class that isn't a Unicode property, and which matches above Unicode, \W
18888 * or [\x{110000}] for example.
18889 * (Note that in this case, unlike the Posix one above, there is no
18890 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18891 * forces Unicode semantics */
18895 /* If it matters to the final outcome, see if a non-property
18896 * component of the class matches above Unicode. If so, the
18897 * warning gets suppressed. This is true even if just a single
18898 * such code point is specified, as, though not strictly correct if
18899 * another such code point is matched against, the fact that they
18900 * are using above-Unicode code points indicates they should know
18901 * the issues involved */
18903 warn_super = ! (invert
18904 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18907 _invlist_union(properties, cp_list, &cp_list);
18908 SvREFCNT_dec_NN(properties);
18911 cp_list = properties;
18916 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18918 /* Because an ANYOF node is the only one that warns, this node
18919 * can't be optimized into something else */
18920 optimizable = FALSE;
18924 /* Here, we have calculated what code points should be in the character
18927 * Now we can see about various optimizations. Fold calculation (which we
18928 * did above) needs to take place before inversion. Otherwise /[^k]/i
18929 * would invert to include K, which under /i would match k, which it
18930 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18931 * folded until runtime */
18933 /* If we didn't do folding, it's because some information isn't available
18934 * until runtime; set the run-time fold flag for these We know to set the
18935 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18936 * at least one 0-255 range code point */
18939 /* Some things on the list might be unconditionally included because of
18940 * other components. Remove them, and clean up the list if it goes to
18942 if (only_utf8_locale_list && cp_list) {
18943 _invlist_subtract(only_utf8_locale_list, cp_list,
18944 &only_utf8_locale_list);
18946 if (_invlist_len(only_utf8_locale_list) == 0) {
18947 SvREFCNT_dec_NN(only_utf8_locale_list);
18948 only_utf8_locale_list = NULL;
18951 if ( only_utf8_locale_list
18952 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18953 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18955 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18958 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18960 else if (cp_list && invlist_lowest(cp_list) < 256) {
18961 /* If nothing is below 256, has no locale dependency; otherwise it
18963 anyof_flags |= ANYOFL_FOLD;
18964 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18967 else if ( DEPENDS_SEMANTICS
18968 && ( upper_latin1_only_utf8_matches
18969 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18971 RExC_seen_d_op = TRUE;
18972 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18975 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18979 && ! has_runtime_dependency)
18981 _invlist_invert(cp_list);
18983 /* Clear the invert flag since have just done it here */
18987 /* All possible optimizations below still have these characteristics.
18988 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18990 *flagp |= HASWIDTH|SIMPLE;
18993 *ret_invlist = cp_list;
18995 return (cp_list) ? RExC_emit : 0;
18998 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18999 RExC_contains_locale = 1;
19002 /* Some character classes are equivalent to other nodes. Such nodes take
19003 * up less room, and some nodes require fewer operations to execute, than
19004 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
19005 * improve efficiency. */
19008 PERL_UINT_FAST8_T i;
19009 UV partial_cp_count = 0;
19010 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19011 UV end[MAX_FOLD_FROMS+1] = { 0 };
19012 bool single_range = FALSE;
19014 if (cp_list) { /* Count the code points in enough ranges that we would
19015 see all the ones possible in any fold in this version
19018 invlist_iterinit(cp_list);
19019 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19020 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19023 partial_cp_count += end[i] - start[i] + 1;
19027 single_range = TRUE;
19029 invlist_iterfinish(cp_list);
19032 /* If we know at compile time that this matches every possible code
19033 * point, any run-time dependencies don't matter */
19034 if (start[0] == 0 && end[0] == UV_MAX) {
19036 ret = reganode(pRExC_state, OPFAIL, 0);
19039 ret = reg_node(pRExC_state, SANY);
19045 /* Similarly, for /l posix classes, if both a class and its
19046 * complement match, any run-time dependencies don't matter */
19048 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19051 if ( POSIXL_TEST(posixl, namedclass) /* class */
19052 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19055 ret = reganode(pRExC_state, OPFAIL, 0);
19058 ret = reg_node(pRExC_state, SANY);
19065 /* For well-behaved locales, some classes are subsets of others,
19066 * so complementing the subset and including the non-complemented
19067 * superset should match everything, like [\D[:alnum:]], and
19068 * [[:^alpha:][:alnum:]], but some implementations of locales are
19069 * buggy, and khw thinks its a bad idea to have optimization change
19070 * behavior, even if it avoids an OS bug in a given case */
19072 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19074 /* If is a single posix /l class, can optimize to just that op.
19075 * Such a node will not match anything in the Latin1 range, as that
19076 * is not determinable until runtime, but will match whatever the
19077 * class does outside that range. (Note that some classes won't
19078 * match anything outside the range, like [:ascii:]) */
19079 if ( isSINGLE_BIT_SET(posixl)
19080 && (partial_cp_count == 0 || start[0] > 255))
19083 SV * class_above_latin1 = NULL;
19084 bool already_inverted;
19085 bool are_equivalent;
19087 /* Compute which bit is set, which is the same thing as, e.g.,
19088 * ANYOF_CNTRL. From
19089 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19091 static const int MultiplyDeBruijnBitPosition2[32] =
19093 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19094 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19097 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19098 * 0x077CB531U) >> 27];
19099 classnum = namedclass_to_classnum(namedclass);
19101 /* The named classes are such that the inverted number is one
19102 * larger than the non-inverted one */
19103 already_inverted = namedclass
19104 - classnum_to_namedclass(classnum);
19106 /* Create an inversion list of the official property, inverted
19107 * if the constructed node list is inverted, and restricted to
19108 * only the above latin1 code points, which are the only ones
19109 * known at compile time */
19110 _invlist_intersection_maybe_complement_2nd(
19112 PL_XPosix_ptrs[classnum],
19114 &class_above_latin1);
19115 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19117 SvREFCNT_dec_NN(class_above_latin1);
19119 if (are_equivalent) {
19121 /* Resolve the run-time inversion flag with this possibly
19122 * inverted class */
19123 invert = invert ^ already_inverted;
19125 ret = reg_node(pRExC_state,
19126 POSIXL + invert * (NPOSIXL - POSIXL));
19127 FLAGS(REGNODE_p(ret)) = classnum;
19133 /* khw can't think of any other possible transformation involving
19135 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19139 if (! has_runtime_dependency) {
19141 /* If the list is empty, nothing matches. This happens, for
19142 * example, when a Unicode property that doesn't match anything is
19143 * the only element in the character class (perluniprops.pod notes
19144 * such properties). */
19145 if (partial_cp_count == 0) {
19147 ret = reg_node(pRExC_state, SANY);
19150 ret = reganode(pRExC_state, OPFAIL, 0);
19156 /* If matches everything but \n */
19157 if ( start[0] == 0 && end[0] == '\n' - 1
19158 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19161 ret = reg_node(pRExC_state, REG_ANY);
19167 /* Next see if can optimize classes that contain just a few code points
19168 * into an EXACTish node. The reason to do this is to let the
19169 * optimizer join this node with adjacent EXACTish ones, and ANYOF
19170 * nodes require conversion to code point from UTF-8.
19172 * An EXACTFish node can be generated even if not under /i, and vice
19173 * versa. But care must be taken. An EXACTFish node has to be such
19174 * that it only matches precisely the code points in the class, but we
19175 * want to generate the least restrictive one that does that, to
19176 * increase the odds of being able to join with an adjacent node. For
19177 * example, if the class contains [kK], we have to make it an EXACTFAA
19178 * node to prevent the KELVIN SIGN from matching. Whether we are under
19179 * /i or not is irrelevant in this case. Less obvious is the pattern
19180 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
19181 * supposed to match the single character U+0149 LATIN SMALL LETTER N
19182 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
19183 * that includes \X{02BC}, there is a multi-char fold that does, and so
19184 * the node generated for it must be an EXACTFish one. On the other
19185 * hand qr/:/i should generate a plain EXACT node since the colon
19186 * participates in no fold whatsoever, and having it EXACT tells the
19187 * optimizer the target string cannot match unless it has a colon in
19193 /* Only try if there are no more code points in the class than
19194 * in the max possible fold */
19195 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19197 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19199 /* We can always make a single code point class into an
19200 * EXACTish node. */
19204 /* Here is /l: Use EXACTL, except if there is a fold not
19205 * known until runtime so shows as only a single code point
19206 * here. For code points above 255, we know which can
19207 * cause problems by having a potential fold to the Latin1
19210 || ( start[0] > 255
19211 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19219 else if (! FOLD) { /* Not /l and not /i */
19220 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19222 else if (start[0] < 256) { /* /i, not /l, and the code point is
19225 /* Under /i, it gets a little tricky. A code point that
19226 * doesn't participate in a fold should be an EXACT node.
19227 * We know this one isn't the result of a simple fold, or
19228 * there'd be more than one code point in the list, but it
19229 * could be part of a multi- character fold. In that case
19230 * we better not create an EXACT node, as we would wrongly
19231 * be telling the optimizer that this code point must be in
19232 * the target string, and that is wrong. This is because
19233 * if the sequence around this code point forms a
19234 * multi-char fold, what needs to be in the string could be
19235 * the code point that folds to the sequence.
19237 * This handles the case of below-255 code points, as we
19238 * have an easy look up for those. The next clause handles
19239 * the above-256 one */
19240 op = IS_IN_SOME_FOLD_L1(start[0])
19244 else { /* /i, larger code point. Since we are under /i, and
19245 have just this code point, we know that it can't
19246 fold to something else, so PL_InMultiCharFold
19248 op = _invlist_contains_cp(PL_InMultiCharFold,
19256 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19257 && _invlist_contains_cp(PL_in_some_fold, start[0]))
19259 /* Here, the only runtime dependency, if any, is from /d, and
19260 * the class matches more than one code point, and the lowest
19261 * code point participates in some fold. It might be that the
19262 * other code points are /i equivalent to this one, and hence
19263 * they would representable by an EXACTFish node. Above, we
19264 * eliminated classes that contain too many code points to be
19265 * EXACTFish, with the test for MAX_FOLD_FROMS
19267 * First, special case the ASCII fold pairs, like 'B' and 'b'.
19268 * We do this because we have EXACTFAA at our disposal for the
19270 if (partial_cp_count == 2 && isASCII(start[0])) {
19272 /* The only ASCII characters that participate in folds are
19274 assert(isALPHA(start[0]));
19275 if ( end[0] == start[0] /* First range is a single
19276 character, so 2nd exists */
19277 && isALPHA_FOLD_EQ(start[0], start[1]))
19280 /* Here, is part of an ASCII fold pair */
19282 if ( ASCII_FOLD_RESTRICTED
19283 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19285 /* If the second clause just above was true, it
19286 * means we can't be under /i, or else the list
19287 * would have included more than this fold pair.
19288 * Therefore we have to exclude the possibility of
19289 * whatever else it is that folds to these, by
19290 * using EXACTFAA */
19293 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19295 /* Here, there's no simple fold that start[0] is part
19296 * of, but there is a multi-character one. If we
19297 * are not under /i, we want to exclude that
19298 * possibility; if under /i, we want to include it
19300 op = (FOLD) ? EXACTFU : EXACTFAA;
19304 /* Here, the only possible fold start[0] particpates in
19305 * is with start[1]. /i or not isn't relevant */
19309 value = toFOLD(start[0]);
19312 else if ( ! upper_latin1_only_utf8_matches
19313 || ( _invlist_len(upper_latin1_only_utf8_matches)
19316 invlist_highest(upper_latin1_only_utf8_matches)]
19319 /* Here, the smallest character is non-ascii or there are
19320 * more than 2 code points matched by this node. Also, we
19321 * either don't have /d UTF-8 dependent matches, or if we
19322 * do, they look like they could be a single character that
19323 * is the fold of the lowest one in the always-match list.
19324 * This test quickly excludes most of the false positives
19325 * when there are /d UTF-8 depdendent matches. These are
19326 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19327 * SMALL LETTER A WITH GRAVE iff the target string is
19328 * UTF-8. (We don't have to worry above about exceeding
19329 * the array bounds of PL_fold_latin1[] because any code
19330 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19332 * EXACTFAA would apply only to pairs (hence exactly 2 code
19333 * points) in the ASCII range, so we can't use it here to
19334 * artificially restrict the fold domain, so we check if
19335 * the class does or does not match some EXACTFish node.
19336 * Further, if we aren't under /i, and the folded-to
19337 * character is part of a multi-character fold, we can't do
19338 * this optimization, as the sequence around it could be
19339 * that multi-character fold, and we don't here know the
19340 * context, so we have to assume it is that multi-char
19341 * fold, to prevent potential bugs.
19343 * To do the general case, we first find the fold of the
19344 * lowest code point (which may be higher than the lowest
19345 * one), then find everything that folds to it. (The data
19346 * structure we have only maps from the folded code points,
19347 * so we have to do the earlier step.) */
19350 U8 foldbuf[UTF8_MAXBYTES_CASE];
19351 UV folded = _to_uni_fold_flags(start[0],
19352 foldbuf, &foldlen, 0);
19354 const U32 * remaining_folds;
19355 Size_t folds_to_this_cp_count = _inverse_folds(
19359 Size_t folds_count = folds_to_this_cp_count + 1;
19360 SV * fold_list = _new_invlist(folds_count);
19363 /* If there are UTF-8 dependent matches, create a temporary
19364 * list of what this node matches, including them. */
19365 SV * all_cp_list = NULL;
19366 SV ** use_this_list = &cp_list;
19368 if (upper_latin1_only_utf8_matches) {
19369 all_cp_list = _new_invlist(0);
19370 use_this_list = &all_cp_list;
19371 _invlist_union(cp_list,
19372 upper_latin1_only_utf8_matches,
19376 /* Having gotten everything that participates in the fold
19377 * containing the lowest code point, we turn that into an
19378 * inversion list, making sure everything is included. */
19379 fold_list = add_cp_to_invlist(fold_list, start[0]);
19380 fold_list = add_cp_to_invlist(fold_list, folded);
19381 if (folds_to_this_cp_count > 0) {
19382 fold_list = add_cp_to_invlist(fold_list, first_fold);
19383 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19384 fold_list = add_cp_to_invlist(fold_list,
19385 remaining_folds[i]);
19389 /* If the fold list is identical to what's in this ANYOF
19390 * node, the node can be represented by an EXACTFish one
19392 if (_invlistEQ(*use_this_list, fold_list,
19393 0 /* Don't complement */ )
19396 /* But, we have to be careful, as mentioned above.
19397 * Just the right sequence of characters could match
19398 * this if it is part of a multi-character fold. That
19399 * IS what we want if we are under /i. But it ISN'T
19400 * what we want if not under /i, as it could match when
19401 * it shouldn't. So, when we aren't under /i and this
19402 * character participates in a multi-char fold, we
19403 * don't optimize into an EXACTFish node. So, for each
19404 * case below we have to check if we are folding
19405 * and if not, if it is not part of a multi-char fold.
19407 if (start[0] > 255) { /* Highish code point */
19408 if (FOLD || ! _invlist_contains_cp(
19409 PL_InMultiCharFold, folded))
19413 : (ASCII_FOLD_RESTRICTED)
19418 } /* Below, the lowest code point < 256 */
19421 && DEPENDS_SEMANTICS)
19422 { /* An EXACTF node containing a single character
19423 's', can be an EXACTFU if it doesn't get
19424 joined with an adjacent 's' */
19425 op = EXACTFU_S_EDGE;
19429 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19431 if (upper_latin1_only_utf8_matches) {
19434 /* We can't use the fold, as that only matches
19438 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19440 { /* EXACTFUP is a special node for this
19442 op = (ASCII_FOLD_RESTRICTED)
19445 value = MICRO_SIGN;
19447 else if ( ASCII_FOLD_RESTRICTED
19448 && ! isASCII(start[0]))
19449 { /* For ASCII under /iaa, we can use EXACTFU
19461 SvREFCNT_dec_NN(fold_list);
19462 SvREFCNT_dec(all_cp_list);
19469 /* Here, we have calculated what EXACTish node to use. Have to
19470 * convert to UTF-8 if not already there */
19473 SvREFCNT_dec(cp_list);;
19474 REQUIRE_UTF8(flagp);
19477 /* This is a kludge to the special casing issues with this
19478 * ligature under /aa. FB05 should fold to FB06, but the
19479 * call above to _to_uni_fold_flags() didn't find this, as
19480 * it didn't use the /aa restriction in order to not miss
19481 * other folds that would be affected. This is the only
19482 * instance likely to ever be a problem in all of Unicode.
19483 * So special case it. */
19484 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19485 && ASCII_FOLD_RESTRICTED)
19487 value = LATIN_SMALL_LIGATURE_ST;
19491 len = (UTF) ? UVCHR_SKIP(value) : 1;
19493 ret = regnode_guts(pRExC_state, op, len, "exact");
19494 FILL_NODE(ret, op);
19495 RExC_emit += 1 + STR_SZ(len);
19496 setSTR_LEN(REGNODE_p(ret), len);
19498 *STRINGs(REGNODE_p(ret)) = (U8) value;
19501 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19507 if (! has_runtime_dependency) {
19509 /* See if this can be turned into an ANYOFM node. Think about the
19510 * bit patterns in two different bytes. In some positions, the
19511 * bits in each will be 1; and in other positions both will be 0;
19512 * and in some positions the bit will be 1 in one byte, and 0 in
19513 * the other. Let 'n' be the number of positions where the bits
19514 * differ. We create a mask which has exactly 'n' 0 bits, each in
19515 * a position where the two bytes differ. Now take the set of all
19516 * bytes that when ANDed with the mask yield the same result. That
19517 * set has 2**n elements, and is representable by just two 8 bit
19518 * numbers: the result and the mask. Importantly, matching the set
19519 * can be vectorized by creating a word full of the result bytes,
19520 * and a word full of the mask bytes, yielding a significant speed
19521 * up. Here, see if this node matches such a set. As a concrete
19522 * example consider [01], and the byte representing '0' which is
19523 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19524 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19525 * 0x30. Any other bytes ANDed yield something else. So [01],
19526 * which is a common usage, is optimizable into ANYOFM, and can
19527 * benefit from the speed up. We can only do this on UTF-8
19528 * invariant bytes, because they have the same bit patterns under
19530 PERL_UINT_FAST8_T inverted = 0;
19532 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19534 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19536 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19537 * If that works we will instead later generate an NANYOFM, and
19538 * invert back when through */
19539 if (invlist_highest(cp_list) > max_permissible) {
19540 _invlist_invert(cp_list);
19544 if (invlist_highest(cp_list) <= max_permissible) {
19545 UV this_start, this_end;
19546 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19547 U8 bits_differing = 0;
19548 Size_t full_cp_count = 0;
19549 bool first_time = TRUE;
19551 /* Go through the bytes and find the bit positions that differ
19553 invlist_iterinit(cp_list);
19554 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19555 unsigned int i = this_start;
19558 if (! UVCHR_IS_INVARIANT(i)) {
19562 first_time = FALSE;
19563 lowest_cp = this_start;
19565 /* We have set up the code point to compare with.
19566 * Don't compare it with itself */
19570 /* Find the bit positions that differ from the lowest code
19571 * point in the node. Keep track of all such positions by
19573 for (; i <= this_end; i++) {
19574 if (! UVCHR_IS_INVARIANT(i)) {
19578 bits_differing |= i ^ lowest_cp;
19581 full_cp_count += this_end - this_start + 1;
19584 /* At the end of the loop, we count how many bits differ from
19585 * the bits in lowest code point, call the count 'd'. If the
19586 * set we found contains 2**d elements, it is the closure of
19587 * all code points that differ only in those bit positions. To
19588 * convince yourself of that, first note that the number in the
19589 * closure must be a power of 2, which we test for. The only
19590 * way we could have that count and it be some differing set,
19591 * is if we got some code points that don't differ from the
19592 * lowest code point in any position, but do differ from each
19593 * other in some other position. That means one code point has
19594 * a 1 in that position, and another has a 0. But that would
19595 * mean that one of them differs from the lowest code point in
19596 * that position, which possibility we've already excluded. */
19597 if ( (inverted || full_cp_count > 1)
19598 && full_cp_count == 1U << PL_bitcount[bits_differing])
19602 op = ANYOFM + inverted;;
19604 /* We need to make the bits that differ be 0's */
19605 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19607 /* The argument is the lowest code point */
19608 ret = reganode(pRExC_state, op, lowest_cp);
19609 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19613 invlist_iterfinish(cp_list);
19617 _invlist_invert(cp_list);
19624 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19625 * all were invariants, it wasn't inverted, and there is a single
19626 * range. This would be faster than some of the posix nodes we
19627 * create below like /\d/a, but would be twice the size. Without
19628 * having actually measured the gain, khw doesn't think the
19629 * tradeoff is really worth it */
19632 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19633 PERL_UINT_FAST8_T type;
19634 SV * intersection = NULL;
19635 SV* d_invlist = NULL;
19637 /* See if this matches any of the POSIX classes. The POSIXA and
19638 * POSIXD ones are about the same speed as ANYOF ops, but take less
19639 * room; the ones that have above-Latin1 code point matches are
19640 * somewhat faster than ANYOF. */
19642 for (type = POSIXA; type >= POSIXD; type--) {
19645 if (type == POSIXL) { /* But not /l posix classes */
19649 for (posix_class = 0;
19650 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19653 SV** our_code_points = &cp_list;
19654 SV** official_code_points;
19657 if (type == POSIXA) {
19658 official_code_points = &PL_Posix_ptrs[posix_class];
19661 official_code_points = &PL_XPosix_ptrs[posix_class];
19664 /* Skip non-existent classes of this type. e.g. \v only
19665 * has an entry in PL_XPosix_ptrs */
19666 if (! *official_code_points) {
19670 /* Try both the regular class, and its inversion */
19671 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19672 bool this_inverted = invert ^ try_inverted;
19674 if (type != POSIXD) {
19676 /* This class that isn't /d can't match if we have
19677 * /d dependencies */
19678 if (has_runtime_dependency
19679 & HAS_D_RUNTIME_DEPENDENCY)
19684 else /* is /d */ if (! this_inverted) {
19686 /* /d classes don't match anything non-ASCII below
19687 * 256 unconditionally (which cp_list contains) */
19688 _invlist_intersection(cp_list, PL_UpperLatin1,
19690 if (_invlist_len(intersection) != 0) {
19694 SvREFCNT_dec(d_invlist);
19695 d_invlist = invlist_clone(cp_list, NULL);
19697 /* But under UTF-8 it turns into using /u rules.
19698 * Add the things it matches under these conditions
19699 * so that we check below that these are identical
19700 * to what the tested class should match */
19701 if (upper_latin1_only_utf8_matches) {
19704 upper_latin1_only_utf8_matches,
19707 our_code_points = &d_invlist;
19709 else { /* POSIXD, inverted. If this doesn't have this
19710 flag set, it isn't /d. */
19711 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19715 our_code_points = &cp_list;
19718 /* Here, have weeded out some things. We want to see
19719 * if the list of characters this node contains
19720 * ('*our_code_points') precisely matches those of the
19721 * class we are currently checking against
19722 * ('*official_code_points'). */
19723 if (_invlistEQ(*our_code_points,
19724 *official_code_points,
19727 /* Here, they precisely match. Optimize this ANYOF
19728 * node into its equivalent POSIX one of the
19729 * correct type, possibly inverted */
19730 ret = reg_node(pRExC_state, (try_inverted)
19734 FLAGS(REGNODE_p(ret)) = posix_class;
19735 SvREFCNT_dec(d_invlist);
19736 SvREFCNT_dec(intersection);
19742 SvREFCNT_dec(d_invlist);
19743 SvREFCNT_dec(intersection);
19746 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19747 * both in size and speed. Currently, a 20 bit range base (smallest
19748 * code point in the range), and a 12 bit maximum delta are packed into
19749 * a 32 bit word. This allows for using it on all of the Unicode code
19750 * points except for the highest plane, which is only for private use
19751 * code points. khw doubts that a bigger delta is likely in real world
19754 && ! has_runtime_dependency
19755 && anyof_flags == 0
19756 && start[0] < (1 << ANYOFR_BASE_BITS)
19757 && end[0] - start[0]
19758 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19759 * CHARBITS - ANYOFR_BASE_BITS))))
19762 U8 low_utf8[UTF8_MAXBYTES+1];
19763 U8 high_utf8[UTF8_MAXBYTES+1];
19765 ret = reganode(pRExC_state, ANYOFR,
19766 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19768 /* Place the lowest UTF-8 start byte in the flags field, so as to
19769 * allow efficient ruling out at run time of many possible inputs.
19771 (void) uvchr_to_utf8(low_utf8, start[0]);
19772 (void) uvchr_to_utf8(high_utf8, end[0]);
19774 /* If all code points share the same first byte, this can be an
19775 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19776 * quickly rule out many inputs at run-time without having to
19777 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19778 * not doing that transformation would not rule out nearly so many
19780 if (low_utf8[0] == high_utf8[0]) {
19781 OP(REGNODE_p(ret)) = ANYOFRb;
19782 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19785 ANYOF_FLAGS(REGNODE_p(ret))
19786 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19792 /* If didn't find an optimization and there is no need for a bitmap,
19793 * optimize to indicate that */
19794 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19796 && ! upper_latin1_only_utf8_matches
19797 && anyof_flags == 0)
19799 U8 low_utf8[UTF8_MAXBYTES+1];
19800 UV highest_cp = invlist_highest(cp_list);
19802 /* Currently the maximum allowed code point by the system is
19803 * IV_MAX. Higher ones are reserved for future internal use. This
19804 * particular regnode can be used for higher ones, but we can't
19805 * calculate the code point of those. IV_MAX suffices though, as
19806 * it will be a large first byte */
19807 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19810 /* We store the lowest possible first byte of the UTF-8
19811 * representation, using the flags field. This allows for quick
19812 * ruling out of some inputs without having to convert from UTF-8
19813 * to code point. For EBCDIC, we use I8, as not doing that
19814 * transformation would not rule out nearly so many things */
19815 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19819 /* If the first UTF-8 start byte for the highest code point in the
19820 * range is suitably small, we may be able to get an upper bound as
19822 if (highest_cp <= IV_MAX) {
19823 U8 high_utf8[UTF8_MAXBYTES+1];
19824 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19827 /* If the lowest and highest are the same, we can get an exact
19828 * first byte instead of a just minimum or even a sequence of
19829 * exact leading bytes. We signal these with different
19831 if (low_utf8[0] == high_utf8[0]) {
19832 Size_t len = find_first_differing_byte_pos(low_utf8,
19834 MIN(low_len, high_len));
19838 /* No need to convert to I8 for EBCDIC as this is an
19840 anyof_flags = low_utf8[0];
19845 ret = regnode_guts(pRExC_state, op,
19846 regarglen[op] + STR_SZ(len),
19848 FILL_NODE(ret, op);
19849 ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19851 Copy(low_utf8, /* Add the common bytes */
19852 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19854 RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19855 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19856 NULL, only_utf8_locale_list);
19860 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19863 /* Here, the high byte is not the same as the low, but is
19864 * small enough that its reasonable to have a loose upper
19865 * bound, which is packed in with the strict lower bound.
19866 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19867 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19868 * is the same thing as UTF-8 */
19871 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19872 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19875 if (range_diff <= max_range_diff / 8) {
19878 else if (range_diff <= max_range_diff / 4) {
19881 else if (range_diff <= max_range_diff / 2) {
19884 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19889 goto done_finding_op;
19891 } /* End of seeing if can optimize it into a different node */
19893 is_anyof: /* It's going to be an ANYOF node. */
19894 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19904 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19905 FILL_NODE(ret, op); /* We set the argument later */
19906 RExC_emit += 1 + regarglen[op];
19907 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19909 /* Here, <cp_list> contains all the code points we can determine at
19910 * compile time that match under all conditions. Go through it, and
19911 * for things that belong in the bitmap, put them there, and delete from
19912 * <cp_list>. While we are at it, see if everything above 255 is in the
19913 * list, and if so, set a flag to speed up execution */
19915 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19918 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19922 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19925 /* Here, the bitmap has been populated with all the Latin1 code points that
19926 * always match. Can now add to the overall list those that match only
19927 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19929 if (upper_latin1_only_utf8_matches) {
19931 _invlist_union(cp_list,
19932 upper_latin1_only_utf8_matches,
19934 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19937 cp_list = upper_latin1_only_utf8_matches;
19939 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19942 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19943 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19946 only_utf8_locale_list);
19947 SvREFCNT_dec(cp_list);;
19948 SvREFCNT_dec(only_utf8_locale_list);
19953 /* Here, the node is getting optimized into something that's not an ANYOF
19954 * one. Finish up. */
19956 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19957 RExC_parse - orig_parse);;
19958 SvREFCNT_dec(cp_list);;
19959 SvREFCNT_dec(only_utf8_locale_list);
19963 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19966 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19967 regnode* const node,
19969 SV* const runtime_defns,
19970 SV* const only_utf8_locale_list)
19972 /* Sets the arg field of an ANYOF-type node 'node', using information about
19973 * the node passed-in. If there is nothing outside the node's bitmap, the
19974 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19975 * the count returned by add_data(), having allocated and stored an array,
19978 * av[0] stores the inversion list defining this class as far as known at
19979 * this time, or PL_sv_undef if nothing definite is now known.
19980 * av[1] stores the inversion list of code points that match only if the
19981 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19982 * av[2], or no entry otherwise.
19983 * av[2] stores the list of user-defined properties whose subroutine
19984 * definitions aren't known at this time, or no entry if none. */
19988 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19990 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19991 assert(! (ANYOF_FLAGS(node)
19992 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19993 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19996 AV * const av = newAV();
20000 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20003 /* (Note that if any of this changes, the size calculations in
20004 * S_optimize_regclass() might need to be updated.) */
20006 if (only_utf8_locale_list) {
20007 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20008 SvREFCNT_inc_NN(only_utf8_locale_list));
20011 if (runtime_defns) {
20012 av_store(av, DEFERRED_USER_DEFINED_INDEX,
20013 SvREFCNT_inc_NN(runtime_defns));
20016 rv = newRV_noinc(MUTABLE_SV(av));
20017 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20018 RExC_rxi->data->data[n] = (void*)rv;
20025 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20026 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20028 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20032 /* For internal core use only.
20033 * Returns the inversion list for the input 'node' in the regex 'prog'.
20034 * If <doinit> is 'true', will attempt to create the inversion list if not
20036 * If <listsvp> is non-null, will return the printable contents of the
20037 * property definition. This can be used to get debugging information
20038 * even before the inversion list exists, by calling this function with
20039 * 'doinit' set to false, in which case the components that will be used
20040 * to eventually create the inversion list are returned (in a printable
20042 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20043 * store an inversion list of code points that should match only if the
20044 * execution-time locale is a UTF-8 one.
20045 * If <output_invlist> is not NULL, it is where this routine is to store an
20046 * inversion list of the code points that would be instead returned in
20047 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20048 * when this parameter is used, is just the non-code point data that
20049 * will go into creating the inversion list. This currently should be just
20050 * user-defined properties whose definitions were not known at compile
20051 * time. Using this parameter allows for easier manipulation of the
20052 * inversion list's data by the caller. It is illegal to call this
20053 * function with this parameter set, but not <listsvp>
20055 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20056 * that, in spite of this function's name, the inversion list it returns
20057 * may include the bitmap data as well */
20059 SV *si = NULL; /* Input initialization string */
20060 SV* invlist = NULL;
20062 RXi_GET_DECL(prog, progi);
20063 const struct reg_data * const data = prog ? progi->data : NULL;
20065 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20066 PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20068 PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20070 assert(! output_invlist || listsvp);
20072 if (data && data->count) {
20073 const U32 n = ARG(node);
20075 if (data->what[n] == 's') {
20076 SV * const rv = MUTABLE_SV(data->data[n]);
20077 AV * const av = MUTABLE_AV(SvRV(rv));
20078 SV **const ary = AvARRAY(av);
20080 invlist = ary[INVLIST_INDEX];
20082 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20083 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20086 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20087 si = ary[DEFERRED_USER_DEFINED_INDEX];
20090 if (doinit && (si || invlist)) {
20093 SV * msg = newSVpvs_flags("", SVs_TEMP);
20095 SV * prop_definition = handle_user_defined_property(
20096 "", 0, FALSE, /* There is no \p{}, \P{} */
20097 SvPVX_const(si)[1] - '0', /* /i or not has been
20098 stored here for just
20100 TRUE, /* run time */
20101 FALSE, /* This call must find the defn */
20102 si, /* The property definition */
20105 0 /* base level call */
20109 assert(prop_definition == NULL);
20111 Perl_croak(aTHX_ "%" UTF8f,
20112 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20116 _invlist_union(invlist, prop_definition, &invlist);
20117 SvREFCNT_dec_NN(prop_definition);
20120 invlist = prop_definition;
20123 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20124 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20126 ary[INVLIST_INDEX] = invlist;
20127 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20128 ? ONLY_LOCALE_MATCHES_INDEX
20136 /* If requested, return a printable version of what this ANYOF node matches
20139 SV* matches_string = NULL;
20141 /* This function can be called at compile-time, before everything gets
20142 * resolved, in which case we return the currently best available
20143 * information, which is the string that will eventually be used to do
20144 * that resolving, 'si' */
20146 /* Here, we only have 'si' (and possibly some passed-in data in
20147 * 'invlist', which is handled below) If the caller only wants
20148 * 'si', use that. */
20149 if (! output_invlist) {
20150 matches_string = newSVsv(si);
20153 /* But if the caller wants an inversion list of the node, we
20154 * need to parse 'si' and place as much as possible in the
20155 * desired output inversion list, making 'matches_string' only
20156 * contain the currently unresolvable things */
20157 const char *si_string = SvPVX(si);
20158 STRLEN remaining = SvCUR(si);
20162 /* Ignore everything before and including the first new-line */
20163 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20164 assert (si_string != NULL);
20166 remaining = SvPVX(si) + SvCUR(si) - si_string;
20168 while (remaining > 0) {
20170 /* The data consists of just strings defining user-defined
20171 * property names, but in prior incarnations, and perhaps
20172 * somehow from pluggable regex engines, it could still
20173 * hold hex code point definitions, all of which should be
20174 * legal (or it wouldn't have gotten this far). Each
20175 * component of a range would be separated by a tab, and
20176 * each range by a new-line. If these are found, instead
20177 * add them to the inversion list */
20178 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
20179 |PERL_SCAN_SILENT_NON_PORTABLE;
20180 STRLEN len = remaining;
20181 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20183 /* If the hex decode routine found something, it should go
20184 * up to the next \n */
20185 if ( *(si_string + len) == '\n') {
20186 if (count) { /* 2nd code point on line */
20187 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20190 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20193 goto prepare_for_next_iteration;
20196 /* If the hex decode was instead for the lower range limit,
20197 * save it, and go parse the upper range limit */
20198 if (*(si_string + len) == '\t') {
20199 assert(count == 0);
20203 prepare_for_next_iteration:
20204 si_string += len + 1;
20205 remaining -= len + 1;
20209 /* Here, didn't find a legal hex number. Just add the text
20210 * from here up to the next \n, omitting any trailing
20214 len = strcspn(si_string,
20215 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20217 if (matches_string) {
20218 sv_catpvn(matches_string, si_string, len);
20221 matches_string = newSVpvn(si_string, len);
20223 sv_catpvs(matches_string, " ");
20227 && UCHARAT(si_string)
20228 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20233 if (remaining && UCHARAT(si_string) == '\n') {
20237 } /* end of loop through the text */
20239 assert(matches_string);
20240 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
20241 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20243 } /* end of has an 'si' */
20246 /* Add the stuff that's already known */
20249 /* Again, if the caller doesn't want the output inversion list, put
20250 * everything in 'matches-string' */
20251 if (! output_invlist) {
20252 if ( ! matches_string) {
20253 matches_string = newSVpvs("\n");
20255 sv_catsv(matches_string, invlist_contents(invlist,
20256 TRUE /* traditional style */
20259 else if (! *output_invlist) {
20260 *output_invlist = invlist_clone(invlist, NULL);
20263 _invlist_union(*output_invlist, invlist, output_invlist);
20267 *listsvp = matches_string;
20273 /* reg_skipcomment()
20275 Absorbs an /x style # comment from the input stream,
20276 returning a pointer to the first character beyond the comment, or if the
20277 comment terminates the pattern without anything following it, this returns
20278 one past the final character of the pattern (in other words, RExC_end) and
20279 sets the REG_RUN_ON_COMMENT_SEEN flag.
20281 Note it's the callers responsibility to ensure that we are
20282 actually in /x mode
20286 PERL_STATIC_INLINE char*
20287 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20289 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20293 while (p < RExC_end) {
20294 if (*(++p) == '\n') {
20299 /* we ran off the end of the pattern without ending the comment, so we have
20300 * to add an \n when wrapping */
20301 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20306 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20308 const bool force_to_xmod
20311 /* If the text at the current parse position '*p' is a '(?#...)' comment,
20312 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20313 * is /x whitespace, advance '*p' so that on exit it points to the first
20314 * byte past all such white space and comments */
20316 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20318 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20320 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20323 if (RExC_end - (*p) >= 3
20325 && *(*p + 1) == '?'
20326 && *(*p + 2) == '#')
20328 while (*(*p) != ')') {
20329 if ((*p) == RExC_end)
20330 FAIL("Sequence (?#... not terminated");
20338 const char * save_p = *p;
20339 while ((*p) < RExC_end) {
20341 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20344 else if (*(*p) == '#') {
20345 (*p) = reg_skipcomment(pRExC_state, (*p));
20351 if (*p != save_p) {
20364 Advances the parse position by one byte, unless that byte is the beginning
20365 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20366 those two cases, the parse position is advanced beyond all such comments and
20369 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20373 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20375 PERL_ARGS_ASSERT_NEXTCHAR;
20377 if (RExC_parse < RExC_end) {
20379 || UTF8_IS_INVARIANT(*RExC_parse)
20380 || UTF8_IS_START(*RExC_parse));
20382 RExC_parse += (UTF)
20383 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20386 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20387 FALSE /* Don't force /x */ );
20392 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20394 /* 'size' is the delta number of smallest regnode equivalents to add or
20395 * subtract from the current memory allocated to the regex engine being
20398 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20403 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20404 /* +1 for REG_MAGIC */
20407 if ( RExC_rxi == NULL )
20408 FAIL("Regexp out of space");
20409 RXi_SET(RExC_rx, RExC_rxi);
20411 RExC_emit_start = RExC_rxi->program;
20413 Zero(REGNODE_p(RExC_emit), size, regnode);
20416 #ifdef RE_TRACK_PATTERN_OFFSETS
20417 Renew(RExC_offsets, 2*RExC_size+1, U32);
20419 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20421 RExC_offsets[0] = RExC_size;
20425 STATIC regnode_offset
20426 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20428 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20429 * equivalents space. It aligns and increments RExC_size
20431 * It returns the regnode's offset into the regex engine program */
20433 const regnode_offset ret = RExC_emit;
20435 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20437 PERL_ARGS_ASSERT_REGNODE_GUTS;
20439 SIZE_ALIGN(RExC_size);
20440 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20441 NODE_ALIGN_FILL(REGNODE_p(ret));
20442 #ifndef RE_TRACK_PATTERN_OFFSETS
20443 PERL_UNUSED_ARG(name);
20444 PERL_UNUSED_ARG(op);
20446 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20448 if (RExC_offsets) { /* MJD */
20450 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20453 (UV)(RExC_emit) > RExC_offsets[0]
20454 ? "Overwriting end of array!\n" : "OK",
20456 (UV)(RExC_parse - RExC_start),
20457 (UV)RExC_offsets[0]));
20458 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20465 - reg_node - emit a node
20467 STATIC regnode_offset /* Location. */
20468 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20470 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20471 regnode_offset ptr = ret;
20473 PERL_ARGS_ASSERT_REG_NODE;
20475 assert(regarglen[op] == 0);
20477 FILL_ADVANCE_NODE(ptr, op);
20483 - reganode - emit a node with an argument
20485 STATIC regnode_offset /* Location. */
20486 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20488 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20489 regnode_offset ptr = ret;
20491 PERL_ARGS_ASSERT_REGANODE;
20493 /* ANYOF are special cased to allow non-length 1 args */
20494 assert(regarglen[op] == 1);
20496 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20502 - regpnode - emit a temporary node with a SV* argument
20504 STATIC regnode_offset /* Location. */
20505 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20507 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20508 regnode_offset ptr = ret;
20510 PERL_ARGS_ASSERT_REGPNODE;
20512 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20517 STATIC regnode_offset
20518 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20520 /* emit a node with U32 and I32 arguments */
20522 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20523 regnode_offset ptr = ret;
20525 PERL_ARGS_ASSERT_REG2LANODE;
20527 assert(regarglen[op] == 2);
20529 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20535 - reginsert - insert an operator in front of already-emitted operand
20537 * That means that on exit 'operand' is the offset of the newly inserted
20538 * operator, and the original operand has been relocated.
20540 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20541 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20543 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20544 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20546 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20549 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20550 const regnode_offset operand, const U32 depth)
20555 const int offset = regarglen[(U8)op];
20556 const int size = NODE_STEP_REGNODE + offset;
20557 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20559 PERL_ARGS_ASSERT_REGINSERT;
20560 PERL_UNUSED_CONTEXT;
20561 PERL_UNUSED_ARG(depth);
20562 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20563 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20564 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20565 studying. If this is wrong then we need to adjust RExC_recurse
20566 below like we do with RExC_open_parens/RExC_close_parens. */
20567 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20568 src = REGNODE_p(RExC_emit);
20570 dst = REGNODE_p(RExC_emit);
20572 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20573 * and [perl #133871] shows this can lead to problems, so skip this
20574 * realignment of parens until a later pass when they are reliable */
20575 if (! IN_PARENS_PASS && RExC_open_parens) {
20577 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20578 /* remember that RExC_npar is rex->nparens + 1,
20579 * iow it is 1 more than the number of parens seen in
20580 * the pattern so far. */
20581 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20582 /* note, RExC_open_parens[0] is the start of the
20583 * regex, it can't move. RExC_close_parens[0] is the end
20584 * of the regex, it *can* move. */
20585 if ( paren && RExC_open_parens[paren] >= operand ) {
20586 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20587 RExC_open_parens[paren] += size;
20589 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20591 if ( RExC_close_parens[paren] >= operand ) {
20592 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20593 RExC_close_parens[paren] += size;
20595 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20600 RExC_end_op += size;
20602 while (src > REGNODE_p(operand)) {
20603 StructCopy(--src, --dst, regnode);
20604 #ifdef RE_TRACK_PATTERN_OFFSETS
20605 if (RExC_offsets) { /* MJD 20010112 */
20607 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20611 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20612 ? "Overwriting end of array!\n" : "OK",
20613 (UV)REGNODE_OFFSET(src),
20614 (UV)REGNODE_OFFSET(dst),
20615 (UV)RExC_offsets[0]));
20616 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20617 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20622 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20623 #ifdef RE_TRACK_PATTERN_OFFSETS
20624 if (RExC_offsets) { /* MJD */
20626 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20630 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20631 ? "Overwriting end of array!\n" : "OK",
20632 (UV)REGNODE_OFFSET(place),
20633 (UV)(RExC_parse - RExC_start),
20634 (UV)RExC_offsets[0]));
20635 Set_Node_Offset(place, RExC_parse);
20636 Set_Node_Length(place, 1);
20639 src = NEXTOPER(place);
20641 FILL_NODE(operand, op);
20643 /* Zero out any arguments in the new node */
20644 Zero(src, offset, regnode);
20648 - regtail - set the next-pointer at the end of a node chain of p to val. If
20649 that value won't fit in the space available, instead returns FALSE.
20650 (Except asserts if we can't fit in the largest space the regex
20651 engine is designed for.)
20652 - SEE ALSO: regtail_study
20655 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20656 const regnode_offset p,
20657 const regnode_offset val,
20660 regnode_offset scan;
20661 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20663 PERL_ARGS_ASSERT_REGTAIL;
20665 PERL_UNUSED_ARG(depth);
20668 /* The final node in the chain is the first one with a nonzero next pointer
20670 scan = (regnode_offset) p;
20672 regnode * const temp = regnext(REGNODE_p(scan));
20674 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20675 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20676 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
20677 SvPV_nolen_const(RExC_mysv), scan,
20678 (temp == NULL ? "->" : ""),
20679 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20684 scan = REGNODE_OFFSET(temp);
20687 /* Populate this node's next pointer */
20688 assert(val >= scan);
20689 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20690 assert((UV) (val - scan) <= U32_MAX);
20691 ARG_SET(REGNODE_p(scan), val - scan);
20694 if (val - scan > U16_MAX) {
20695 /* Populate this with something that won't loop and will likely
20696 * lead to a crash if the caller ignores the failure return, and
20697 * execution continues */
20698 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20701 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20709 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20710 - Look for optimizable sequences at the same time.
20711 - currently only looks for EXACT chains.
20713 This is experimental code. The idea is to use this routine to perform
20714 in place optimizations on branches and groups as they are constructed,
20715 with the long term intention of removing optimization from study_chunk so
20716 that it is purely analytical.
20718 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20719 to control which is which.
20721 This used to return a value that was ignored. It was a problem that it is
20722 #ifdef'd to be another function that didn't return a value. khw has changed it
20723 so both currently return a pass/fail return.
20726 /* TODO: All four parms should be const */
20729 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20730 const regnode_offset val, U32 depth)
20732 regnode_offset scan;
20734 #ifdef EXPERIMENTAL_INPLACESCAN
20737 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20739 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20742 /* Find last node. */
20746 regnode * const temp = regnext(REGNODE_p(scan));
20747 #ifdef EXPERIMENTAL_INPLACESCAN
20748 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20749 bool unfolded_multi_char; /* Unexamined in this routine */
20750 if (join_exact(pRExC_state, scan, &min,
20751 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20752 return TRUE; /* Was return EXACT */
20756 switch (OP(REGNODE_p(scan))) {
20763 case EXACTFU_S_EDGE:
20764 case EXACTFAA_NO_TRIE:
20771 if( exact == PSEUDO )
20772 exact= OP(REGNODE_p(scan));
20773 else if ( exact != OP(REGNODE_p(scan)) )
20782 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20783 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20784 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
20785 SvPV_nolen_const(RExC_mysv),
20787 PL_reg_name[exact]);
20791 scan = REGNODE_OFFSET(temp);
20794 DEBUG_PARSE_MSG("");
20795 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20796 Perl_re_printf( aTHX_
20797 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20798 SvPV_nolen_const(RExC_mysv),
20803 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20804 assert((UV) (val - scan) <= U32_MAX);
20805 ARG_SET(REGNODE_p(scan), val - scan);
20808 if (val - scan > U16_MAX) {
20809 /* Populate this with something that won't loop and will likely
20810 * lead to a crash if the caller ignores the failure return, and
20811 * execution continues */
20812 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20815 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20818 return TRUE; /* Was 'return exact' */
20823 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20825 /* Returns an inversion list of all the code points matched by the
20826 * ANYOFM/NANYOFM node 'n' */
20828 SV * cp_list = _new_invlist(-1);
20829 const U8 lowest = (U8) ARG(n);
20832 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20834 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20836 /* Starting with the lowest code point, any code point that ANDed with the
20837 * mask yields the lowest code point is in the set */
20838 for (i = lowest; i <= 0xFF; i++) {
20839 if ((i & FLAGS(n)) == ARG(n)) {
20840 cp_list = add_cp_to_invlist(cp_list, i);
20843 /* We know how many code points (a power of two) that are in the
20844 * set. No use looking once we've got that number */
20845 if (count >= needed) break;
20849 if (OP(n) == NANYOFM) {
20850 _invlist_invert(cp_list);
20856 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20861 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20866 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20868 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20869 if (flags & (1<<bit)) {
20870 if (!set++ && lead)
20871 Perl_re_printf( aTHX_ "%s", lead);
20872 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20877 Perl_re_printf( aTHX_ "\n");
20879 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20884 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20890 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20892 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20893 if (flags & (1<<bit)) {
20894 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20897 if (!set++ && lead)
20898 Perl_re_printf( aTHX_ "%s", lead);
20899 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20902 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20903 if (!set++ && lead) {
20904 Perl_re_printf( aTHX_ "%s", lead);
20907 case REGEX_UNICODE_CHARSET:
20908 Perl_re_printf( aTHX_ "UNICODE");
20910 case REGEX_LOCALE_CHARSET:
20911 Perl_re_printf( aTHX_ "LOCALE");
20913 case REGEX_ASCII_RESTRICTED_CHARSET:
20914 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20916 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20917 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20920 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20926 Perl_re_printf( aTHX_ "\n");
20928 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20934 Perl_regdump(pTHX_ const regexp *r)
20938 SV * const sv = sv_newmortal();
20939 SV *dsv= sv_newmortal();
20940 RXi_GET_DECL(r, ri);
20941 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20943 PERL_ARGS_ASSERT_REGDUMP;
20945 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20947 /* Header fields of interest. */
20948 for (i = 0; i < 2; i++) {
20949 if (r->substrs->data[i].substr) {
20950 RE_PV_QUOTED_DECL(s, 0, dsv,
20951 SvPVX_const(r->substrs->data[i].substr),
20952 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20953 PL_dump_re_max_len);
20954 Perl_re_printf( aTHX_
20955 "%s %s%s at %" IVdf "..%" UVuf " ",
20956 i ? "floating" : "anchored",
20958 RE_SV_TAIL(r->substrs->data[i].substr),
20959 (IV)r->substrs->data[i].min_offset,
20960 (UV)r->substrs->data[i].max_offset);
20962 else if (r->substrs->data[i].utf8_substr) {
20963 RE_PV_QUOTED_DECL(s, 1, dsv,
20964 SvPVX_const(r->substrs->data[i].utf8_substr),
20965 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20967 Perl_re_printf( aTHX_
20968 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20969 i ? "floating" : "anchored",
20971 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20972 (IV)r->substrs->data[i].min_offset,
20973 (UV)r->substrs->data[i].max_offset);
20977 if (r->check_substr || r->check_utf8)
20978 Perl_re_printf( aTHX_
20980 ( r->check_substr == r->substrs->data[1].substr
20981 && r->check_utf8 == r->substrs->data[1].utf8_substr
20982 ? "(checking floating" : "(checking anchored"));
20983 if (r->intflags & PREGf_NOSCAN)
20984 Perl_re_printf( aTHX_ " noscan");
20985 if (r->extflags & RXf_CHECK_ALL)
20986 Perl_re_printf( aTHX_ " isall");
20987 if (r->check_substr || r->check_utf8)
20988 Perl_re_printf( aTHX_ ") ");
20990 if (ri->regstclass) {
20991 regprop(r, sv, ri->regstclass, NULL, NULL);
20992 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20994 if (r->intflags & PREGf_ANCH) {
20995 Perl_re_printf( aTHX_ "anchored");
20996 if (r->intflags & PREGf_ANCH_MBOL)
20997 Perl_re_printf( aTHX_ "(MBOL)");
20998 if (r->intflags & PREGf_ANCH_SBOL)
20999 Perl_re_printf( aTHX_ "(SBOL)");
21000 if (r->intflags & PREGf_ANCH_GPOS)
21001 Perl_re_printf( aTHX_ "(GPOS)");
21002 Perl_re_printf( aTHX_ " ");
21004 if (r->intflags & PREGf_GPOS_SEEN)
21005 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
21006 if (r->intflags & PREGf_SKIP)
21007 Perl_re_printf( aTHX_ "plus ");
21008 if (r->intflags & PREGf_IMPLICIT)
21009 Perl_re_printf( aTHX_ "implicit ");
21010 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
21011 if (r->extflags & RXf_EVAL_SEEN)
21012 Perl_re_printf( aTHX_ "with eval ");
21013 Perl_re_printf( aTHX_ "\n");
21015 regdump_extflags("r->extflags: ", r->extflags);
21016 regdump_intflags("r->intflags: ", r->intflags);
21019 PERL_ARGS_ASSERT_REGDUMP;
21020 PERL_UNUSED_CONTEXT;
21021 PERL_UNUSED_ARG(r);
21022 #endif /* DEBUGGING */
21025 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21028 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
21029 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
21030 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
21031 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
21032 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
21033 || _CC_VERTSPACE != 15
21034 # error Need to adjust order of anyofs[]
21036 static const char * const anyofs[] = {
21073 - regprop - printable representation of opcode, with run time support
21077 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21081 RXi_GET_DECL(prog, progi);
21082 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21084 PERL_ARGS_ASSERT_REGPROP;
21088 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
21089 if (pRExC_state) { /* This gives more info, if we have it */
21090 FAIL3("panic: corrupted regexp opcode %d > %d",
21091 (int)OP(o), (int)REGNODE_MAX);
21094 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21095 (int)OP(o), (int)REGNODE_MAX);
21098 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21100 k = PL_regkind[OP(o)];
21103 sv_catpvs(sv, " ");
21104 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21105 * is a crude hack but it may be the best for now since
21106 * we have no flag "this EXACTish node was UTF-8"
21108 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21109 PL_colors[0], PL_colors[1],
21110 PERL_PV_ESCAPE_UNI_DETECT |
21111 PERL_PV_ESCAPE_NONASCII |
21112 PERL_PV_PRETTY_ELLIPSES |
21113 PERL_PV_PRETTY_LTGT |
21114 PERL_PV_PRETTY_NOCLEAR
21116 } else if (k == TRIE) {
21117 /* print the details of the trie in dumpuntil instead, as
21118 * progi->data isn't available here */
21119 const char op = OP(o);
21120 const U32 n = ARG(o);
21121 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21122 (reg_ac_data *)progi->data->data[n] :
21124 const reg_trie_data * const trie
21125 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21127 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21128 DEBUG_TRIE_COMPILE_r({
21130 sv_catpvs(sv, "(JUMP)");
21131 Perl_sv_catpvf(aTHX_ sv,
21132 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21133 (UV)trie->startstate,
21134 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21135 (UV)trie->wordcount,
21138 (UV)TRIE_CHARCOUNT(trie),
21139 (UV)trie->uniquecharcount
21142 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21143 sv_catpvs(sv, "[");
21144 (void) put_charclass_bitmap_innards(sv,
21145 ((IS_ANYOF_TRIE(op))
21147 : TRIE_BITMAP(trie)),
21154 sv_catpvs(sv, "]");
21156 } else if (k == CURLY) {
21157 U32 lo = ARG1(o), hi = ARG2(o);
21158 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21159 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21160 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21161 if (hi == REG_INFTY)
21162 sv_catpvs(sv, "INFTY");
21164 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21165 sv_catpvs(sv, "}");
21167 else if (k == WHILEM && o->flags) /* Ordinal/of */
21168 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21169 else if (k == REF || k == OPEN || k == CLOSE
21170 || k == GROUPP || OP(o)==ACCEPT)
21172 AV *name_list= NULL;
21173 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21174 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21175 if ( RXp_PAREN_NAMES(prog) ) {
21176 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21177 } else if ( pRExC_state ) {
21178 name_list= RExC_paren_name_list;
21181 if ( k != REF || (OP(o) < REFN)) {
21182 SV **name= av_fetch(name_list, parno, 0 );
21184 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21187 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21188 I32 *nums=(I32*)SvPVX(sv_dat);
21189 SV **name= av_fetch(name_list, nums[0], 0 );
21192 for ( n=0; n<SvIVX(sv_dat); n++ ) {
21193 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21194 (n ? "," : ""), (IV)nums[n]);
21196 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21200 if ( k == REF && reginfo) {
21201 U32 n = ARG(o); /* which paren pair */
21202 I32 ln = prog->offs[n].start;
21203 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21204 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21205 else if (ln == prog->offs[n].end)
21206 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21208 const char *s = reginfo->strbeg + ln;
21209 Perl_sv_catpvf(aTHX_ sv, ": ");
21210 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21211 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21214 } else if (k == GOSUB) {
21215 AV *name_list= NULL;
21216 if ( RXp_PAREN_NAMES(prog) ) {
21217 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21218 } else if ( pRExC_state ) {
21219 name_list= RExC_paren_name_list;
21222 /* Paren and offset */
21223 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21224 (int)((o + (int)ARG2L(o)) - progi->program) );
21226 SV **name= av_fetch(name_list, ARG(o), 0 );
21228 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21231 else if (k == LOGICAL)
21232 /* 2: embedded, otherwise 1 */
21233 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21234 else if (k == ANYOF || k == ANYOFR) {
21238 bool do_sep = FALSE; /* Do we need to separate various components of
21240 /* Set if there is still an unresolved user-defined property */
21241 SV *unresolved = NULL;
21243 /* Things that are ignored except when the runtime locale is UTF-8 */
21244 SV *only_utf8_locale_invlist = NULL;
21246 /* Code points that don't fit in the bitmap */
21247 SV *nonbitmap_invlist = NULL;
21249 /* And things that aren't in the bitmap, but are small enough to be */
21250 SV* bitmap_range_not_in_bitmap = NULL;
21254 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21260 flags = ANYOF_FLAGS(o);
21261 bitmap = ANYOF_BITMAP(o);
21265 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21266 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21267 sv_catpvs(sv, "{utf8-locale-reqd}");
21269 if (flags & ANYOFL_FOLD) {
21270 sv_catpvs(sv, "{i}");
21274 inverted = flags & ANYOF_INVERT;
21276 /* If there is stuff outside the bitmap, get it */
21277 if (arg != ANYOF_ONLY_HAS_BITMAP) {
21278 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21279 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21281 ANYOFRbase(o) + ANYOFRdelta(o));
21284 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21285 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21287 &only_utf8_locale_invlist,
21288 &nonbitmap_invlist);
21290 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21292 &only_utf8_locale_invlist,
21293 &nonbitmap_invlist);
21297 /* The non-bitmap data may contain stuff that could fit in the
21298 * bitmap. This could come from a user-defined property being
21299 * finally resolved when this call was done; or much more likely
21300 * because there are matches that require UTF-8 to be valid, and so
21301 * aren't in the bitmap (or ANYOFR). This is teased apart later */
21302 _invlist_intersection(nonbitmap_invlist,
21304 &bitmap_range_not_in_bitmap);
21305 /* Leave just the things that don't fit into the bitmap */
21306 _invlist_subtract(nonbitmap_invlist,
21308 &nonbitmap_invlist);
21311 /* Obey this flag to add all above-the-bitmap code points */
21312 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21313 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21314 NUM_ANYOF_CODE_POINTS,
21318 /* Ready to start outputting. First, the initial left bracket */
21319 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21321 /* ANYOFH by definition doesn't have anything that will fit inside the
21322 * bitmap; ANYOFR may or may not. */
21323 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21324 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21325 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21327 /* Then all the things that could fit in the bitmap */
21328 do_sep = put_charclass_bitmap_innards(sv,
21330 bitmap_range_not_in_bitmap,
21331 only_utf8_locale_invlist,
21335 /* Can't try inverting for a
21336 * better display if there
21337 * are things that haven't
21340 || inRANGE(OP(o), ANYOFR, ANYOFRb));
21341 SvREFCNT_dec(bitmap_range_not_in_bitmap);
21343 /* If there are user-defined properties which haven't been defined
21344 * yet, output them. If the result is not to be inverted, it is
21345 * clearest to output them in a separate [] from the bitmap range
21346 * stuff. If the result is to be complemented, we have to show
21347 * everything in one [], as the inversion applies to the whole
21348 * thing. Use {braces} to separate them from anything in the
21349 * bitmap and anything above the bitmap. */
21352 if (! do_sep) { /* If didn't output anything in the bitmap
21354 sv_catpvs(sv, "^");
21356 sv_catpvs(sv, "{");
21359 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21362 sv_catsv(sv, unresolved);
21364 sv_catpvs(sv, "}");
21366 do_sep = ! inverted;
21370 /* And, finally, add the above-the-bitmap stuff */
21371 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21374 /* See if truncation size is overridden */
21375 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21376 ? PL_dump_re_max_len
21379 /* This is output in a separate [] */
21381 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21384 /* And, for easy of understanding, it is shown in the
21385 * uncomplemented form if possible. The one exception being if
21386 * there are unresolved items, where the inversion has to be
21387 * delayed until runtime */
21388 if (inverted && ! unresolved) {
21389 _invlist_invert(nonbitmap_invlist);
21390 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21393 contents = invlist_contents(nonbitmap_invlist,
21394 FALSE /* output suitable for catsv */
21397 /* If the output is shorter than the permissible maximum, just do it. */
21398 if (SvCUR(contents) <= dump_len) {
21399 sv_catsv(sv, contents);
21402 const char * contents_string = SvPVX(contents);
21403 STRLEN i = dump_len;
21405 /* Otherwise, start at the permissible max and work back to the
21406 * first break possibility */
21407 while (i > 0 && contents_string[i] != ' ') {
21410 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21411 find a legal break */
21415 sv_catpvn(sv, contents_string, i);
21416 sv_catpvs(sv, "...");
21419 SvREFCNT_dec_NN(contents);
21420 SvREFCNT_dec_NN(nonbitmap_invlist);
21423 /* And finally the matching, closing ']' */
21424 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21426 if (OP(o) == ANYOFHs) {
21427 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21429 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21430 U8 lowest = (OP(o) != ANYOFHr)
21432 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21433 U8 highest = (OP(o) == ANYOFHr)
21434 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21435 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21439 if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21442 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21443 if (lowest != highest) {
21444 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21446 Perl_sv_catpvf(aTHX_ sv, ")");
21450 SvREFCNT_dec(unresolved);
21452 else if (k == ANYOFM) {
21453 SV * cp_list = get_ANYOFM_contents(o);
21455 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21456 if (OP(o) == NANYOFM) {
21457 _invlist_invert(cp_list);
21460 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21461 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21463 SvREFCNT_dec(cp_list);
21465 else if (k == POSIXD || k == NPOSIXD) {
21466 U8 index = FLAGS(o) * 2;
21467 if (index < C_ARRAY_LENGTH(anyofs)) {
21468 if (*anyofs[index] != '[') {
21469 sv_catpvs(sv, "[");
21471 sv_catpv(sv, anyofs[index]);
21472 if (*anyofs[index] != '[') {
21473 sv_catpvs(sv, "]");
21477 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21480 else if (k == BOUND || k == NBOUND) {
21481 /* Must be synced with order of 'bound_type' in regcomp.h */
21482 const char * const bounds[] = {
21483 "", /* Traditional */
21489 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21490 sv_catpv(sv, bounds[FLAGS(o)]);
21492 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21493 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21495 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21497 Perl_sv_catpvf(aTHX_ sv, "]");
21499 else if (OP(o) == SBOL)
21500 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21502 /* add on the verb argument if there is one */
21503 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21505 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21506 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21508 sv_catpvs(sv, ":NULL");
21511 PERL_UNUSED_CONTEXT;
21512 PERL_UNUSED_ARG(sv);
21513 PERL_UNUSED_ARG(o);
21514 PERL_UNUSED_ARG(prog);
21515 PERL_UNUSED_ARG(reginfo);
21516 PERL_UNUSED_ARG(pRExC_state);
21517 #endif /* DEBUGGING */
21523 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21524 { /* Assume that RE_INTUIT is set */
21525 /* Returns an SV containing a string that must appear in the target for it
21526 * to match, or NULL if nothing is known that must match.
21528 * CAUTION: the SV can be freed during execution of the regex engine */
21530 struct regexp *const prog = ReANY(r);
21531 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21533 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21534 PERL_UNUSED_CONTEXT;
21538 if (prog->maxlen > 0) {
21539 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21540 ? prog->check_utf8 : prog->check_substr);
21542 if (!PL_colorset) reginitcolors();
21543 Perl_re_printf( aTHX_
21544 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21546 RX_UTF8(r) ? "utf8 " : "",
21547 PL_colors[5], PL_colors[0],
21550 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21554 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21555 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21561 handles refcounting and freeing the perl core regexp structure. When
21562 it is necessary to actually free the structure the first thing it
21563 does is call the 'free' method of the regexp_engine associated to
21564 the regexp, allowing the handling of the void *pprivate; member
21565 first. (This routine is not overridable by extensions, which is why
21566 the extensions free is called first.)
21568 See regdupe and regdupe_internal if you change anything here.
21570 #ifndef PERL_IN_XSUB_RE
21572 Perl_pregfree(pTHX_ REGEXP *r)
21578 Perl_pregfree2(pTHX_ REGEXP *rx)
21580 struct regexp *const r = ReANY(rx);
21581 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21583 PERL_ARGS_ASSERT_PREGFREE2;
21588 if (r->mother_re) {
21589 ReREFCNT_dec(r->mother_re);
21591 CALLREGFREE_PVT(rx); /* free the private data */
21592 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21596 for (i = 0; i < 2; i++) {
21597 SvREFCNT_dec(r->substrs->data[i].substr);
21598 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21600 Safefree(r->substrs);
21602 RX_MATCH_COPY_FREE(rx);
21603 #ifdef PERL_ANY_COW
21604 SvREFCNT_dec(r->saved_copy);
21607 SvREFCNT_dec(r->qr_anoncv);
21608 if (r->recurse_locinput)
21609 Safefree(r->recurse_locinput);
21615 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21616 except that dsv will be created if NULL.
21618 This function is used in two main ways. First to implement
21619 $r = qr/....; $s = $$r;
21621 Secondly, it is used as a hacky workaround to the structural issue of
21623 being stored in the regexp structure which is in turn stored in
21624 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21625 could be PL_curpm in multiple contexts, and could require multiple
21626 result sets being associated with the pattern simultaneously, such
21627 as when doing a recursive match with (??{$qr})
21629 The solution is to make a lightweight copy of the regexp structure
21630 when a qr// is returned from the code executed by (??{$qr}) this
21631 lightweight copy doesn't actually own any of its data except for
21632 the starp/end and the actual regexp structure itself.
21638 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21640 struct regexp *drx;
21641 struct regexp *const srx = ReANY(ssv);
21642 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21644 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21647 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21649 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21651 /* our only valid caller, sv_setsv_flags(), should have done
21652 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21653 assert(!SvOOK(dsv));
21654 assert(!SvIsCOW(dsv));
21655 assert(!SvROK(dsv));
21657 if (SvPVX_const(dsv)) {
21659 Safefree(SvPVX(dsv));
21664 SvOK_off((SV *)dsv);
21667 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21668 * the LV's xpvlenu_rx will point to a regexp body, which
21669 * we allocate here */
21670 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21671 assert(!SvPVX(dsv));
21672 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21673 temp->sv_any = NULL;
21674 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21675 SvREFCNT_dec_NN(temp);
21676 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21677 ing below will not set it. */
21678 SvCUR_set(dsv, SvCUR(ssv));
21681 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21682 sv_force_normal(sv) is called. */
21686 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21687 SvPV_set(dsv, RX_WRAPPED(ssv));
21688 /* We share the same string buffer as the original regexp, on which we
21689 hold a reference count, incremented when mother_re is set below.
21690 The string pointer is copied here, being part of the regexp struct.
21692 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21693 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21697 const I32 npar = srx->nparens+1;
21698 Newx(drx->offs, npar, regexp_paren_pair);
21699 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21701 if (srx->substrs) {
21703 Newx(drx->substrs, 1, struct reg_substr_data);
21704 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21706 for (i = 0; i < 2; i++) {
21707 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21708 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21711 /* check_substr and check_utf8, if non-NULL, point to either their
21712 anchored or float namesakes, and don't hold a second reference. */
21714 RX_MATCH_COPIED_off(dsv);
21715 #ifdef PERL_ANY_COW
21716 drx->saved_copy = NULL;
21718 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21719 SvREFCNT_inc_void(drx->qr_anoncv);
21720 if (srx->recurse_locinput)
21721 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21728 /* regfree_internal()
21730 Free the private data in a regexp. This is overloadable by
21731 extensions. Perl takes care of the regexp structure in pregfree(),
21732 this covers the *pprivate pointer which technically perl doesn't
21733 know about, however of course we have to handle the
21734 regexp_internal structure when no extension is in use.
21736 Note this is called before freeing anything in the regexp
21741 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21743 struct regexp *const r = ReANY(rx);
21744 RXi_GET_DECL(r, ri);
21745 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21747 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21757 SV *dsv= sv_newmortal();
21758 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21759 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21760 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21761 PL_colors[4], PL_colors[5], s);
21765 #ifdef RE_TRACK_PATTERN_OFFSETS
21767 Safefree(ri->u.offsets); /* 20010421 MJD */
21769 if (ri->code_blocks)
21770 S_free_codeblocks(aTHX_ ri->code_blocks);
21773 int n = ri->data->count;
21776 /* If you add a ->what type here, update the comment in regcomp.h */
21777 switch (ri->data->what[n]) {
21783 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21786 Safefree(ri->data->data[n]);
21792 { /* Aho Corasick add-on structure for a trie node.
21793 Used in stclass optimization only */
21795 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21796 #ifdef USE_ITHREADS
21799 refcount = --aho->refcount;
21802 PerlMemShared_free(aho->states);
21803 PerlMemShared_free(aho->fail);
21804 /* do this last!!!! */
21805 PerlMemShared_free(ri->data->data[n]);
21806 /* we should only ever get called once, so
21807 * assert as much, and also guard the free
21808 * which /might/ happen twice. At the least
21809 * it will make code anlyzers happy and it
21810 * doesn't cost much. - Yves */
21811 assert(ri->regstclass);
21812 if (ri->regstclass) {
21813 PerlMemShared_free(ri->regstclass);
21814 ri->regstclass = 0;
21821 /* trie structure. */
21823 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21824 #ifdef USE_ITHREADS
21827 refcount = --trie->refcount;
21830 PerlMemShared_free(trie->charmap);
21831 PerlMemShared_free(trie->states);
21832 PerlMemShared_free(trie->trans);
21834 PerlMemShared_free(trie->bitmap);
21836 PerlMemShared_free(trie->jump);
21837 PerlMemShared_free(trie->wordinfo);
21838 /* do this last!!!! */
21839 PerlMemShared_free(ri->data->data[n]);
21844 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21845 ri->data->what[n]);
21848 Safefree(ri->data->what);
21849 Safefree(ri->data);
21855 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21856 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21857 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21860 =for apidoc_section REGEXP Functions
21861 =for apidoc re_dup_guts
21862 Duplicate a regexp.
21864 This routine is expected to clone a given regexp structure. It is only
21865 compiled under USE_ITHREADS.
21867 After all of the core data stored in struct regexp is duplicated
21868 the regexp_engine.dupe method is used to copy any private data
21869 stored in the *pprivate pointer. This allows extensions to handle
21870 any duplication they need to do.
21874 See pregfree() and regfree_internal() if you change anything here.
21876 #if defined(USE_ITHREADS)
21877 #ifndef PERL_IN_XSUB_RE
21879 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21882 const struct regexp *r = ReANY(sstr);
21883 struct regexp *ret = ReANY(dstr);
21885 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21887 npar = r->nparens+1;
21888 Newx(ret->offs, npar, regexp_paren_pair);
21889 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21891 if (ret->substrs) {
21892 /* Do it this way to avoid reading from *r after the StructCopy().
21893 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21894 cache, it doesn't matter. */
21896 const bool anchored = r->check_substr
21897 ? r->check_substr == r->substrs->data[0].substr
21898 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21899 Newx(ret->substrs, 1, struct reg_substr_data);
21900 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21902 for (i = 0; i < 2; i++) {
21903 ret->substrs->data[i].substr =
21904 sv_dup_inc(ret->substrs->data[i].substr, param);
21905 ret->substrs->data[i].utf8_substr =
21906 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21909 /* check_substr and check_utf8, if non-NULL, point to either their
21910 anchored or float namesakes, and don't hold a second reference. */
21912 if (ret->check_substr) {
21914 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21916 ret->check_substr = ret->substrs->data[0].substr;
21917 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21919 assert(r->check_substr == r->substrs->data[1].substr);
21920 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21922 ret->check_substr = ret->substrs->data[1].substr;
21923 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21925 } else if (ret->check_utf8) {
21927 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21929 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21934 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21935 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21936 if (r->recurse_locinput)
21937 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21940 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21942 if (RX_MATCH_COPIED(dstr))
21943 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21945 ret->subbeg = NULL;
21946 #ifdef PERL_ANY_COW
21947 ret->saved_copy = NULL;
21950 /* Whether mother_re be set or no, we need to copy the string. We
21951 cannot refrain from copying it when the storage points directly to
21952 our mother regexp, because that's
21953 1: a buffer in a different thread
21954 2: something we no longer hold a reference on
21955 so we need to copy it locally. */
21956 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21957 /* set malloced length to a non-zero value so it will be freed
21958 * (otherwise in combination with SVf_FAKE it looks like an alien
21959 * buffer). It doesn't have to be the actual malloced size, since it
21960 * should never be grown */
21961 SvLEN_set(dstr, SvCUR(sstr)+1);
21962 ret->mother_re = NULL;
21964 #endif /* PERL_IN_XSUB_RE */
21969 This is the internal complement to regdupe() which is used to copy
21970 the structure pointed to by the *pprivate pointer in the regexp.
21971 This is the core version of the extension overridable cloning hook.
21972 The regexp structure being duplicated will be copied by perl prior
21973 to this and will be provided as the regexp *r argument, however
21974 with the /old/ structures pprivate pointer value. Thus this routine
21975 may override any copying normally done by perl.
21977 It returns a pointer to the new regexp_internal structure.
21981 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21983 struct regexp *const r = ReANY(rx);
21984 regexp_internal *reti;
21986 RXi_GET_DECL(r, ri);
21988 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21992 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21993 char, regexp_internal);
21994 Copy(ri->program, reti->program, len+1, regnode);
21997 if (ri->code_blocks) {
21999 Newx(reti->code_blocks, 1, struct reg_code_blocks);
22000 Newx(reti->code_blocks->cb, ri->code_blocks->count,
22001 struct reg_code_block);
22002 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22003 ri->code_blocks->count, struct reg_code_block);
22004 for (n = 0; n < ri->code_blocks->count; n++)
22005 reti->code_blocks->cb[n].src_regex = (REGEXP*)
22006 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22007 reti->code_blocks->count = ri->code_blocks->count;
22008 reti->code_blocks->refcnt = 1;
22011 reti->code_blocks = NULL;
22013 reti->regstclass = NULL;
22016 struct reg_data *d;
22017 const int count = ri->data->count;
22020 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22021 char, struct reg_data);
22022 Newx(d->what, count, U8);
22025 for (i = 0; i < count; i++) {
22026 d->what[i] = ri->data->what[i];
22027 switch (d->what[i]) {
22028 /* see also regcomp.h and regfree_internal() */
22029 case 'a': /* actually an AV, but the dup function is identical.
22030 values seem to be "plain sv's" generally. */
22031 case 'r': /* a compiled regex (but still just another SV) */
22032 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22033 this use case should go away, the code could have used
22034 'a' instead - see S_set_ANYOF_arg() for array contents. */
22035 case 'S': /* actually an SV, but the dup function is identical. */
22036 case 'u': /* actually an HV, but the dup function is identical.
22037 values are "plain sv's" */
22038 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22041 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22042 * patterns which could start with several different things. Pre-TRIE
22043 * this was more important than it is now, however this still helps
22044 * in some places, for instance /x?a+/ might produce a SSC equivalent
22045 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22048 /* This is cheating. */
22049 Newx(d->data[i], 1, regnode_ssc);
22050 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22051 reti->regstclass = (regnode*)d->data[i];
22054 /* AHO-CORASICK fail table */
22055 /* Trie stclasses are readonly and can thus be shared
22056 * without duplication. We free the stclass in pregfree
22057 * when the corresponding reg_ac_data struct is freed.
22059 reti->regstclass= ri->regstclass;
22062 /* TRIE transition table */
22064 ((reg_trie_data*)ri->data->data[i])->refcount++;
22067 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22068 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22069 is not from another regexp */
22070 d->data[i] = ri->data->data[i];
22073 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22074 ri->data->what[i]);
22083 reti->name_list_idx = ri->name_list_idx;
22085 #ifdef RE_TRACK_PATTERN_OFFSETS
22086 if (ri->u.offsets) {
22087 Newx(reti->u.offsets, 2*len+1, U32);
22088 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22091 SetProgLen(reti, len);
22094 return (void*)reti;
22097 #endif /* USE_ITHREADS */
22099 #ifndef PERL_IN_XSUB_RE
22102 - regnext - dig the "next" pointer out of a node
22105 Perl_regnext(pTHX_ regnode *p)
22112 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
22113 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22114 (int)OP(p), (int)REGNODE_MAX);
22117 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22127 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22130 STRLEN len = strlen(pat);
22133 const char *message;
22135 PERL_ARGS_ASSERT_RE_CROAK;
22139 Copy(pat, buf, len , char);
22141 buf[len + 1] = '\0';
22142 va_start(args, pat);
22143 msv = vmess(buf, &args);
22145 message = SvPV_const(msv, len);
22148 Copy(message, buf, len , char);
22149 /* len-1 to avoid \n */
22150 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22153 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22155 #ifndef PERL_IN_XSUB_RE
22157 Perl_save_re_context(pTHX)
22162 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22165 const REGEXP * const rx = PM_GETRE(PL_curpm);
22167 nparens = RX_NPARENS(rx);
22170 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22171 * that PL_curpm will be null, but that utf8.pm and the modules it
22172 * loads will only use $1..$3.
22173 * The t/porting/re_context.t test file checks this assumption.
22178 for (i = 1; i <= nparens; i++) {
22179 char digits[TYPE_CHARS(long)];
22180 const STRLEN len = my_snprintf(digits, sizeof(digits),
22182 GV *const *const gvp
22183 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22186 GV * const gv = *gvp;
22187 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22197 S_put_code_point(pTHX_ SV *sv, UV c)
22199 PERL_ARGS_ASSERT_PUT_CODE_POINT;
22202 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22204 else if (isPRINT(c)) {
22205 const char string = (char) c;
22207 /* We use {phrase} as metanotation in the class, so also escape literal
22209 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22210 sv_catpvs(sv, "\\");
22211 sv_catpvn(sv, &string, 1);
22213 else if (isMNEMONIC_CNTRL(c)) {
22214 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22217 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22221 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22224 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22226 /* Appends to 'sv' a displayable version of the range of code points from
22227 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
22228 * that have them, when they occur at the beginning or end of the range.
22229 * It uses hex to output the remaining code points, unless 'allow_literals'
22230 * is true, in which case the printable ASCII ones are output as-is (though
22231 * some of these will be escaped by put_code_point()).
22233 * NOTE: This is designed only for printing ranges of code points that fit
22234 * inside an ANYOF bitmap. Higher code points are simply suppressed
22237 const unsigned int min_range_count = 3;
22239 assert(start <= end);
22241 PERL_ARGS_ASSERT_PUT_RANGE;
22243 while (start <= end) {
22245 const char * format;
22247 if ( end - start < min_range_count
22248 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22250 /* Output a range of 1 or 2 chars individually, or longer ranges
22251 * when printable */
22252 for (; start <= end; start++) {
22253 put_code_point(sv, start);
22258 /* If permitted by the input options, and there is a possibility that
22259 * this range contains a printable literal, look to see if there is
22261 if (allow_literals && start <= MAX_PRINT_A) {
22263 /* If the character at the beginning of the range isn't an ASCII
22264 * printable, effectively split the range into two parts:
22265 * 1) the portion before the first such printable,
22267 * and output them separately. */
22268 if (! isPRINT_A(start)) {
22269 UV temp_end = start + 1;
22271 /* There is no point looking beyond the final possible
22272 * printable, in MAX_PRINT_A */
22273 UV max = MIN(end, MAX_PRINT_A);
22275 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22279 /* Here, temp_end points to one beyond the first printable if
22280 * found, or to one beyond 'max' if not. If none found, make
22281 * sure that we use the entire range */
22282 if (temp_end > MAX_PRINT_A) {
22283 temp_end = end + 1;
22286 /* Output the first part of the split range: the part that
22287 * doesn't have printables, with the parameter set to not look
22288 * for literals (otherwise we would infinitely recurse) */
22289 put_range(sv, start, temp_end - 1, FALSE);
22291 /* The 2nd part of the range (if any) starts here. */
22294 /* We do a continue, instead of dropping down, because even if
22295 * the 2nd part is non-empty, it could be so short that we want
22296 * to output it as individual characters, as tested for at the
22297 * top of this loop. */
22301 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
22302 * output a sub-range of just the digits or letters, then process
22303 * the remaining portion as usual. */
22304 if (isALPHANUMERIC_A(start)) {
22305 UV mask = (isDIGIT_A(start))
22310 UV temp_end = start + 1;
22312 /* Find the end of the sub-range that includes just the
22313 * characters in the same class as the first character in it */
22314 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22319 /* For short ranges, don't duplicate the code above to output
22320 * them; just call recursively */
22321 if (temp_end - start < min_range_count) {
22322 put_range(sv, start, temp_end, FALSE);
22324 else { /* Output as a range */
22325 put_code_point(sv, start);
22326 sv_catpvs(sv, "-");
22327 put_code_point(sv, temp_end);
22329 start = temp_end + 1;
22333 /* We output any other printables as individual characters */
22334 if (isPUNCT_A(start) || isSPACE_A(start)) {
22335 while (start <= end && (isPUNCT_A(start)
22336 || isSPACE_A(start)))
22338 put_code_point(sv, start);
22343 } /* End of looking for literals */
22345 /* Here is not to output as a literal. Some control characters have
22346 * mnemonic names. Split off any of those at the beginning and end of
22347 * the range to print mnemonically. It isn't possible for many of
22348 * these to be in a row, so this won't overwhelm with output */
22350 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22352 while (isMNEMONIC_CNTRL(start) && start <= end) {
22353 put_code_point(sv, start);
22357 /* If this didn't take care of the whole range ... */
22358 if (start <= end) {
22360 /* Look backwards from the end to find the final non-mnemonic
22363 while (isMNEMONIC_CNTRL(temp_end)) {
22367 /* And separately output the interior range that doesn't start
22368 * or end with mnemonics */
22369 put_range(sv, start, temp_end, FALSE);
22371 /* Then output the mnemonic trailing controls */
22372 start = temp_end + 1;
22373 while (start <= end) {
22374 put_code_point(sv, start);
22381 /* As a final resort, output the range or subrange as hex. */
22383 if (start >= NUM_ANYOF_CODE_POINTS) {
22386 else { /* Have to split range at the bitmap boundary */
22387 this_end = (end < NUM_ANYOF_CODE_POINTS)
22389 : NUM_ANYOF_CODE_POINTS - 1;
22391 #if NUM_ANYOF_CODE_POINTS > 256
22392 format = (this_end < 256)
22393 ? "\\x%02" UVXf "-\\x%02" UVXf
22394 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22396 format = "\\x%02" UVXf "-\\x%02" UVXf;
22398 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22399 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22400 GCC_DIAG_RESTORE_STMT;
22406 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22408 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22412 bool allow_literals = TRUE;
22414 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22416 /* Generally, it is more readable if printable characters are output as
22417 * literals, but if a range (nearly) spans all of them, it's best to output
22418 * it as a single range. This code will use a single range if all but 2
22419 * ASCII printables are in it */
22420 invlist_iterinit(invlist);
22421 while (invlist_iternext(invlist, &start, &end)) {
22423 /* If the range starts beyond the final printable, it doesn't have any
22425 if (start > MAX_PRINT_A) {
22429 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22430 * all but two, the range must start and end no later than 2 from
22432 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22433 if (end > MAX_PRINT_A) {
22439 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22440 allow_literals = FALSE;
22445 invlist_iterfinish(invlist);
22447 /* Here we have figured things out. Output each range */
22448 invlist_iterinit(invlist);
22449 while (invlist_iternext(invlist, &start, &end)) {
22450 if (start >= NUM_ANYOF_CODE_POINTS) {
22453 put_range(sv, start, end, allow_literals);
22455 invlist_iterfinish(invlist);
22461 S_put_charclass_bitmap_innards_common(pTHX_
22462 SV* invlist, /* The bitmap */
22463 SV* posixes, /* Under /l, things like [:word:], \S */
22464 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22465 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22466 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22467 const bool invert /* Is the result to be inverted? */
22470 /* Create and return an SV containing a displayable version of the bitmap
22471 * and associated information determined by the input parameters. If the
22472 * output would have been only the inversion indicator '^', NULL is instead
22477 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22480 output = newSVpvs("^");
22483 output = newSVpvs("");
22486 /* First, the code points in the bitmap that are unconditionally there */
22487 put_charclass_bitmap_innards_invlist(output, invlist);
22489 /* Traditionally, these have been placed after the main code points */
22491 sv_catsv(output, posixes);
22494 if (only_utf8 && _invlist_len(only_utf8)) {
22495 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22496 put_charclass_bitmap_innards_invlist(output, only_utf8);
22499 if (not_utf8 && _invlist_len(not_utf8)) {
22500 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22501 put_charclass_bitmap_innards_invlist(output, not_utf8);
22504 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22505 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22506 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22508 /* This is the only list in this routine that can legally contain code
22509 * points outside the bitmap range. The call just above to
22510 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22511 * output them here. There's about a half-dozen possible, and none in
22512 * contiguous ranges longer than 2 */
22513 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22515 SV* above_bitmap = NULL;
22517 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22519 invlist_iterinit(above_bitmap);
22520 while (invlist_iternext(above_bitmap, &start, &end)) {
22523 for (i = start; i <= end; i++) {
22524 put_code_point(output, i);
22527 invlist_iterfinish(above_bitmap);
22528 SvREFCNT_dec_NN(above_bitmap);
22532 if (invert && SvCUR(output) == 1) {
22540 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22542 SV *nonbitmap_invlist,
22543 SV *only_utf8_locale_invlist,
22544 const regnode * const node,
22546 const bool force_as_is_display)
22548 /* Appends to 'sv' a displayable version of the innards of the bracketed
22549 * character class defined by the other arguments:
22550 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22551 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22552 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22553 * none. The reasons for this could be that they require some
22554 * condition such as the target string being or not being in UTF-8
22555 * (under /d), or because they came from a user-defined property that
22556 * was not resolved at the time of the regex compilation (under /u)
22557 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22558 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22559 * 'node' is the regex pattern ANYOF node. It is needed only when the
22560 * above two parameters are not null, and is passed so that this
22561 * routine can tease apart the various reasons for them.
22562 * 'flags' is the flags field of 'node'
22563 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22564 * to invert things to see if that leads to a cleaner display. If
22565 * FALSE, this routine is free to use its judgment about doing this.
22567 * It returns TRUE if there was actually something output. (It may be that
22568 * the bitmap, etc is empty.)
22570 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22571 * bitmap, with the succeeding parameters set to NULL, and the final one to
22575 /* In general, it tries to display the 'cleanest' representation of the
22576 * innards, choosing whether to display them inverted or not, regardless of
22577 * whether the class itself is to be inverted. However, there are some
22578 * cases where it can't try inverting, as what actually matches isn't known
22579 * until runtime, and hence the inversion isn't either. */
22581 bool inverting_allowed = ! force_as_is_display;
22584 STRLEN orig_sv_cur = SvCUR(sv);
22586 SV* invlist; /* Inversion list we accumulate of code points that
22587 are unconditionally matched */
22588 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22590 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22592 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22593 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22596 SV* as_is_display; /* The output string when we take the inputs
22598 SV* inverted_display; /* The output string when we invert the inputs */
22600 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22602 /* We are biased in favor of displaying things without them being inverted,
22603 * as that is generally easier to understand */
22604 const int bias = 5;
22606 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22608 /* Start off with whatever code points are passed in. (We clone, so we
22609 * don't change the caller's list) */
22610 if (nonbitmap_invlist) {
22611 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22612 invlist = invlist_clone(nonbitmap_invlist, NULL);
22614 else { /* Worst case size is every other code point is matched */
22615 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22619 if (OP(node) == ANYOFD) {
22621 /* This flag indicates that the code points below 0x100 in the
22622 * nonbitmap list are precisely the ones that match only when the
22623 * target is UTF-8 (they should all be non-ASCII). */
22624 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22626 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22627 _invlist_subtract(invlist, only_utf8, &invlist);
22630 /* And this flag for matching all non-ASCII 0xFF and below */
22631 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22633 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22636 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22638 /* If either of these flags are set, what matches isn't
22639 * determinable except during execution, so don't know enough here
22641 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22642 inverting_allowed = FALSE;
22645 /* What the posix classes match also varies at runtime, so these
22646 * will be output symbolically. */
22647 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22650 posixes = newSVpvs("");
22651 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22652 if (ANYOF_POSIXL_TEST(node, i)) {
22653 sv_catpv(posixes, anyofs[i]);
22660 /* Accumulate the bit map into the unconditional match list */
22662 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22663 if (BITMAP_TEST(bitmap, i)) {
22666 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22669 invlist = _add_range_to_invlist(invlist, start, i-1);
22674 /* Make sure that the conditional match lists don't have anything in them
22675 * that match unconditionally; otherwise the output is quite confusing.
22676 * This could happen if the code that populates these misses some
22679 _invlist_subtract(only_utf8, invlist, &only_utf8);
22682 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22685 if (only_utf8_locale_invlist) {
22687 /* Since this list is passed in, we have to make a copy before
22689 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22691 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22693 /* And, it can get really weird for us to try outputting an inverted
22694 * form of this list when it has things above the bitmap, so don't even
22696 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22697 inverting_allowed = FALSE;
22701 /* Calculate what the output would be if we take the input as-is */
22702 as_is_display = put_charclass_bitmap_innards_common(invlist,
22709 /* If have to take the output as-is, just do that */
22710 if (! inverting_allowed) {
22711 if (as_is_display) {
22712 sv_catsv(sv, as_is_display);
22713 SvREFCNT_dec_NN(as_is_display);
22716 else { /* But otherwise, create the output again on the inverted input, and
22717 use whichever version is shorter */
22719 int inverted_bias, as_is_bias;
22721 /* We will apply our bias to whichever of the results doesn't have
22731 inverted_bias = bias;
22734 /* Now invert each of the lists that contribute to the output,
22735 * excluding from the result things outside the possible range */
22737 /* For the unconditional inversion list, we have to add in all the
22738 * conditional code points, so that when inverted, they will be gone
22740 _invlist_union(only_utf8, invlist, &invlist);
22741 _invlist_union(not_utf8, invlist, &invlist);
22742 _invlist_union(only_utf8_locale, invlist, &invlist);
22743 _invlist_invert(invlist);
22744 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22747 _invlist_invert(only_utf8);
22748 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22750 else if (not_utf8) {
22752 /* If a code point matches iff the target string is not in UTF-8,
22753 * then complementing the result has it not match iff not in UTF-8,
22754 * which is the same thing as matching iff it is UTF-8. */
22755 only_utf8 = not_utf8;
22759 if (only_utf8_locale) {
22760 _invlist_invert(only_utf8_locale);
22761 _invlist_intersection(only_utf8_locale,
22763 &only_utf8_locale);
22766 inverted_display = put_charclass_bitmap_innards_common(
22771 only_utf8_locale, invert);
22773 /* Use the shortest representation, taking into account our bias
22774 * against showing it inverted */
22775 if ( inverted_display
22776 && ( ! as_is_display
22777 || ( SvCUR(inverted_display) + inverted_bias
22778 < SvCUR(as_is_display) + as_is_bias)))
22780 sv_catsv(sv, inverted_display);
22782 else if (as_is_display) {
22783 sv_catsv(sv, as_is_display);
22786 SvREFCNT_dec(as_is_display);
22787 SvREFCNT_dec(inverted_display);
22790 SvREFCNT_dec_NN(invlist);
22791 SvREFCNT_dec(only_utf8);
22792 SvREFCNT_dec(not_utf8);
22793 SvREFCNT_dec(posixes);
22794 SvREFCNT_dec(only_utf8_locale);
22796 return SvCUR(sv) > orig_sv_cur;
22799 #define CLEAR_OPTSTART \
22800 if (optstart) STMT_START { \
22801 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22802 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22806 #define DUMPUNTIL(b,e) \
22808 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22810 STATIC const regnode *
22811 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22812 const regnode *last, const regnode *plast,
22813 SV* sv, I32 indent, U32 depth)
22815 U8 op = PSEUDO; /* Arbitrary non-END op. */
22816 const regnode *next;
22817 const regnode *optstart= NULL;
22819 RXi_GET_DECL(r, ri);
22820 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22822 PERL_ARGS_ASSERT_DUMPUNTIL;
22824 #ifdef DEBUG_DUMPUNTIL
22825 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22826 last ? last-start : 0, plast ? plast-start : 0);
22829 if (plast && plast < last)
22832 while (PL_regkind[op] != END && (!last || node < last)) {
22834 /* While that wasn't END last time... */
22837 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22839 next = regnext((regnode *)node);
22842 if (OP(node) == OPTIMIZED) {
22843 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22850 regprop(r, sv, node, NULL, NULL);
22851 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22852 (int)(2*indent + 1), "", SvPVX_const(sv));
22854 if (OP(node) != OPTIMIZED) {
22855 if (next == NULL) /* Next ptr. */
22856 Perl_re_printf( aTHX_ " (0)");
22857 else if (PL_regkind[(U8)op] == BRANCH
22858 && PL_regkind[OP(next)] != BRANCH )
22859 Perl_re_printf( aTHX_ " (FAIL)");
22861 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22862 Perl_re_printf( aTHX_ "\n");
22866 if (PL_regkind[(U8)op] == BRANCHJ) {
22869 const regnode *nnode = (OP(next) == LONGJMP
22870 ? regnext((regnode *)next)
22872 if (last && nnode > last)
22874 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22877 else if (PL_regkind[(U8)op] == BRANCH) {
22879 DUMPUNTIL(NEXTOPER(node), next);
22881 else if ( PL_regkind[(U8)op] == TRIE ) {
22882 const regnode *this_trie = node;
22883 const char op = OP(node);
22884 const U32 n = ARG(node);
22885 const reg_ac_data * const ac = op>=AHOCORASICK ?
22886 (reg_ac_data *)ri->data->data[n] :
22888 const reg_trie_data * const trie =
22889 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22891 AV *const trie_words
22892 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22894 const regnode *nextbranch= NULL;
22897 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22898 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22900 Perl_re_indentf( aTHX_ "%s ",
22903 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22904 SvCUR(*elem_ptr), PL_dump_re_max_len,
22905 PL_colors[0], PL_colors[1],
22907 ? PERL_PV_ESCAPE_UNI
22909 | PERL_PV_PRETTY_ELLIPSES
22910 | PERL_PV_PRETTY_LTGT
22915 U16 dist= trie->jump[word_idx+1];
22916 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22917 (UV)((dist ? this_trie + dist : next) - start));
22920 nextbranch= this_trie + trie->jump[0];
22921 DUMPUNTIL(this_trie + dist, nextbranch);
22923 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22924 nextbranch= regnext((regnode *)nextbranch);
22926 Perl_re_printf( aTHX_ "\n");
22929 if (last && next > last)
22934 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22935 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22936 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22938 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22940 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22942 else if ( op == PLUS || op == STAR) {
22943 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22945 else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22946 /* Literal string, where present. */
22947 node += NODE_SZ_STR(node) - 1;
22948 node = NEXTOPER(node);
22951 node = NEXTOPER(node);
22952 node += regarglen[(U8)op];
22954 if (op == CURLYX || op == OPEN || op == SROPEN)
22958 #ifdef DEBUG_DUMPUNTIL
22959 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22964 #endif /* DEBUGGING */
22966 #ifndef PERL_IN_XSUB_RE
22968 # include "uni_keywords.h"
22971 Perl_init_uniprops(pTHX)
22975 char * dump_len_string;
22977 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22978 if ( ! dump_len_string
22979 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22981 PL_dump_re_max_len = 60; /* A reasonable default */
22985 PL_user_def_props = newHV();
22987 # ifdef USE_ITHREADS
22989 HvSHAREKEYS_off(PL_user_def_props);
22990 PL_user_def_props_aTHX = aTHX;
22994 /* Set up the inversion list interpreter-level variables */
22996 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22997 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22998 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22999 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23000 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23001 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23002 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23003 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23004 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23005 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23006 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23007 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23008 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23009 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23010 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23011 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23013 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23014 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23015 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23016 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23017 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23018 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23019 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23020 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23021 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23022 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23023 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23024 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23025 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23026 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23027 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23028 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23030 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23031 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23032 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23033 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23034 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23036 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23037 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23038 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23039 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23041 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23043 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23044 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23046 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23047 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23049 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23050 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23051 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23052 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23053 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23054 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23055 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23056 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23057 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23058 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23059 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23060 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23061 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23062 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23065 /* The below are used only by deprecated functions. They could be removed */
23066 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23067 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23068 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23072 /* These four functions are compiled only in regcomp.c, where they have access
23073 * to the data they return. They are a way for re_comp.c to get access to that
23074 * data without having to compile the whole data structures. */
23077 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23079 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23081 return match_uniprop((U8 *) key, key_len);
23085 Perl_get_prop_definition(pTHX_ const int table_index)
23087 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23089 /* Create and return the inversion list */
23090 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23093 const char * const *
23094 Perl_get_prop_values(const int table_index)
23096 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23098 return UNI_prop_value_ptrs[table_index];
23102 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23104 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23106 return deprecated_property_msgs[warning_offset];
23111 This code was mainly added for backcompat to give a warning for non-portable
23112 code points in user-defined properties. But experiments showed that the
23113 warning in earlier perls were only omitted on overflow, which should be an
23114 error, so there really isnt a backcompat issue, and actually adding the
23115 warning when none was present before might cause breakage, for little gain. So
23116 khw left this code in, but not enabled. Tests were never added.
23119 Ei |const char *|get_extended_utf8_msg|const UV cp
23121 PERL_STATIC_INLINE const char *
23122 S_get_extended_utf8_msg(pTHX_ const UV cp)
23124 U8 dummy[UTF8_MAXBYTES + 1];
23128 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23131 msg = hv_fetchs(msgs, "text", 0);
23134 (void) sv_2mortal((SV *) msgs);
23136 return SvPVX(*msg);
23140 #endif /* end of ! PERL_IN_XSUB_RE */
23143 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23144 const bool ignore_case)
23146 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23147 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23148 * because nothing outside of ASCII will match. Use /m because the input
23149 * string may be a bunch of lines strung together.
23151 * Also sets up the debugging info */
23153 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23155 SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23156 REGEXP * subpattern_re;
23157 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23159 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23164 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23166 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23167 rx_flags = flags & RXf_PMf_COMPILETIME;
23169 #ifndef PERL_IN_XSUB_RE
23170 /* Use the core engine if this file is regcomp.c. That means no
23171 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23172 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23173 &PL_core_reg_engine,
23177 if (isDEBUG_WILDCARD) {
23178 /* Use the special debugging engine if this file is re_comp.c and wants
23179 * to output the wildcard matching. This uses whatever
23180 * 'use re "Debug ..." is in effect */
23181 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23187 /* Use the special wildcard engine if this file is re_comp.c and
23188 * doesn't want to output the wildcard matching. This uses whatever
23189 * 'use re "Debug ..." is in effect for compilation, but this engine
23190 * structure has been set up so that it uses the core engine for
23191 * execution, so no execution debugging as a result of re.pm will be
23193 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23197 /* XXX The above has the effect that any user-supplied regex engine
23198 * won't be called for matching wildcards. That might be good, or bad.
23199 * It could be changed in several ways. The reason it is done the
23200 * current way is to avoid having to save and restore
23201 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
23202 * could be used. Another suggestion is to keep the authoritative
23203 * value of the debug flags in a thread-local variable and add set/get
23204 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23205 * Still another is to pass a flag, say in the engine's intflags that
23206 * would be checked each time before doing the debug output */
23210 assert(subpattern_re); /* Should have died if didn't compile successfully */
23211 return subpattern_re;
23215 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23216 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23219 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23221 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23225 /* The compilation has set things up so that if the program doesn't want to
23226 * see the wildcard matching procedure, it will get the core execution
23227 * engine, which is subject only to -Dr. So we have to turn that off
23228 * around this procedure */
23229 if (! isDEBUG_WILDCARD) {
23230 /* Note! Casts away 'volatile' */
23232 PL_debug &= ~ DEBUG_r_FLAG;
23235 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23243 S_handle_user_defined_property(pTHX_
23245 /* Parses the contents of a user-defined property definition; returning the
23246 * expanded definition if possible. If so, the return is an inversion
23249 * If there are subroutines that are part of the expansion and which aren't
23250 * known at the time of the call to this function, this returns what
23251 * parse_uniprop_string() returned for the first one encountered.
23253 * If an error was found, NULL is returned, and 'msg' gets a suitable
23254 * message appended to it. (Appending allows the back trace of how we got
23255 * to the faulty definition to be displayed through nested calls of
23256 * user-defined subs.)
23258 * The caller IS responsible for freeing any returned SV.
23260 * The syntax of the contents is pretty much described in perlunicode.pod,
23261 * but we also allow comments on each line */
23263 const char * name, /* Name of property */
23264 const STRLEN name_len, /* The name's length in bytes */
23265 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23266 const bool to_fold, /* ? Is this under /i */
23267 const bool runtime, /* ? Are we in compile- or run-time */
23268 const bool deferrable, /* Is it ok for this property's full definition
23269 to be deferred until later? */
23270 SV* contents, /* The property's definition */
23271 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
23272 getting called unless this is thought to be
23273 a user-defined property */
23274 SV * msg, /* Any error or warning msg(s) are appended to
23276 const STRLEN level) /* Recursion level of this call */
23279 const char * string = SvPV_const(contents, len);
23280 const char * const e = string + len;
23281 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23282 const STRLEN msgs_length_on_entry = SvCUR(msg);
23284 const char * s0 = string; /* Points to first byte in the current line
23285 being parsed in 'string' */
23286 const char overflow_msg[] = "Code point too large in \"";
23287 SV* running_definition = NULL;
23289 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23291 *user_defined_ptr = TRUE;
23293 /* Look at each line */
23295 const char * s; /* Current byte */
23296 char op = '+'; /* Default operation is 'union' */
23297 IV min = 0; /* range begin code point */
23298 IV max = -1; /* and range end */
23299 SV* this_definition;
23301 /* Skip comment lines */
23303 s0 = strchr(s0, '\n');
23311 /* For backcompat, allow an empty first line */
23317 /* First character in the line may optionally be the operation */
23326 /* If the line is one or two hex digits separated by blank space, its
23327 * a range; otherwise it is either another user-defined property or an
23332 if (! isXDIGIT(*s)) {
23333 goto check_if_property;
23336 do { /* Each new hex digit will add 4 bits. */
23337 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23338 s = strchr(s, '\n');
23342 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23343 sv_catpv(msg, overflow_msg);
23344 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23345 UTF8fARG(is_contents_utf8, s - s0, s0));
23346 sv_catpvs(msg, "\"");
23347 goto return_failure;
23350 /* Accumulate this digit into the value */
23351 min = (min << 4) + READ_XDIGIT(s);
23352 } while (isXDIGIT(*s));
23354 while (isBLANK(*s)) { s++; }
23356 /* We allow comments at the end of the line */
23358 s = strchr(s, '\n');
23364 else if (s < e && *s != '\n') {
23365 if (! isXDIGIT(*s)) {
23366 goto check_if_property;
23369 /* Look for the high point of the range */
23372 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23373 s = strchr(s, '\n');
23377 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23378 sv_catpv(msg, overflow_msg);
23379 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23380 UTF8fARG(is_contents_utf8, s - s0, s0));
23381 sv_catpvs(msg, "\"");
23382 goto return_failure;
23385 max = (max << 4) + READ_XDIGIT(s);
23386 } while (isXDIGIT(*s));
23388 while (isBLANK(*s)) { s++; }
23391 s = strchr(s, '\n');
23396 else if (s < e && *s != '\n') {
23397 goto check_if_property;
23401 if (max == -1) { /* The line only had one entry */
23404 else if (max < min) {
23405 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23406 sv_catpvs(msg, "Illegal range in \"");
23407 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23408 UTF8fARG(is_contents_utf8, s - s0, s0));
23409 sv_catpvs(msg, "\"");
23410 goto return_failure;
23413 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
23415 if ( UNICODE_IS_PERL_EXTENDED(min)
23416 || UNICODE_IS_PERL_EXTENDED(max))
23418 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23420 /* If both code points are non-portable, warn only on the lower
23422 sv_catpv(msg, get_extended_utf8_msg(
23423 (UNICODE_IS_PERL_EXTENDED(min))
23425 sv_catpvs(msg, " in \"");
23426 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23427 UTF8fARG(is_contents_utf8, s - s0, s0));
23428 sv_catpvs(msg, "\"");
23433 /* Here, this line contains a legal range */
23434 this_definition = sv_2mortal(_new_invlist(2));
23435 this_definition = _add_range_to_invlist(this_definition, min, max);
23440 /* Here it isn't a legal range line. See if it is a legal property
23441 * line. First find the end of the meat of the line */
23442 s = strpbrk(s, "#\n");
23447 /* Ignore trailing blanks in keeping with the requirements of
23448 * parse_uniprop_string() */
23450 while (s > s0 && isBLANK_A(*s)) {
23455 this_definition = parse_uniprop_string(s0, s - s0,
23456 is_utf8, to_fold, runtime,
23459 user_defined_ptr, msg,
23461 ? level /* Don't increase level
23462 if input is empty */
23465 if (this_definition == NULL) {
23466 goto return_failure; /* 'msg' should have had the reason
23467 appended to it by the above call */
23470 if (! is_invlist(this_definition)) { /* Unknown at this time */
23471 return newSVsv(this_definition);
23475 s = strchr(s, '\n');
23485 _invlist_union(running_definition, this_definition,
23486 &running_definition);
23489 _invlist_subtract(running_definition, this_definition,
23490 &running_definition);
23493 _invlist_intersection(running_definition, this_definition,
23494 &running_definition);
23497 _invlist_union_complement_2nd(running_definition,
23498 this_definition, &running_definition);
23501 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23502 __FILE__, __LINE__, op);
23506 /* Position past the '\n' */
23508 } /* End of loop through the lines of 'contents' */
23510 /* Here, we processed all the lines in 'contents' without error. If we
23511 * didn't add any warnings, simply return success */
23512 if (msgs_length_on_entry == SvCUR(msg)) {
23514 /* If the expansion was empty, the answer isn't nothing: its an empty
23515 * inversion list */
23516 if (running_definition == NULL) {
23517 running_definition = _new_invlist(1);
23520 return running_definition;
23523 /* Otherwise, add some explanatory text, but we will return success */
23527 running_definition = NULL;
23531 if (name_len > 0) {
23532 sv_catpvs(msg, " in expansion of ");
23533 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23536 return running_definition;
23539 /* As explained below, certain operations need to take place in the first
23540 * thread created. These macros switch contexts */
23541 # ifdef USE_ITHREADS
23542 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23543 PerlInterpreter * save_aTHX = aTHX;
23544 # define SWITCH_TO_GLOBAL_CONTEXT \
23545 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23546 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23547 # define CUR_CONTEXT aTHX
23548 # define ORIGINAL_CONTEXT save_aTHX
23550 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
23551 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23552 # define RESTORE_CONTEXT NOOP
23553 # define CUR_CONTEXT NULL
23554 # define ORIGINAL_CONTEXT NULL
23558 S_delete_recursion_entry(pTHX_ void *key)
23560 /* Deletes the entry used to detect recursion when expanding user-defined
23561 * properties. This is a function so it can be set up to be called even if
23562 * the program unexpectedly quits */
23564 SV ** current_entry;
23565 const STRLEN key_len = strlen((const char *) key);
23566 DECLARATION_FOR_GLOBAL_CONTEXT;
23568 SWITCH_TO_GLOBAL_CONTEXT;
23570 /* If the entry is one of these types, it is a permanent entry, and not the
23571 * one used to detect recursions. This function should delete only the
23572 * recursion entry */
23573 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23575 && ! is_invlist(*current_entry)
23576 && ! SvPOK(*current_entry))
23578 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23586 S_get_fq_name(pTHX_
23587 const char * const name, /* The first non-blank in the \p{}, \P{} */
23588 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23589 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23590 const bool has_colon_colon
23593 /* Returns a mortal SV containing the fully qualified version of the input
23598 fq_name = newSVpvs_flags("", SVs_TEMP);
23600 /* Use the current package if it wasn't included in our input */
23601 if (! has_colon_colon) {
23602 const HV * pkg = (IN_PERL_COMPILETIME)
23604 : CopSTASH(PL_curcop);
23605 const char* pkgname = HvNAME(pkg);
23607 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23608 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23609 sv_catpvs(fq_name, "::");
23612 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23613 UTF8fARG(is_utf8, name_len, name));
23618 S_parse_uniprop_string(pTHX_
23620 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23621 * now. If so, the return is an inversion list.
23623 * If the property is user-defined, it is a subroutine, which in turn
23624 * may call other subroutines. This function will call the whole nest of
23625 * them to get the definition they return; if some aren't known at the time
23626 * of the call to this function, the fully qualified name of the highest
23627 * level sub is returned. It is an error to call this function at runtime
23628 * without every sub defined.
23630 * If an error was found, NULL is returned, and 'msg' gets a suitable
23631 * message appended to it. (Appending allows the back trace of how we got
23632 * to the faulty definition to be displayed through nested calls of
23633 * user-defined subs.)
23635 * The caller should NOT try to free any returned inversion list.
23637 * Other parameters will be set on return as described below */
23639 const char * const name, /* The first non-blank in the \p{}, \P{} */
23640 Size_t name_len, /* Its length in bytes, not including any
23642 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23643 const bool to_fold, /* ? Is this under /i */
23644 const bool runtime, /* TRUE if this is being called at run time */
23645 const bool deferrable, /* TRUE if it's ok for the definition to not be
23646 known at this call */
23647 AV ** strings, /* To return string property values, like named
23649 bool *user_defined_ptr, /* Upon return from this function it will be
23650 set to TRUE if any component is a
23651 user-defined property */
23652 SV * msg, /* Any error or warning msg(s) are appended to
23654 const STRLEN level) /* Recursion level of this call */
23656 char* lookup_name; /* normalized name for lookup in our tables */
23657 unsigned lookup_len; /* Its length */
23658 enum { Not_Strict = 0, /* Some properties have stricter name */
23659 Strict, /* normalization rules, which we decide */
23660 As_Is /* upon based on parsing */
23661 } stricter = Not_Strict;
23663 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23664 * (though it requires extra effort to download them from Unicode and
23665 * compile perl to know about them) */
23666 bool is_nv_type = FALSE;
23668 unsigned int i, j = 0;
23669 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23670 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23671 int table_index = 0; /* The entry number for this property in the table
23672 of all Unicode property names */
23673 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23674 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23675 the normalized name in certain situations */
23676 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23677 part of a package name */
23678 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
23679 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23680 property rather than a Unicode
23682 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23683 if an error. If it is an inversion list,
23684 it is the definition. Otherwise it is a
23685 string containing the fully qualified sub
23687 SV * fq_name = NULL; /* For user-defined properties, the fully
23689 bool invert_return = FALSE; /* ? Do we need to complement the result before
23691 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23692 explicit utf8:: package that we strip
23694 /* The expansion of properties that could be either user-defined or
23695 * official unicode ones is deferred until runtime, including a marker for
23696 * those that might be in the latter category. This boolean indicates if
23697 * we've seen that marker. If not, what we're parsing can't be such an
23698 * official Unicode property whose expansion was deferred */
23699 bool could_be_deferred_official = FALSE;
23701 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23703 /* The input will be normalized into 'lookup_name' */
23704 Newx(lookup_name, name_len, char);
23705 SAVEFREEPV(lookup_name);
23707 /* Parse the input. */
23708 for (i = 0; i < name_len; i++) {
23709 char cur = name[i];
23711 /* Most of the characters in the input will be of this ilk, being parts
23713 if (isIDCONT_A(cur)) {
23715 /* Case differences are ignored. Our lookup routine assumes
23716 * everything is lowercase, so normalize to that */
23717 if (isUPPER_A(cur)) {
23718 lookup_name[j++] = toLOWER_A(cur);
23722 if (cur == '_') { /* Don't include these in the normalized name */
23726 lookup_name[j++] = cur;
23728 /* The first character in a user-defined name must be of this type.
23730 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23731 could_be_user_defined = FALSE;
23737 /* Here, the character is not something typically in a name, But these
23738 * two types of characters (and the '_' above) can be freely ignored in
23739 * most situations. Later it may turn out we shouldn't have ignored
23740 * them, and we have to reparse, but we don't have enough information
23741 * yet to make that decision */
23742 if (cur == '-' || isSPACE_A(cur)) {
23743 could_be_user_defined = FALSE;
23747 /* An equals sign or single colon mark the end of the first part of
23748 * the property name */
23750 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23752 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23753 equals_pos = j; /* Note where it occurred in the input */
23754 could_be_user_defined = FALSE;
23758 /* If this looks like it is a marker we inserted at compile time,
23759 * set a flag and otherwise ignore it. If it isn't in the final
23760 * position, keep it as it would have been user input. */
23761 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23763 && could_be_user_defined
23764 && i == name_len - 1)
23767 could_be_deferred_official = TRUE;
23771 /* Otherwise, this character is part of the name. */
23772 lookup_name[j++] = cur;
23774 /* Here it isn't a single colon, so if it is a colon, it must be a
23778 /* A double colon should be a package qualifier. We note its
23779 * position and continue. Note that one could have
23780 * pkg1::pkg2::...::foo
23781 * so that the position at the end of the loop will be just after
23782 * the final qualifier */
23785 non_pkg_begin = i + 1;
23786 lookup_name[j++] = ':';
23787 lun_non_pkg_begin = j;
23789 else { /* Only word chars (and '::') can be in a user-defined name */
23790 could_be_user_defined = FALSE;
23792 } /* End of parsing through the lhs of the property name (or all of it if
23795 # define STRLENs(s) (sizeof("" s "") - 1)
23797 /* If there is a single package name 'utf8::', it is ambiguous. It could
23798 * be for a user-defined property, or it could be a Unicode property, as
23799 * all of them are considered to be for that package. For the purposes of
23800 * parsing the rest of the property, strip it off */
23801 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23802 lookup_name += STRLENs("utf8::");
23803 j -= STRLENs("utf8::");
23804 equals_pos -= STRLENs("utf8::");
23805 stripped_utf8_pkg = TRUE;
23808 /* Here, we are either done with the whole property name, if it was simple;
23809 * or are positioned just after the '=' if it is compound. */
23811 if (equals_pos >= 0) {
23812 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23814 /* Space immediately after the '=' is ignored */
23816 for (; i < name_len; i++) {
23817 if (! isSPACE_A(name[i])) {
23822 /* Most punctuation after the equals indicates a subpattern, like
23824 if ( isPUNCT_A(name[i])
23829 /* A backslash means the real delimitter is the next character,
23830 * but it must be punctuation */
23831 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23833 bool special_property = memEQs(lookup_name, j - 1, "name")
23834 || memEQs(lookup_name, j - 1, "na");
23835 if (! special_property) {
23836 /* Find the property. The table includes the equals sign, so
23837 * we use 'j' as-is */
23838 table_index = do_uniprop_match(lookup_name, j);
23840 if (special_property || table_index) {
23841 REGEXP * subpattern_re;
23842 char open = name[i++];
23844 const char * pos_in_brackets;
23845 const char * const * prop_values;
23848 /* Backslash => delimitter is the character following. We
23849 * already checked that it is punctuation */
23850 if (open == '\\') {
23855 /* This data structure is constructed so that the matching
23856 * closing bracket is 3 past its matching opening. The second
23857 * set of closing is so that if the opening is something like
23858 * ']', the closing will be that as well. Something similar is
23859 * done in toke.c */
23860 pos_in_brackets = memCHRs("([<)]>)]>", open);
23861 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23864 || name[name_len-1] != close
23865 || (escaped && name[name_len-2] != '\\')
23866 /* Also make sure that there are enough characters.
23867 * e.g., '\\\' would show up incorrectly as legal even
23868 * though it is too short */
23869 || (SSize_t) (name_len - i - 1 - escaped) < 0)
23871 sv_catpvs(msg, "Unicode property wildcard not terminated");
23872 goto append_name_to_msg;
23875 Perl_ck_warner_d(aTHX_
23876 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23877 "The Unicode property wildcards feature is experimental");
23879 if (special_property) {
23880 const char * error_msg;
23881 const char * revised_name = name + i;
23882 Size_t revised_name_len = name_len - (i + 1 + escaped);
23884 /* Currently, the only 'special_property' is name, which we
23885 * lookup in _charnames.pm */
23887 if (! load_charnames(newSVpvs("placeholder"),
23888 revised_name, revised_name_len,
23891 sv_catpv(msg, error_msg);
23892 goto append_name_to_msg;
23895 /* Farm this out to a function just to make the current
23896 * function less unwieldy */
23897 if (handle_names_wildcard(revised_name, revised_name_len,
23901 return prop_definition;
23907 prop_values = get_prop_values(table_index);
23909 /* Now create and compile the wildcard subpattern. Use /i
23910 * because the property values are supposed to match with case
23912 subpattern_re = compile_wildcard(name + i,
23913 name_len - i - 1 - escaped,
23917 /* For each legal property value, see if the supplied pattern
23919 while (*prop_values) {
23920 const char * const entry = *prop_values;
23921 const Size_t len = strlen(entry);
23922 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23924 if (execute_wildcard(subpattern_re,
23926 (char *) entry + len,
23930 { /* Here, matched. Add to the returned list */
23931 Size_t total_len = j + len;
23932 SV * sub_invlist = NULL;
23933 char * this_string;
23935 /* We know this is a legal \p{property=value}. Call
23936 * the function to return the list of code points that
23938 Newxz(this_string, total_len + 1, char);
23939 Copy(lookup_name, this_string, j, char);
23940 my_strlcat(this_string, entry, total_len + 1);
23941 SAVEFREEPV(this_string);
23942 sub_invlist = parse_uniprop_string(this_string,
23952 _invlist_union(prop_definition, sub_invlist,
23956 prop_values++; /* Next iteration, look at next propvalue */
23957 } /* End of looking through property values; (the data
23958 structure is terminated by a NULL ptr) */
23960 SvREFCNT_dec_NN(subpattern_re);
23962 if (prop_definition) {
23963 return prop_definition;
23966 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23967 goto append_name_to_msg;
23970 /* Here's how khw thinks we should proceed to handle the properties
23971 * not yet done: Bidi Mirroring Glyph can map to ""
23972 Bidi Paired Bracket can map to ""
23973 Case Folding (both full and simple)
23974 Shouldn't /i be good enough for Full
23975 Decomposition Mapping
23976 Equivalent Unified Ideograph can map to ""
23977 Lowercase Mapping (both full and simple)
23978 NFKC Case Fold can map to ""
23979 Titlecase Mapping (both full and simple)
23980 Uppercase Mapping (both full and simple)
23981 * Handle these the same way Name is done, using say, _wild.pm, but
23982 * having both loose and full, like in charclass_invlists.h.
23983 * Perhaps move block and script to that as they are somewhat large
23984 * in charclass_invlists.h.
23985 * For properties where the default is the code point itself, such
23986 * as any of the case changing mappings, the string would otherwise
23987 * consist of all Unicode code points in UTF-8 strung together.
23988 * This would be impractical. So instead, examine their compiled
23989 * pattern, looking at the ssc. If none, reject the pattern as an
23990 * error. Otherwise run the pattern against every code point in
23991 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23992 * And it might be good to create an API to return the ssc.
23993 * Or handle them like the algorithmic names are done
23995 } /* End of is a wildcard subppattern */
23997 /* \p{name=...} is handled specially. Instead of using the normal
23998 * mechanism involving charclass_invlists.h, it uses _charnames.pm
23999 * which has the necessary (huge) data accessible to it, and which
24000 * doesn't get loaded unless necessary. The legal syntax for names is
24001 * somewhat different than other properties due both to the vagaries of
24002 * a few outlier official names, and the fact that only a few ASCII
24003 * characters are permitted in them */
24004 if ( memEQs(lookup_name, j - 1, "name")
24005 || memEQs(lookup_name, j - 1, "na"))
24010 const char * error_msg;
24012 SV * character_name;
24013 STRLEN character_len;
24018 /* Since the RHS (after skipping initial space) is passed unchanged
24019 * to charnames, and there are different criteria for what are
24020 * legal characters in the name, just parse it here. A character
24021 * name must begin with an ASCII alphabetic */
24022 if (! isALPHA(name[i])) {
24025 lookup_name[j++] = name[i];
24027 for (++i; i < name_len; i++) {
24028 /* Official names can only be in the ASCII range, and only
24029 * certain characters */
24030 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24033 lookup_name[j++] = name[i];
24036 /* Finished parsing, save the name into an SV */
24037 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24039 /* Make sure _charnames is loaded. (The parameters give context
24040 * for any errors generated */
24041 table = load_charnames(character_name, name, name_len, &error_msg);
24042 if (table == NULL) {
24043 sv_catpv(msg, error_msg);
24044 goto append_name_to_msg;
24047 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24048 if (! lookup_loose) {
24050 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24053 PUSHSTACKi(PERLSI_REGCOMP);
24059 XPUSHs(character_name);
24061 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24066 SvREFCNT_inc_simple_void_NN(character);
24073 if (! SvOK(character)) {
24077 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24078 if (character_len == SvCUR(character)) {
24079 prop_definition = add_cp_to_invlist(NULL, cp);
24084 /* First of the remaining characters in the string. */
24085 char * remaining = SvPVX(character) + character_len;
24087 if (strings == NULL) {
24088 goto failed; /* XXX Perhaps a specific msg instead, like
24089 'not available here' */
24092 if (*strings == NULL) {
24093 *strings = newAV();
24096 this_string = newAV();
24097 av_push(this_string, newSVuv(cp));
24100 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24101 av_push(this_string, newSVuv(cp));
24102 remaining += character_len;
24103 } while (remaining < SvEND(character));
24105 av_push(*strings, (SV *) this_string);
24108 return prop_definition;
24111 /* Certain properties whose values are numeric need special handling.
24112 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24113 * purposes of checking if this is one of those properties */
24114 if (memBEGINPs(lookup_name, j, "is")) {
24118 /* Then check if it is one of these specially-handled properties. The
24119 * possibilities are hard-coded because easier this way, and the list
24120 * is unlikely to change.
24122 * All numeric value type properties are of this ilk, and are also
24123 * special in a different way later on. So find those first. There
24124 * are several numeric value type properties in the Unihan DB (which is
24125 * unlikely to be compiled with perl, but we handle it here in case it
24126 * does get compiled). They all end with 'numeric'. The interiors
24127 * aren't checked for the precise property. This would stop working if
24128 * a cjk property were to be created that ended with 'numeric' and
24129 * wasn't a numeric type */
24130 is_nv_type = memEQs(lookup_name + lookup_offset,
24131 j - 1 - lookup_offset, "numericvalue")
24132 || memEQs(lookup_name + lookup_offset,
24133 j - 1 - lookup_offset, "nv")
24134 || ( memENDPs(lookup_name + lookup_offset,
24135 j - 1 - lookup_offset, "numeric")
24136 && ( memBEGINPs(lookup_name + lookup_offset,
24137 j - 1 - lookup_offset, "cjk")
24138 || memBEGINPs(lookup_name + lookup_offset,
24139 j - 1 - lookup_offset, "k")));
24141 || memEQs(lookup_name + lookup_offset,
24142 j - 1 - lookup_offset, "canonicalcombiningclass")
24143 || memEQs(lookup_name + lookup_offset,
24144 j - 1 - lookup_offset, "ccc")
24145 || memEQs(lookup_name + lookup_offset,
24146 j - 1 - lookup_offset, "age")
24147 || memEQs(lookup_name + lookup_offset,
24148 j - 1 - lookup_offset, "in")
24149 || memEQs(lookup_name + lookup_offset,
24150 j - 1 - lookup_offset, "presentin"))
24154 /* Since the stuff after the '=' is a number, we can't throw away
24155 * '-' willy-nilly, as those could be a minus sign. Other stricter
24156 * rules also apply. However, these properties all can have the
24157 * rhs not be a number, in which case they contain at least one
24158 * alphabetic. In those cases, the stricter rules don't apply.
24159 * But the numeric type properties can have the alphas [Ee] to
24160 * signify an exponent, and it is still a number with stricter
24161 * rules. So look for an alpha that signifies not-strict */
24163 for (k = i; k < name_len; k++) {
24164 if ( isALPHA_A(name[k])
24165 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24167 stricter = Not_Strict;
24175 /* A number may have a leading '+' or '-'. The latter is retained
24177 if (name[i] == '+') {
24180 else if (name[i] == '-') {
24181 lookup_name[j++] = '-';
24185 /* Skip leading zeros including single underscores separating the
24186 * zeros, or between the final leading zero and the first other
24188 for (; i < name_len - 1; i++) {
24189 if ( name[i] != '0'
24190 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24197 else { /* No '=' */
24199 /* Only a few properties without an '=' should be parsed with stricter
24200 * rules. The list is unlikely to change. */
24201 if ( memBEGINPs(lookup_name, j, "perl")
24202 && memNEs(lookup_name + 4, j - 4, "space")
24203 && memNEs(lookup_name + 4, j - 4, "word"))
24207 /* We set the inputs back to 0 and the code below will reparse,
24213 /* Here, we have either finished the property, or are positioned to parse
24214 * the remainder, and we know if stricter rules apply. Finish out, if not
24216 for (; i < name_len; i++) {
24217 char cur = name[i];
24219 /* In all instances, case differences are ignored, and we normalize to
24221 if (isUPPER_A(cur)) {
24222 lookup_name[j++] = toLOWER(cur);
24226 /* An underscore is skipped, but not under strict rules unless it
24227 * separates two digits */
24230 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
24231 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24233 lookup_name[j++] = '_';
24238 /* Hyphens are skipped except under strict */
24239 if (cur == '-' && ! stricter) {
24243 /* XXX Bug in documentation. It says white space skipped adjacent to
24244 * non-word char. Maybe we should, but shouldn't skip it next to a dot
24246 if (isSPACE_A(cur) && ! stricter) {
24250 lookup_name[j++] = cur;
24252 /* Unless this is a non-trailing slash, we are done with it */
24253 if (i >= name_len - 1 || cur != '/') {
24259 /* A slash in the 'numeric value' property indicates that what follows
24260 * is a denominator. It can have a leading '+' and '0's that should be
24261 * skipped. But we have never allowed a negative denominator, so treat
24262 * a minus like every other character. (No need to rule out a second
24263 * '/', as that won't match anything anyway */
24266 if (i < name_len && name[i] == '+') {
24270 /* Skip leading zeros including underscores separating digits */
24271 for (; i < name_len - 1; i++) {
24272 if ( name[i] != '0'
24273 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24279 /* Store the first real character in the denominator */
24280 if (i < name_len) {
24281 lookup_name[j++] = name[i];
24286 /* Here are completely done parsing the input 'name', and 'lookup_name'
24287 * contains a copy, normalized.
24289 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24290 * different from without the underscores. */
24291 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
24292 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24293 && UNLIKELY(name[name_len-1] == '_'))
24295 lookup_name[j++] = '&';
24298 /* If the original input began with 'In' or 'Is', it could be a subroutine
24299 * call to a user-defined property instead of a Unicode property name. */
24300 if ( name_len - non_pkg_begin > 2
24301 && name[non_pkg_begin+0] == 'I'
24302 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24304 /* Names that start with In have different characterstics than those
24305 * that start with Is */
24306 if (name[non_pkg_begin+1] == 's') {
24307 starts_with_Is = TRUE;
24311 could_be_user_defined = FALSE;
24314 if (could_be_user_defined) {
24317 /* If the user defined property returns the empty string, it could
24318 * easily be because the pattern is being compiled before the data it
24319 * actually needs to compile is available. This could be argued to be
24320 * a bug in the perl code, but this is a change of behavior for Perl,
24321 * so we handle it. This means that intentionally returning nothing
24322 * will not be resolved until runtime */
24323 bool empty_return = FALSE;
24325 /* Here, the name could be for a user defined property, which are
24326 * implemented as subs. */
24327 user_sub = get_cvn_flags(name, name_len, 0);
24330 /* Here, the property name could be a user-defined one, but there
24331 * is no subroutine to handle it (as of now). Defer handling it
24332 * until runtime. Otherwise, a block defined by Unicode in a later
24333 * release would get the synonym InFoo added for it, and existing
24334 * code that used that name would suddenly break if it referred to
24335 * the property before the sub was declared. See [perl #134146] */
24337 goto definition_deferred;
24340 /* Here, we are at runtime, and didn't find the user property. It
24341 * could be an official property, but only if no package was
24342 * specified, or just the utf8:: package. */
24343 if (could_be_deferred_official) {
24344 lookup_name += lun_non_pkg_begin;
24345 j -= lun_non_pkg_begin;
24347 else if (! stripped_utf8_pkg) {
24348 goto unknown_user_defined;
24351 /* Drop down to look up in the official properties */
24354 const char insecure[] = "Insecure user-defined property";
24356 /* Here, there is a sub by the correct name. Normally we call it
24357 * to get the property definition */
24359 SV * user_sub_sv = MUTABLE_SV(user_sub);
24360 SV * error; /* Any error returned by calling 'user_sub' */
24361 SV * key; /* The key into the hash of user defined sub names
24364 SV ** saved_user_prop_ptr; /* Hash entry for this property */
24366 /* How many times to retry when another thread is in the middle of
24367 * expanding the same definition we want */
24368 PERL_INT_FAST8_T retry_countdown = 10;
24370 DECLARATION_FOR_GLOBAL_CONTEXT;
24372 /* If we get here, we know this property is user-defined */
24373 *user_defined_ptr = TRUE;
24375 /* We refuse to call a potentially tainted subroutine; returning an
24378 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24379 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24380 goto append_name_to_msg;
24383 /* In principal, we only call each subroutine property definition
24384 * once during the life of the program. This guarantees that the
24385 * property definition never changes. The results of the single
24386 * sub call are stored in a hash, which is used instead for future
24387 * references to this property. The property definition is thus
24388 * immutable. But, to allow the user to have a /i-dependent
24389 * definition, we call the sub once for non-/i, and once for /i,
24390 * should the need arise, passing the /i status as a parameter.
24392 * We start by constructing the hash key name, consisting of the
24393 * fully qualified subroutine name, preceded by the /i status, so
24394 * that there is a key for /i and a different key for non-/i */
24395 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24396 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24397 non_pkg_begin != 0);
24398 sv_catsv(key, fq_name);
24401 /* We only call the sub once throughout the life of the program
24402 * (with the /i, non-/i exception noted above). That means the
24403 * hash must be global and accessible to all threads. It is
24404 * created at program start-up, before any threads are created, so
24405 * is accessible to all children. But this creates some
24408 * 1) The keys can't be shared, or else problems arise; sharing is
24409 * turned off at hash creation time
24410 * 2) All SVs in it are there for the remainder of the life of the
24411 * program, and must be created in the same interpreter context
24412 * as the hash, or else they will be freed from the wrong pool
24413 * at global destruction time. This is handled by switching to
24414 * the hash's context to create each SV going into it, and then
24415 * immediately switching back
24416 * 3) All accesses to the hash must be controlled by a mutex, to
24417 * prevent two threads from getting an unstable state should
24418 * they simultaneously be accessing it. The code below is
24419 * crafted so that the mutex is locked whenever there is an
24420 * access and unlocked only when the next stable state is
24423 * The hash stores either the definition of the property if it was
24424 * valid, or, if invalid, the error message that was raised. We
24425 * use the type of SV to distinguish.
24427 * There's also the need to guard against the definition expansion
24428 * from infinitely recursing. This is handled by storing the aTHX
24429 * of the expanding thread during the expansion. Again the SV type
24430 * is used to distinguish this from the other two cases. If we
24431 * come to here and the hash entry for this property is our aTHX,
24432 * it means we have recursed, and the code assumes that we would
24433 * infinitely recurse, so instead stops and raises an error.
24434 * (Any recursion has always been treated as infinite recursion in
24437 * If instead, the entry is for a different aTHX, it means that
24438 * that thread has gotten here first, and hasn't finished expanding
24439 * the definition yet. We just have to wait until it is done. We
24440 * sleep and retry a few times, returning an error if the other
24441 * thread doesn't complete. */
24444 USER_PROP_MUTEX_LOCK;
24446 /* If we have an entry for this key, the subroutine has already
24447 * been called once with this /i status. */
24448 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24449 SvPVX(key), SvCUR(key), 0);
24450 if (saved_user_prop_ptr) {
24452 /* If the saved result is an inversion list, it is the valid
24453 * definition of this property */
24454 if (is_invlist(*saved_user_prop_ptr)) {
24455 prop_definition = *saved_user_prop_ptr;
24457 /* The SV in the hash won't be removed until global
24458 * destruction, so it is stable and we can unlock */
24459 USER_PROP_MUTEX_UNLOCK;
24461 /* The caller shouldn't try to free this SV */
24462 return prop_definition;
24465 /* Otherwise, if it is a string, it is the error message
24466 * that was returned when we first tried to evaluate this
24467 * property. Fail, and append the message */
24468 if (SvPOK(*saved_user_prop_ptr)) {
24469 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24470 sv_catsv(msg, *saved_user_prop_ptr);
24472 /* The SV in the hash won't be removed until global
24473 * destruction, so it is stable and we can unlock */
24474 USER_PROP_MUTEX_UNLOCK;
24479 assert(SvIOK(*saved_user_prop_ptr));
24481 /* Here, we have an unstable entry in the hash. Either another
24482 * thread is in the middle of expanding the property's
24483 * definition, or we are ourselves recursing. We use the aTHX
24484 * in it to distinguish */
24485 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24487 /* Here, it's another thread doing the expanding. We've
24488 * looked as much as we are going to at the contents of the
24489 * hash entry. It's safe to unlock. */
24490 USER_PROP_MUTEX_UNLOCK;
24492 /* Retry a few times */
24493 if (retry_countdown-- > 0) {
24498 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24499 sv_catpvs(msg, "Timeout waiting for another thread to "
24501 goto append_name_to_msg;
24504 /* Here, we are recursing; don't dig any deeper */
24505 USER_PROP_MUTEX_UNLOCK;
24507 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24509 "Infinite recursion in user-defined property");
24510 goto append_name_to_msg;
24513 /* Here, this thread has exclusive control, and there is no entry
24514 * for this property in the hash. So we have the go ahead to
24515 * expand the definition ourselves. */
24517 PUSHSTACKi(PERLSI_REGCOMP);
24520 /* Create a temporary placeholder in the hash to detect recursion
24522 SWITCH_TO_GLOBAL_CONTEXT;
24523 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24524 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24527 /* Now that we have a placeholder, we can let other threads
24529 USER_PROP_MUTEX_UNLOCK;
24531 /* Make sure the placeholder always gets destroyed */
24532 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24537 /* Call the user's function, with the /i status as a parameter.
24538 * Note that we have gone to a lot of trouble to keep this call
24539 * from being within the locked mutex region. */
24540 XPUSHs(boolSV(to_fold));
24543 /* The following block was taken from swash_init(). Presumably
24544 * they apply to here as well, though we no longer use a swash --
24548 /* We might get here via a subroutine signature which uses a utf8
24549 * parameter name, at which point PL_subname will have been set
24550 * but not yet used. */
24551 save_item(PL_subname);
24553 /* G_SCALAR guarantees a single return value */
24554 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24559 if (TAINT_get || SvTRUE(error)) {
24560 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24561 if (SvTRUE(error)) {
24562 sv_catpvs(msg, "Error \"");
24563 sv_catsv(msg, error);
24564 sv_catpvs(msg, "\"");
24567 if (SvTRUE(error)) sv_catpvs(msg, "; ");
24568 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24571 if (name_len > 0) {
24572 sv_catpvs(msg, " in expansion of ");
24573 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24579 prop_definition = NULL;
24582 SV * contents = POPs;
24584 /* The contents is supposed to be the expansion of the property
24585 * definition. If the definition is deferrable, and we got an
24586 * empty string back, set a flag to later defer it (after clean
24589 && (! SvPOK(contents) || SvCUR(contents) == 0))
24591 empty_return = TRUE;
24593 else { /* Otherwise, call a function to check for valid syntax,
24596 prop_definition = handle_user_defined_property(
24598 is_utf8, to_fold, runtime,
24600 contents, user_defined_ptr,
24606 /* Here, we have the results of the expansion. Delete the
24607 * placeholder, and if the definition is now known, replace it with
24608 * that definition. We need exclusive access to the hash, and we
24609 * can't let anyone else in, between when we delete the placeholder
24610 * and add the permanent entry */
24611 USER_PROP_MUTEX_LOCK;
24613 S_delete_recursion_entry(aTHX_ SvPVX(key));
24615 if ( ! empty_return
24616 && (! prop_definition || is_invlist(prop_definition)))
24618 /* If we got success we use the inversion list defining the
24619 * property; otherwise use the error message */
24620 SWITCH_TO_GLOBAL_CONTEXT;
24621 (void) hv_store_ent(PL_user_def_props,
24624 ? newSVsv(prop_definition)
24630 /* All done, and the hash now has a permanent entry for this
24631 * property. Give up exclusive control */
24632 USER_PROP_MUTEX_UNLOCK;
24638 if (empty_return) {
24639 goto definition_deferred;
24642 if (prop_definition) {
24644 /* If the definition is for something not known at this time,
24645 * we toss it, and go return the main property name, as that's
24646 * the one the user will be aware of */
24647 if (! is_invlist(prop_definition)) {
24648 SvREFCNT_dec_NN(prop_definition);
24649 goto definition_deferred;
24652 sv_2mortal(prop_definition);
24656 return prop_definition;
24658 } /* End of calling the subroutine for the user-defined property */
24659 } /* End of it could be a user-defined property */
24661 /* Here it wasn't a user-defined property that is known at this time. See
24662 * if it is a Unicode property */
24664 lookup_len = j; /* This is a more mnemonic name than 'j' */
24666 /* Get the index into our pointer table of the inversion list corresponding
24667 * to the property */
24668 table_index = do_uniprop_match(lookup_name, lookup_len);
24670 /* If it didn't find the property ... */
24671 if (table_index == 0) {
24673 /* Try again stripping off any initial 'Is'. This is because we
24674 * promise that an initial Is is optional. The same isn't true of
24675 * names that start with 'In'. Those can match only blocks, and the
24676 * lookup table already has those accounted for. The lookup table also
24677 * has already accounted for Perl extensions (without and = sign)
24678 * starting with 'i's'. */
24679 if (starts_with_Is && equals_pos >= 0) {
24685 table_index = do_uniprop_match(lookup_name, lookup_len);
24688 if (table_index == 0) {
24691 /* Here, we didn't find it. If not a numeric type property, and
24692 * can't be a user-defined one, it isn't a legal property */
24693 if (! is_nv_type) {
24694 if (! could_be_user_defined) {
24698 /* Here, the property name is legal as a user-defined one. At
24699 * compile time, it might just be that the subroutine for that
24700 * property hasn't been encountered yet, but at runtime, it's
24701 * an error to try to use an undefined one */
24702 if (! deferrable) {
24703 goto unknown_user_defined;;
24706 goto definition_deferred;
24707 } /* End of isn't a numeric type property */
24709 /* The numeric type properties need more work to decide. What we
24710 * do is make sure we have the number in canonical form and look
24713 if (slash_pos < 0) { /* No slash */
24715 /* When it isn't a rational, take the input, convert it to a
24716 * NV, then create a canonical string representation of that
24720 SSize_t value_len = lookup_len - equals_pos;
24722 /* Get the value */
24723 if ( value_len <= 0
24724 || my_atof3(lookup_name + equals_pos, &value,
24726 != lookup_name + lookup_len)
24731 /* If the value is an integer, the canonical value is integral
24733 if (Perl_ceil(value) == value) {
24734 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24735 equals_pos, lookup_name, value);
24737 else { /* Otherwise, it is %e with a known precision */
24740 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24741 equals_pos, lookup_name,
24742 PL_E_FORMAT_PRECISION, value);
24744 /* The exponent generated is expecting two digits, whereas
24745 * %e on some systems will generate three. Remove leading
24746 * zeros in excess of 2 from the exponent. We start
24747 * looking for them after the '=' */
24748 exp_ptr = strchr(canonical + equals_pos, 'e');
24750 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24751 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24753 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24755 if (excess_exponent_len > 0) {
24756 SSize_t leading_zeros = strspn(cur_ptr, "0");
24757 SSize_t excess_leading_zeros
24758 = MIN(leading_zeros, excess_exponent_len);
24759 if (excess_leading_zeros > 0) {
24760 Move(cur_ptr + excess_leading_zeros,
24762 strlen(cur_ptr) - excess_leading_zeros
24763 + 1, /* Copy the NUL as well */
24770 else { /* Has a slash. Create a rational in canonical form */
24771 UV numerator, denominator, gcd, trial;
24772 const char * end_ptr;
24773 const char * sign = "";
24775 /* We can't just find the numerator, denominator, and do the
24776 * division, then use the method above, because that is
24777 * inexact. And the input could be a rational that is within
24778 * epsilon (given our precision) of a valid rational, and would
24779 * then incorrectly compare valid.
24781 * We're only interested in the part after the '=' */
24782 const char * this_lookup_name = lookup_name + equals_pos;
24783 lookup_len -= equals_pos;
24784 slash_pos -= equals_pos;
24786 /* Handle any leading minus */
24787 if (this_lookup_name[0] == '-') {
24789 this_lookup_name++;
24794 /* Convert the numerator to numeric */
24795 end_ptr = this_lookup_name + slash_pos;
24796 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24800 /* It better have included all characters before the slash */
24801 if (*end_ptr != '/') {
24805 /* Set to look at just the denominator */
24806 this_lookup_name += slash_pos;
24807 lookup_len -= slash_pos;
24808 end_ptr = this_lookup_name + lookup_len;
24810 /* Convert the denominator to numeric */
24811 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24815 /* It better be the rest of the characters, and don't divide by
24817 if ( end_ptr != this_lookup_name + lookup_len
24818 || denominator == 0)
24823 /* Get the greatest common denominator using
24824 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24826 trial = denominator;
24827 while (trial != 0) {
24829 trial = gcd % trial;
24833 /* If already in lowest possible terms, we have already tried
24834 * looking this up */
24839 /* Reduce the rational, which should put it in canonical form
24842 denominator /= gcd;
24844 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24845 equals_pos, lookup_name, sign, numerator, denominator);
24848 /* Here, we have the number in canonical form. Try that */
24849 table_index = do_uniprop_match(canonical, strlen(canonical));
24850 if (table_index == 0) {
24853 } /* End of still didn't find the property in our table */
24854 } /* End of didn't find the property in our table */
24856 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24857 * A negative return signifies that the real index is the absolute value,
24858 * but the result needs to be inverted */
24859 if (table_index < 0) {
24860 invert_return = TRUE;
24861 table_index = -table_index;
24864 /* Out-of band indices indicate a deprecated property. The proper index is
24865 * modulo it with the table size. And dividing by the table size yields
24866 * an offset into a table constructed by regen/mk_invlists.pl to contain
24867 * the corresponding warning message */
24868 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24869 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24870 table_index %= MAX_UNI_KEYWORD_INDEX;
24871 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24872 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24873 (int) name_len, name,
24874 get_deprecated_property_msg(warning_offset));
24877 /* In a few properties, a different property is used under /i. These are
24878 * unlikely to change, so are hard-coded here. */
24880 if ( table_index == UNI_XPOSIXUPPER
24881 || table_index == UNI_XPOSIXLOWER
24882 || table_index == UNI_TITLE)
24884 table_index = UNI_CASED;
24886 else if ( table_index == UNI_UPPERCASELETTER
24887 || table_index == UNI_LOWERCASELETTER
24888 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24889 || table_index == UNI_TITLECASELETTER
24892 table_index = UNI_CASEDLETTER;
24894 else if ( table_index == UNI_POSIXUPPER
24895 || table_index == UNI_POSIXLOWER)
24897 table_index = UNI_POSIXALPHA;
24901 /* Create and return the inversion list */
24902 prop_definition = get_prop_definition(table_index);
24903 sv_2mortal(prop_definition);
24905 /* See if there is a private use override to add to this definition */
24907 COPHH * hinthash = (IN_PERL_COMPILETIME)
24908 ? CopHINTHASH_get(&PL_compiling)
24909 : CopHINTHASH_get(PL_curcop);
24910 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24912 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24914 /* See if there is an element in the hints hash for this table */
24915 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24916 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24920 SV * pu_definition;
24922 SV * expanded_prop_definition =
24923 sv_2mortal(invlist_clone(prop_definition, NULL));
24925 /* If so, it's definition is the string from here to the next
24926 * \a character. And its format is the same as a user-defined
24928 pos += SvCUR(pu_lookup);
24929 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24930 pu_invlist = handle_user_defined_property(lookup_name,
24933 0, /* Not folded */
24941 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24942 sv_catpvs(msg, "Insecure private-use override");
24943 goto append_name_to_msg;
24946 /* For now, as a safety measure, make sure that it doesn't
24947 * override non-private use code points */
24948 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24950 /* Add it to the list to be returned */
24951 _invlist_union(prop_definition, pu_invlist,
24952 &expanded_prop_definition);
24953 prop_definition = expanded_prop_definition;
24954 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24959 if (invert_return) {
24960 _invlist_invert(prop_definition);
24962 return prop_definition;
24964 unknown_user_defined:
24965 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24966 sv_catpvs(msg, "Unknown user-defined property name");
24967 goto append_name_to_msg;
24970 if (non_pkg_begin != 0) {
24971 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24972 sv_catpvs(msg, "Illegal user-defined property name");
24975 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24976 sv_catpvs(msg, "Can't find Unicode property definition");
24980 append_name_to_msg:
24982 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24983 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24985 sv_catpv(msg, prefix);
24986 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24987 sv_catpv(msg, suffix);
24992 definition_deferred:
24995 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
24997 /* Here it could yet to be defined, so defer evaluation of this until
24998 * its needed at runtime. We need the fully qualified property name to
24999 * avoid ambiguity */
25001 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25005 /* If it didn't come with a package, or the package is utf8::, this
25006 * actually could be an official Unicode property whose inclusion we
25007 * are deferring until runtime to make sure that it isn't overridden by
25008 * a user-defined property of the same name (which we haven't
25009 * encountered yet). Add a marker to indicate this possibility, for
25010 * use at such time when we first need the definition during pattern
25011 * matching execution */
25012 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25013 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25016 /* We also need a trailing newline */
25017 sv_catpvs(fq_name, "\n");
25019 *user_defined_ptr = TRUE;
25025 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25026 const STRLEN wname_len, /* Its length */
25027 SV ** prop_definition,
25030 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25031 * any matches, adding them to prop_definition */
25035 CV * get_names_info; /* entry to charnames.pm to get info we need */
25036 SV * names_string; /* Contains all character names, except algo */
25037 SV * algorithmic_names; /* Contains info about algorithmically
25038 generated character names */
25039 REGEXP * subpattern_re; /* The user's pattern to match with */
25040 struct regexp * prog; /* The compiled pattern */
25041 char * all_names_start; /* lib/unicore/Name.pl string of every
25042 (non-algorithmic) character name */
25043 char * cur_pos; /* We match, effectively using /gc; this is
25044 where we are now */
25045 bool found_matches = FALSE; /* Did any name match so far? */
25046 SV * empty; /* For matching zero length names */
25047 SV * must_sv; /* Contains the substring, if any, that must be
25048 in a name for the subpattern to match */
25049 const char * must; /* The PV of 'must' */
25050 STRLEN must_len; /* And its length */
25051 SV * syllable_name = NULL; /* For Hangul syllables */
25052 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25053 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25055 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25056 * syllable name, and these are immutable and guaranteed by the Unicode
25057 * standard to never be extended */
25058 const STRLEN syl_max_len = hangul_prefix_len + 7;
25062 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25064 /* Make sure _charnames is loaded. (The parameters give context
25065 * for any errors generated */
25066 get_names_info = get_cv("_charnames::_get_names_info", 0);
25067 if (! get_names_info) {
25068 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25071 /* Get the charnames data */
25072 PUSHSTACKi(PERLSI_REGCOMP);
25080 /* Special _charnames entry point that returns the info this routine
25082 call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25086 /* Data structure for names which end in their very own code points */
25087 algorithmic_names = POPs;
25088 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25090 /* The lib/unicore/Name.pl string */
25091 names_string = POPs;
25092 SvREFCNT_inc_simple_void_NN(names_string);
25099 if ( ! SvROK(names_string)
25100 || ! SvROK(algorithmic_names))
25101 { /* Perhaps should panic instead XXX */
25102 SvREFCNT_dec(names_string);
25103 SvREFCNT_dec(algorithmic_names);
25107 names_string = sv_2mortal(SvRV(names_string));
25108 all_names_start = SvPVX(names_string);
25109 cur_pos = all_names_start;
25111 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25113 /* Compile the subpattern consisting of the name being looked for */
25114 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25116 must_sv = re_intuit_string(subpattern_re);
25118 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25119 must_sv = sv_2mortal(newSVsv(must_sv));
25120 must = SvPV(must_sv, must_len);
25127 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25128 * This works because the NUL causes the function to return early, thus
25129 * showing that there are characters in it other than the acceptable ones,
25130 * which is our desired result.) */
25132 prog = ReANY(subpattern_re);
25134 /* If only nothing is matched, skip to where empty names are looked for */
25135 if (prog->maxlen == 0) {
25139 /* And match against the string of all names /gc. Don't even try if it
25140 * must match a character not found in any name. */
25141 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25143 while (execute_wildcard(subpattern_re,
25145 SvEND(names_string),
25146 all_names_start, 0,
25149 { /* Here, matched. */
25151 /* Note the string entries look like
25152 * 00001\nSTART OF HEADING\n\n
25153 * so we could match anywhere in that string. We have to rule out
25154 * matching a code point line */
25155 char * this_name_start = all_names_start
25156 + RX_OFFS(subpattern_re)->start;
25157 char * this_name_end = all_names_start
25158 + RX_OFFS(subpattern_re)->end;
25161 UV cp = 0; /* Silences some compilers */
25162 AV * this_string = NULL;
25163 bool is_multi = FALSE;
25165 /* If matched nothing, advance to next possible match */
25166 if (this_name_start == this_name_end) {
25167 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25168 SvEND(names_string) - this_name_end);
25169 if (cur_pos == NULL) {
25174 /* Position the next match to start beyond the current returned
25176 cur_pos = (char *) memchr(this_name_end, '\n',
25177 SvEND(names_string) - this_name_end);
25180 /* Back up to the \n just before the beginning of the character. */
25181 cp_end = (char *) my_memrchr(all_names_start,
25183 this_name_start - all_names_start);
25185 /* If we didn't find a \n, it means it matched somewhere in the
25186 * initial '00000' in the string, so isn't a real match */
25187 if (cp_end == NULL) {
25191 this_name_start = cp_end + 1; /* The name starts just after */
25192 cp_end--; /* the \n, and the code point */
25193 /* ends just before it */
25195 /* All code points are 5 digits long */
25196 cp_start = cp_end - 4;
25198 /* This shouldn't happen, as we found a \n, and the first \n is
25199 * further along than what we subtracted */
25200 assert(cp_start >= all_names_start);
25202 if (cp_start == all_names_start) {
25203 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25207 /* If the character is a blank, we either have a named sequence, or
25208 * something is wrong */
25209 if (*(cp_start - 1) == ' ') {
25210 cp_start = (char *) my_memrchr(all_names_start,
25212 cp_start - all_names_start);
25216 assert(cp_start != NULL && cp_start >= all_names_start + 2);
25218 /* Except for the first line in the string, the sequence before the
25219 * code point is \n\n. If that isn't the case here, we didn't
25220 * match the name of a character. (We could have matched a named
25221 * sequence, not currently handled */
25222 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25226 /* We matched! Add this to the list */
25227 found_matches = TRUE;
25229 /* Loop through all the code points in the sequence */
25230 while (cp_start < cp_end) {
25232 /* Calculate this code point from its 5 digits */
25233 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25234 + (XDIGIT_VALUE(cp_start[1]) << 12)
25235 + (XDIGIT_VALUE(cp_start[2]) << 8)
25236 + (XDIGIT_VALUE(cp_start[3]) << 4)
25237 + XDIGIT_VALUE(cp_start[4]);
25239 cp_start += 6; /* Go past any blank */
25241 if (cp_start < cp_end || is_multi) {
25242 if (this_string == NULL) {
25243 this_string = newAV();
25247 av_push(this_string, newSVuv(cp));
25251 if (is_multi) { /* Was more than one code point */
25252 if (*strings == NULL) {
25253 *strings = newAV();
25256 av_push(*strings, (SV *) this_string);
25258 else { /* Only a single code point */
25259 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25261 } /* End of loop through the non-algorithmic names string */
25264 /* There are also character names not in 'names_string'. These are
25265 * algorithmically generatable. Try this pattern on each possible one.
25266 * (khw originally planned to leave this out given the large number of
25267 * matches attempted; but the speed turned out to be quite acceptable
25269 * There are plenty of opportunities to optimize to skip many of the tests.
25270 * beyond the rudimentary ones already here */
25272 /* First see if the subpattern matches any of the algorithmic generatable
25273 * Hangul syllable names.
25275 * We know none of these syllable names will match if the input pattern
25276 * requires more bytes than any syllable has, or if the input pattern only
25277 * matches an empty name, or if the pattern has something it must match and
25278 * one of the characters in that isn't in any Hangul syllable. */
25279 if ( prog->minlen <= (SSize_t) syl_max_len
25280 && prog->maxlen > 0
25281 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25283 /* These constants, names, values, and algorithm are adapted from the
25284 * Unicode standard, version 5.1, section 3.12, and should never
25286 const char * JamoL[] = {
25287 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25288 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25290 const int LCount = C_ARRAY_LENGTH(JamoL);
25292 const char * JamoV[] = {
25293 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25294 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25297 const int VCount = C_ARRAY_LENGTH(JamoV);
25299 const char * JamoT[] = {
25300 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25301 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25302 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25304 const int TCount = C_ARRAY_LENGTH(JamoT);
25308 /* This is the initial Hangul syllable code point; each time through the
25309 * inner loop, it maps to the next higher code point. For more info,
25310 * see the Hangul syllable section of the Unicode standard. */
25313 syllable_name = sv_2mortal(newSV(syl_max_len));
25314 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25316 for (L = 0; L < LCount; L++) {
25317 for (V = 0; V < VCount; V++) {
25318 for (T = 0; T < TCount; T++) {
25320 /* Truncate back to the prefix, which is unvarying */
25321 SvCUR_set(syllable_name, hangul_prefix_len);
25323 sv_catpv(syllable_name, JamoL[L]);
25324 sv_catpv(syllable_name, JamoV[V]);
25325 sv_catpv(syllable_name, JamoT[T]);
25327 if (execute_wildcard(subpattern_re,
25328 SvPVX(syllable_name),
25329 SvEND(syllable_name),
25330 SvPVX(syllable_name), 0,
25334 *prop_definition = add_cp_to_invlist(*prop_definition,
25336 found_matches = TRUE;
25345 /* The rest of the algorithmically generatable names are of the form
25346 * "PREFIX-code_point". The prefixes and the code point limits of each
25347 * were returned to us in the array 'algorithmic_names' from data in
25348 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
25349 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25352 /* Each element of the array is a hash, giving the details for the
25353 * series of names it covers. There is the base name of the characters
25354 * in the series, and the low and high code points in the series. And,
25355 * for optimization purposes a string containing all the legal
25356 * characters that could possibly be in a name in this series. */
25357 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25358 SV * prefix = * hv_fetchs(this_series, "name", 0);
25359 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25360 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25361 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25363 /* Pre-allocate an SV with enough space */
25364 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25366 if (high >= 0x10000) {
25367 sv_catpvs(algo_name, "0");
25370 /* This series can be skipped entirely if the pattern requires
25371 * something longer than any name in the series, or can only match an
25372 * empty name, or contains a character not found in any name in the
25374 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
25375 && prog->maxlen > 0
25376 && (strspn(must, legal) == must_len))
25378 for (j = low; j <= high; j++) { /* For each code point in the series */
25380 /* Get its name, and see if it matches the subpattern */
25381 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25384 if (execute_wildcard(subpattern_re,
25387 SvPVX(algo_name), 0,
25391 *prop_definition = add_cp_to_invlist(*prop_definition, j);
25392 found_matches = TRUE;
25399 /* Finally, see if the subpattern matches an empty string */
25400 empty = newSVpvs("");
25401 if (execute_wildcard(subpattern_re,
25408 /* Many code points have empty names. Currently these are the \p{GC=C}
25409 * ones, minus CC and CF */
25411 SV * empty_names_ref = get_prop_definition(UNI_C);
25412 SV * empty_names = invlist_clone(empty_names_ref, NULL);
25414 SV * subtract = get_prop_definition(UNI_CC);
25416 _invlist_subtract(empty_names, subtract, &empty_names);
25417 SvREFCNT_dec_NN(empty_names_ref);
25418 SvREFCNT_dec_NN(subtract);
25420 subtract = get_prop_definition(UNI_CF);
25421 _invlist_subtract(empty_names, subtract, &empty_names);
25422 SvREFCNT_dec_NN(subtract);
25424 _invlist_union(*prop_definition, empty_names, prop_definition);
25425 found_matches = TRUE;
25426 SvREFCNT_dec_NN(empty_names);
25428 SvREFCNT_dec_NN(empty);
25431 /* If we ever were to accept aliases for, say private use names, we would
25432 * need to do something fancier to find empty names. The code below works
25433 * (at the time it was written), and is slower than the above */
25434 const char empties_pat[] = "^.";
25435 if (strNE(name, empties_pat)) {
25436 SV * empty = newSVpvs("");
25437 if (execute_wildcard(subpattern_re,
25444 SV * empties = NULL;
25446 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25448 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25449 SvREFCNT_dec_NN(empties);
25451 found_matches = TRUE;
25453 SvREFCNT_dec_NN(empty);
25457 SvREFCNT_dec_NN(subpattern_re);
25458 return found_matches;
25462 * ex: set ts=8 sts=4 sw=4 et: