5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
154 #define STATIC static
157 /* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
161 typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
166 bool in_gosub; /* this or an outer frame is for GOSUB */
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
173 /* Certain characters are output as a sequence with the first being a
175 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
178 struct RExC_state_t {
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
181 char *precomp; /* uncompiled string. */
182 char *precomp_end; /* pointer to end of uncompiled string. */
183 REGEXP *rx_sv; /* The SV that is the regexp. */
184 regexp *rx; /* perl core regexp structure */
185 regexp_internal *rxi; /* internal data for regexp object
187 char *start; /* Start of input for compile */
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
196 SSize_t whilem_seen; /* number of WHILEM in this expr */
197 regnode *emit_start; /* Start of emitted-code area */
198 regnode_offset emit; /* Code-emit pointer */
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
201 SSize_t size; /* Number of regnode equivalents in
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
228 I32 nestroot; /* root parens we are in - used by
231 regnode *end_op; /* END node in program */
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
240 I32 recurse_count; /* Number of recurse regops we have generated */
241 regnode **recurse; /* Recurse regops */
242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
248 I32 override_recoding;
249 I32 recode_x_to_native;
250 I32 in_multi_char_class;
251 int code_index; /* next code_blocks[] slot */
252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
254 SSize_t maxlen; /* mininum possible number of chars in string to match */
255 scan_frame *frame_head;
256 scan_frame *frame_last;
260 SV *runtime_code_qr; /* qr with the runtime code blocks */
262 const char *lastparse;
264 U32 study_chunk_recursed_count;
265 AV *paren_name_list; /* idx -> name */
269 #define RExC_lastparse (pRExC_state->lastparse)
270 #define RExC_lastnum (pRExC_state->lastnum)
271 #define RExC_paren_name_list (pRExC_state->paren_name_list)
272 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
273 #define RExC_mysv (pRExC_state->mysv1)
274 #define RExC_mysv1 (pRExC_state->mysv1)
275 #define RExC_mysv2 (pRExC_state->mysv2)
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
287 #define RExC_flags (pRExC_state->flags)
288 #define RExC_pm_flags (pRExC_state->pm_flags)
289 #define RExC_precomp (pRExC_state->precomp)
290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
292 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
293 #define RExC_precomp_end (pRExC_state->precomp_end)
294 #define RExC_rx_sv (pRExC_state->rx_sv)
295 #define RExC_rx (pRExC_state->rx)
296 #define RExC_rxi (pRExC_state->rxi)
297 #define RExC_start (pRExC_state->start)
298 #define RExC_end (pRExC_state->end)
299 #define RExC_parse (pRExC_state->parse)
300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
301 #define RExC_whilem_seen (pRExC_state->whilem_seen)
302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
305 #ifdef RE_TRACK_PATTERN_OFFSETS
306 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
309 #define RExC_emit (pRExC_state->emit)
310 #define RExC_emit_start (pRExC_state->emit_start)
311 #define RExC_sawback (pRExC_state->sawback)
312 #define RExC_seen (pRExC_state->seen)
313 #define RExC_size (pRExC_state->size)
314 #define RExC_maxlen (pRExC_state->maxlen)
315 #define RExC_npar (pRExC_state->npar)
316 #define RExC_total_parens (pRExC_state->total_par)
317 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
318 #define RExC_nestroot (pRExC_state->nestroot)
319 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
320 #define RExC_utf8 (pRExC_state->utf8)
321 #define RExC_uni_semantics (pRExC_state->uni_semantics)
322 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
323 #define RExC_open_parens (pRExC_state->open_parens)
324 #define RExC_close_parens (pRExC_state->close_parens)
325 #define RExC_end_op (pRExC_state->end_op)
326 #define RExC_paren_names (pRExC_state->paren_names)
327 #define RExC_recurse (pRExC_state->recurse)
328 #define RExC_recurse_count (pRExC_state->recurse_count)
329 #define RExC_sets_depth (pRExC_state->sets_depth)
330 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
331 #define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
333 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
334 #define RExC_in_lookahead (pRExC_state->in_lookahead)
335 #define RExC_contains_locale (pRExC_state->contains_locale)
336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
339 # define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
342 # define SET_recode_x_to_native(x) NOOP
345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
346 #define RExC_frame_head (pRExC_state->frame_head)
347 #define RExC_frame_last (pRExC_state->frame_last)
348 #define RExC_frame_count (pRExC_state->frame_count)
349 #define RExC_strict (pRExC_state->strict)
350 #define RExC_study_started (pRExC_state->study_started)
351 #define RExC_warn_text (pRExC_state->warn_text)
352 #define RExC_in_script_run (pRExC_state->in_script_run)
353 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
363 #define RExC_naughty (pRExC_state->naughty)
364 #define TOO_NAUGHTY (10)
365 #define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368 #define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
372 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
374 ((*s) == '{' && regcurly(s)))
377 * Flags to be passed up and down.
379 #define WORST 0 /* Worst case. */
380 #define HASWIDTH 0x01 /* Known to not match null strings, could match
383 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
384 * character. (There needs to be a case: in the switch statement in regexec.c
385 * for any node marked SIMPLE.) Note that this is not the same thing as
388 #define SPSTART 0x04 /* Starts with * or + */
389 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
390 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
391 #define RESTART_PARSE 0x20 /* Need to redo the parse */
392 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
393 calcuate sizes as UTF-8 */
395 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
397 /* whether trie related optimizations are enabled */
398 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
399 #define TRIE_STUDY_OPT
400 #define FULL_TRIE_STUDY
406 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
407 #define PBITVAL(paren) (1 << ((paren) & 7))
408 #define PAREN_OFFSET(depth) \
409 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
410 #define PAREN_TEST(depth, paren) \
411 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
412 #define PAREN_SET(depth, paren) \
413 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
414 #define PAREN_UNSET(depth, paren) \
415 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
417 #define REQUIRE_UTF8(flagp) STMT_START { \
419 *flagp = RESTART_PARSE|NEED_UTF8; \
424 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
425 * a flag that indicates we need to override /d with /u as a result of
426 * something in the pattern. It should only be used in regards to calling
427 * set_regex_charset() or get_regex_charset() */
428 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
430 if (DEPENDS_SEMANTICS) { \
431 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
432 RExC_uni_semantics = 1; \
433 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
434 /* No need to restart the parse if we haven't seen \
435 * anything that differs between /u and /d, and no need \
436 * to restart immediately if we're going to reparse \
437 * anyway to count parens */ \
438 *flagp |= RESTART_PARSE; \
439 return restart_retval; \
444 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
446 RExC_use_BRANCHJ = 1; \
447 *flagp |= RESTART_PARSE; \
448 return restart_retval; \
451 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
452 * less. After that, it must always be positive, because the whole re is
453 * considered to be surrounded by virtual parens. Setting it to negative
454 * indicates there is some construct that needs to know the actual number of
455 * parens to be properly handled. And that means an extra pass will be
456 * required after we've counted them all */
457 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
458 #define REQUIRE_PARENS_PASS \
459 STMT_START { /* No-op if have completed a pass */ \
460 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
462 #define IN_PARENS_PASS (RExC_total_parens < 0)
465 /* This is used to return failure (zero) early from the calling function if
466 * various flags in 'flags' are set. Two flags always cause a return:
467 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
468 * additional flags that should cause a return; 0 if none. If the return will
469 * be done, '*flagp' is first set to be all of the flags that caused the
471 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
473 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
474 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
479 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
481 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
482 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
483 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
484 if (MUST_RESTART(*(flagp))) return 0
486 /* This converts the named class defined in regcomp.h to its equivalent class
487 * number defined in handy.h. */
488 #define namedclass_to_classnum(class) ((int) ((class) / 2))
489 #define classnum_to_namedclass(classnum) ((classnum) * 2)
491 #define _invlist_union_complement_2nd(a, b, output) \
492 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
493 #define _invlist_intersection_complement_2nd(a, b, output) \
494 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
496 /* We add a marker if we are deferring expansion of a property that is both
497 * 1) potentiallly user-defined; and
498 * 2) could also be an official Unicode property.
500 * Without this marker, any deferred expansion can only be for a user-defined
501 * one. This marker shouldn't conflict with any that could be in a legal name,
502 * and is appended to its name to indicate this. There is a string and
504 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
505 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
507 /* What is infinity for optimization purposes */
508 #define OPTIMIZE_INFTY SSize_t_MAX
510 /* About scan_data_t.
512 During optimisation we recurse through the regexp program performing
513 various inplace (keyhole style) optimisations. In addition study_chunk
514 and scan_commit populate this data structure with information about
515 what strings MUST appear in the pattern. We look for the longest
516 string that must appear at a fixed location, and we look for the
517 longest string that may appear at a floating location. So for instance
522 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
523 strings (because they follow a .* construct). study_chunk will identify
524 both FOO and BAR as being the longest fixed and floating strings respectively.
526 The strings can be composites, for instance
530 will result in a composite fixed substring 'foo'.
532 For each string some basic information is maintained:
535 This is the position the string must appear at, or not before.
536 It also implicitly (when combined with minlenp) tells us how many
537 characters must match before the string we are searching for.
538 Likewise when combined with minlenp and the length of the string it
539 tells us how many characters must appear after the string we have
543 Only used for floating strings. This is the rightmost point that
544 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
545 string can occur infinitely far to the right.
546 For fixed strings, it is equal to min_offset.
549 A pointer to the minimum number of characters of the pattern that the
550 string was found inside. This is important as in the case of positive
551 lookahead or positive lookbehind we can have multiple patterns
556 The minimum length of the pattern overall is 3, the minimum length
557 of the lookahead part is 3, but the minimum length of the part that
558 will actually match is 1. So 'FOO's minimum length is 3, but the
559 minimum length for the F is 1. This is important as the minimum length
560 is used to determine offsets in front of and behind the string being
561 looked for. Since strings can be composites this is the length of the
562 pattern at the time it was committed with a scan_commit. Note that
563 the length is calculated by study_chunk, so that the minimum lengths
564 are not known until the full pattern has been compiled, thus the
565 pointer to the value.
569 In the case of lookbehind the string being searched for can be
570 offset past the start point of the final matching string.
571 If this value was just blithely removed from the min_offset it would
572 invalidate some of the calculations for how many chars must match
573 before or after (as they are derived from min_offset and minlen and
574 the length of the string being searched for).
575 When the final pattern is compiled and the data is moved from the
576 scan_data_t structure into the regexp structure the information
577 about lookbehind is factored in, with the information that would
578 have been lost precalculated in the end_shift field for the
581 The fields pos_min and pos_delta are used to store the minimum offset
582 and the delta to the maximum offset at the current point in the pattern.
586 struct scan_data_substrs {
587 SV *str; /* longest substring found in pattern */
588 SSize_t min_offset; /* earliest point in string it can appear */
589 SSize_t max_offset; /* latest point in string it can appear */
590 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
591 SSize_t lookbehind; /* is the pos of the string modified by LB */
592 I32 flags; /* per substring SF_* and SCF_* flags */
595 typedef struct scan_data_t {
596 /*I32 len_min; unused */
597 /*I32 len_delta; unused */
601 SSize_t last_end; /* min value, <0 unless valid. */
602 SSize_t last_start_min;
603 SSize_t last_start_max;
604 U8 cur_is_floating; /* whether the last_* values should be set as
605 * the next fixed (0) or floating (1)
608 /* [0] is longest fixed substring so far, [1] is longest float so far */
609 struct scan_data_substrs substrs[2];
611 I32 flags; /* common SF_* and SCF_* flags */
613 SSize_t *last_closep;
614 regnode_ssc *start_class;
618 * Forward declarations for pregcomp()'s friends.
621 static const scan_data_t zero_scan_data = {
622 0, 0, NULL, 0, 0, 0, 0,
624 { NULL, 0, 0, 0, 0, 0 },
625 { NULL, 0, 0, 0, 0, 0 },
632 #define SF_BEFORE_SEOL 0x0001
633 #define SF_BEFORE_MEOL 0x0002
634 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
636 #define SF_IS_INF 0x0040
637 #define SF_HAS_PAR 0x0080
638 #define SF_IN_PAR 0x0100
639 #define SF_HAS_EVAL 0x0200
642 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
643 * longest substring in the pattern. When it is not set the optimiser keeps
644 * track of position, but does not keep track of the actual strings seen,
646 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
649 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
650 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
651 * turned off because of the alternation (BRANCH). */
652 #define SCF_DO_SUBSTR 0x0400
654 #define SCF_DO_STCLASS_AND 0x0800
655 #define SCF_DO_STCLASS_OR 0x1000
656 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
657 #define SCF_WHILEM_VISITED_POS 0x2000
659 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
660 #define SCF_SEEN_ACCEPT 0x8000
661 #define SCF_TRIE_DOING_RESTUDY 0x10000
662 #define SCF_IN_DEFINE 0x20000
667 #define UTF cBOOL(RExC_utf8)
669 /* The enums for all these are ordered so things work out correctly */
670 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
671 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
672 == REGEX_DEPENDS_CHARSET)
673 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
674 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
675 >= REGEX_UNICODE_CHARSET)
676 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
677 == REGEX_ASCII_RESTRICTED_CHARSET)
678 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
679 >= REGEX_ASCII_RESTRICTED_CHARSET)
680 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
681 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
683 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
685 /* For programs that want to be strictly Unicode compatible by dying if any
686 * attempt is made to match a non-Unicode code point against a Unicode
688 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
690 #define OOB_NAMEDCLASS -1
692 /* There is no code point that is out-of-bounds, so this is problematic. But
693 * its only current use is to initialize a variable that is always set before
695 #define OOB_UNICODE 0xDEADBEEF
697 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
700 /* length of regex to show in messages that don't mark a position within */
701 #define RegexLengthToShowInErrorMessages 127
704 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
705 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
706 * op/pragma/warn/regcomp.
708 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
709 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
711 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
712 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
714 /* The code in this file in places uses one level of recursion with parsing
715 * rebased to an alternate string constructed by us in memory. This can take
716 * the form of something that is completely different from the input, or
717 * something that uses the input as part of the alternate. In the first case,
718 * there should be no possibility of an error, as we are in complete control of
719 * the alternate string. But in the second case we don't completely control
720 * the input portion, so there may be errors in that. Here's an example:
722 * is handled specially because \x{df} folds to a sequence of more than one
723 * character: 'ss'. What is done is to create and parse an alternate string,
724 * which looks like this:
725 * /(?:\x{DF}|[abc\x{DF}def])/ui
726 * where it uses the input unchanged in the middle of something it constructs,
727 * which is a branch for the DF outside the character class, and clustering
728 * parens around the whole thing. (It knows enough to skip the DF inside the
729 * class while in this substitute parse.) 'abc' and 'def' may have errors that
730 * need to be reported. The general situation looks like this:
732 * |<------- identical ------>|
734 * Input: ---------------------------------------------------------------
735 * Constructed: ---------------------------------------------------
737 * |<------- identical ------>|
739 * sI..eI is the portion of the input pattern we are concerned with here.
740 * sC..EC is the constructed substitute parse string.
741 * sC..tC is constructed by us
742 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
743 * In the diagram, these are vertically aligned.
744 * eC..EC is also constructed by us.
745 * xC is the position in the substitute parse string where we found a
747 * xI is the position in the original pattern corresponding to xC.
749 * We want to display a message showing the real input string. Thus we need to
750 * translate from xC to xI. We know that xC >= tC, since the portion of the
751 * string sC..tC has been constructed by us, and so shouldn't have errors. We
753 * xI = tI + (xC - tC)
755 * When the substitute parse is constructed, the code needs to set:
758 * RExC_copy_start_in_input (tI)
759 * RExC_copy_start_in_constructed (tC)
760 * and restore them when done.
762 * During normal processing of the input pattern, both
763 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
764 * sI, so that xC equals xI.
767 #define sI RExC_precomp
768 #define eI RExC_precomp_end
769 #define sC RExC_start
771 #define tI RExC_copy_start_in_input
772 #define tC RExC_copy_start_in_constructed
773 #define xI(xC) (tI + (xC - tC))
774 #define xI_offset(xC) (xI(xC) - sI)
776 #define REPORT_LOCATION_ARGS(xC) \
778 (xI(xC) > eI) /* Don't run off end */ \
779 ? eI - sI /* Length before the <--HERE */ \
780 : ((xI_offset(xC) >= 0) \
782 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
783 IVdf " trying to output message for " \
785 __FILE__, __LINE__, (IV) xI_offset(xC), \
786 ((int) (eC - sC)), sC), 0)), \
787 sI), /* The input pattern printed up to the <--HERE */ \
789 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
790 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
792 /* Used to point after bad bytes for an error message, but avoid skipping
793 * past a nul byte. */
794 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
796 /* Set up to clean up after our imminent demise */
797 #define PREPARE_TO_DIE \
800 SAVEFREESV(RExC_rx_sv); \
801 if (RExC_open_parens) \
802 SAVEFREEPV(RExC_open_parens); \
803 if (RExC_close_parens) \
804 SAVEFREEPV(RExC_close_parens); \
808 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
809 * arg. Show regex, up to a maximum length. If it's too long, chop and add
812 #define _FAIL(code) STMT_START { \
813 const char *ellipses = ""; \
814 IV len = RExC_precomp_end - RExC_precomp; \
817 if (len > RegexLengthToShowInErrorMessages) { \
818 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
819 len = RegexLengthToShowInErrorMessages - 10; \
825 #define FAIL(msg) _FAIL( \
826 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
827 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
829 #define FAIL2(msg,arg) _FAIL( \
830 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
831 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
833 #define FAIL3(msg,arg1,arg2) _FAIL( \
834 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
835 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
838 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
840 #define Simple_vFAIL(m) STMT_START { \
841 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
842 m, REPORT_LOCATION_ARGS(RExC_parse)); \
846 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
848 #define vFAIL(m) STMT_START { \
854 * Like Simple_vFAIL(), but accepts two arguments.
856 #define Simple_vFAIL2(m,a1) STMT_START { \
857 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
858 REPORT_LOCATION_ARGS(RExC_parse)); \
862 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
864 #define vFAIL2(m,a1) STMT_START { \
866 Simple_vFAIL2(m, a1); \
871 * Like Simple_vFAIL(), but accepts three arguments.
873 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
874 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
875 REPORT_LOCATION_ARGS(RExC_parse)); \
879 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
881 #define vFAIL3(m,a1,a2) STMT_START { \
883 Simple_vFAIL3(m, a1, a2); \
887 * Like Simple_vFAIL(), but accepts four arguments.
889 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
890 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
891 REPORT_LOCATION_ARGS(RExC_parse)); \
894 #define vFAIL4(m,a1,a2,a3) STMT_START { \
896 Simple_vFAIL4(m, a1, a2, a3); \
899 /* A specialized version of vFAIL2 that works with UTF8f */
900 #define vFAIL2utf8f(m, a1) STMT_START { \
902 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
903 REPORT_LOCATION_ARGS(RExC_parse)); \
906 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
908 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
909 REPORT_LOCATION_ARGS(RExC_parse)); \
912 /* Setting this to NULL is a signal to not output warnings */
913 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
915 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
916 RExC_copy_start_in_constructed = NULL; \
918 #define RESTORE_WARNINGS \
919 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
921 /* Since a warning can be generated multiple times as the input is reparsed, we
922 * output it the first time we come to that point in the parse, but suppress it
923 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
924 * generate any warnings */
925 #define TO_OUTPUT_WARNINGS(loc) \
926 ( RExC_copy_start_in_constructed \
927 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
929 /* After we've emitted a warning, we save the position in the input so we don't
931 #define UPDATE_WARNINGS_LOC(loc) \
933 if (TO_OUTPUT_WARNINGS(loc)) { \
934 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
939 /* 'warns' is the output of the packWARNx macro used in 'code' */
940 #define _WARN_HELPER(loc, warns, code) \
942 if (! RExC_copy_start_in_constructed) { \
943 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
944 " expected at '%s'", \
945 __FILE__, __LINE__, loc); \
947 if (TO_OUTPUT_WARNINGS(loc)) { \
951 UPDATE_WARNINGS_LOC(loc); \
955 /* m is not necessarily a "literal string", in this macro */
956 #define warn_non_literal_string(loc, packed_warn, m) \
957 _WARN_HELPER(loc, packed_warn, \
958 Perl_warner(aTHX_ packed_warn, \
959 "%s" REPORT_LOCATION, \
960 m, REPORT_LOCATION_ARGS(loc)))
961 #define reg_warn_non_literal_string(loc, m) \
962 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
964 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
967 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
968 Newx(format, format_size, char); \
969 my_strlcpy(format, m, format_size); \
970 my_strlcat(format, REPORT_LOCATION, format_size); \
971 SAVEFREEPV(format); \
972 _WARN_HELPER(loc, packwarn, \
973 Perl_ck_warner(aTHX_ packwarn, \
975 a1, REPORT_LOCATION_ARGS(loc))); \
978 #define ckWARNreg(loc,m) \
979 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
980 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
982 REPORT_LOCATION_ARGS(loc)))
984 #define vWARN(loc, m) \
985 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
986 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
988 REPORT_LOCATION_ARGS(loc))) \
990 #define vWARN_dep(loc, m) \
991 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
992 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
994 REPORT_LOCATION_ARGS(loc)))
996 #define ckWARNdep(loc,m) \
997 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
998 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
1000 REPORT_LOCATION_ARGS(loc)))
1002 #define ckWARNregdep(loc,m) \
1003 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1004 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1006 m REPORT_LOCATION, \
1007 REPORT_LOCATION_ARGS(loc)))
1009 #define ckWARN2reg_d(loc,m, a1) \
1010 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1011 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1012 m REPORT_LOCATION, \
1013 a1, REPORT_LOCATION_ARGS(loc)))
1015 #define ckWARN2reg(loc, m, a1) \
1016 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1017 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1018 m REPORT_LOCATION, \
1019 a1, REPORT_LOCATION_ARGS(loc)))
1021 #define vWARN3(loc, m, a1, a2) \
1022 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1023 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1024 m REPORT_LOCATION, \
1025 a1, a2, REPORT_LOCATION_ARGS(loc)))
1027 #define ckWARN3reg(loc, m, a1, a2) \
1028 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1029 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1030 m REPORT_LOCATION, \
1032 REPORT_LOCATION_ARGS(loc)))
1034 #define vWARN4(loc, m, a1, a2, a3) \
1035 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1036 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1037 m REPORT_LOCATION, \
1039 REPORT_LOCATION_ARGS(loc)))
1041 #define ckWARN4reg(loc, m, a1, a2, a3) \
1042 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1043 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1044 m REPORT_LOCATION, \
1046 REPORT_LOCATION_ARGS(loc)))
1048 #define vWARN5(loc, m, a1, a2, a3, a4) \
1049 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1050 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1051 m REPORT_LOCATION, \
1053 REPORT_LOCATION_ARGS(loc)))
1055 #define ckWARNexperimental(loc, class, m) \
1057 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1058 RExC_warned_ ## class = 1; \
1059 _WARN_HELPER(loc, packWARN(class), \
1060 Perl_ck_warner_d(aTHX_ packWARN(class), \
1061 m REPORT_LOCATION, \
1062 REPORT_LOCATION_ARGS(loc)));\
1066 /* Convert between a pointer to a node and its offset from the beginning of the
1068 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1069 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1071 /* Macros for recording node offsets. 20001227 mjd@plover.com
1072 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1073 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1074 * Element 0 holds the number n.
1075 * Position is 1 indexed.
1077 #ifndef RE_TRACK_PATTERN_OFFSETS
1078 #define Set_Node_Offset_To_R(offset,byte)
1079 #define Set_Node_Offset(node,byte)
1080 #define Set_Cur_Node_Offset
1081 #define Set_Node_Length_To_R(node,len)
1082 #define Set_Node_Length(node,len)
1083 #define Set_Node_Cur_Length(node,start)
1084 #define Node_Offset(n)
1085 #define Node_Length(n)
1086 #define Set_Node_Offset_Length(node,offset,len)
1087 #define ProgLen(ri) ri->u.proglen
1088 #define SetProgLen(ri,x) ri->u.proglen = x
1089 #define Track_Code(code)
1091 #define ProgLen(ri) ri->u.offsets[0]
1092 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1093 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
1094 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
1095 __LINE__, (int)(offset), (int)(byte))); \
1096 if((offset) < 0) { \
1097 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
1100 RExC_offsets[2*(offset)-1] = (byte); \
1104 #define Set_Node_Offset(node,byte) \
1105 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1106 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1108 #define Set_Node_Length_To_R(node,len) STMT_START { \
1109 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1110 __LINE__, (int)(node), (int)(len))); \
1112 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1115 RExC_offsets[2*(node)] = (len); \
1119 #define Set_Node_Length(node,len) \
1120 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1121 #define Set_Node_Cur_Length(node, start) \
1122 Set_Node_Length(node, RExC_parse - start)
1124 /* Get offsets and lengths */
1125 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1126 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1128 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1129 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1130 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1133 #define Track_Code(code) STMT_START { code } STMT_END
1136 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1137 #define EXPERIMENTAL_INPLACESCAN
1138 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1142 Perl_re_printf(pTHX_ const char *fmt, ...)
1146 PerlIO *f= Perl_debug_log;
1147 PERL_ARGS_ASSERT_RE_PRINTF;
1149 result = PerlIO_vprintf(f, fmt, ap);
1155 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1159 PerlIO *f= Perl_debug_log;
1160 PERL_ARGS_ASSERT_RE_INDENTF;
1161 va_start(ap, depth);
1162 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1163 result = PerlIO_vprintf(f, fmt, ap);
1167 #endif /* DEBUGGING */
1169 #define DEBUG_RExC_seen() \
1170 DEBUG_OPTIMISE_MORE_r({ \
1171 Perl_re_printf( aTHX_ "RExC_seen: "); \
1173 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1174 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1176 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1177 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1179 if (RExC_seen & REG_GPOS_SEEN) \
1180 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1182 if (RExC_seen & REG_RECURSE_SEEN) \
1183 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1185 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1186 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1188 if (RExC_seen & REG_VERBARG_SEEN) \
1189 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1191 if (RExC_seen & REG_CUTGROUP_SEEN) \
1192 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1194 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1195 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1197 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1198 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1200 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1201 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1203 Perl_re_printf( aTHX_ "\n"); \
1206 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1207 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1212 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1213 const char *close_str)
1218 Perl_re_printf( aTHX_ "%s", open_str);
1219 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1221 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1227 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1228 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1229 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1230 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1231 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1232 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1233 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1234 Perl_re_printf( aTHX_ "%s", close_str);
1239 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1240 U32 depth, int is_inf)
1242 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1244 DEBUG_OPTIMISE_MORE_r({
1247 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1251 (IV)data->pos_delta,
1255 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1257 Perl_re_printf( aTHX_
1258 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1260 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1261 is_inf ? "INF " : ""
1264 if (data->last_found) {
1266 Perl_re_printf(aTHX_
1267 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1268 SvPVX_const(data->last_found),
1270 (IV)data->last_start_min,
1271 (IV)data->last_start_max
1274 for (i = 0; i < 2; i++) {
1275 Perl_re_printf(aTHX_
1276 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1277 data->cur_is_floating == i ? "*" : "",
1278 i ? "Float" : "Fixed",
1279 SvPVX_const(data->substrs[i].str),
1280 (IV)data->substrs[i].min_offset,
1281 (IV)data->substrs[i].max_offset
1283 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1287 Perl_re_printf( aTHX_ "\n");
1293 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1294 regnode *scan, U32 depth, U32 flags)
1296 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1303 Next = regnext(scan);
1304 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1305 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1308 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1309 Next ? (REG_NODE_NUM(Next)) : 0 );
1310 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1311 Perl_re_printf( aTHX_ "\n");
1316 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1317 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1319 # define DEBUG_PEEP(str, scan, depth, flags) \
1320 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1323 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1324 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1328 /* =========================================================
1329 * BEGIN edit_distance stuff.
1331 * This calculates how many single character changes of any type are needed to
1332 * transform a string into another one. It is taken from version 3.1 of
1334 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1337 /* Our unsorted dictionary linked list. */
1338 /* Note we use UVs, not chars. */
1343 struct dictionary* next;
1345 typedef struct dictionary item;
1348 PERL_STATIC_INLINE item*
1349 push(UV key, item* curr)
1352 Newx(head, 1, item);
1360 PERL_STATIC_INLINE item*
1361 find(item* head, UV key)
1363 item* iterator = head;
1365 if (iterator->key == key){
1368 iterator = iterator->next;
1374 PERL_STATIC_INLINE item*
1375 uniquePush(item* head, UV key)
1377 item* iterator = head;
1380 if (iterator->key == key) {
1383 iterator = iterator->next;
1386 return push(key, head);
1389 PERL_STATIC_INLINE void
1390 dict_free(item* head)
1392 item* iterator = head;
1395 item* temp = iterator;
1396 iterator = iterator->next;
1403 /* End of Dictionary Stuff */
1405 /* All calculations/work are done here */
1407 S_edit_distance(const UV* src,
1409 const STRLEN x, /* length of src[] */
1410 const STRLEN y, /* length of tgt[] */
1411 const SSize_t maxDistance
1415 UV swapCount, swapScore, targetCharCount, i, j;
1417 UV score_ceil = x + y;
1419 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1421 /* intialize matrix start values */
1422 Newx(scores, ( (x + 2) * (y + 2)), UV);
1423 scores[0] = score_ceil;
1424 scores[1 * (y + 2) + 0] = score_ceil;
1425 scores[0 * (y + 2) + 1] = score_ceil;
1426 scores[1 * (y + 2) + 1] = 0;
1427 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1432 for (i=1;i<=x;i++) {
1434 head = uniquePush(head, src[i]);
1435 scores[(i+1) * (y + 2) + 1] = i;
1436 scores[(i+1) * (y + 2) + 0] = score_ceil;
1439 for (j=1;j<=y;j++) {
1442 head = uniquePush(head, tgt[j]);
1443 scores[1 * (y + 2) + (j + 1)] = j;
1444 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1447 targetCharCount = find(head, tgt[j-1])->value;
1448 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1450 if (src[i-1] != tgt[j-1]){
1451 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1455 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1459 find(head, src[i-1])->value = i;
1463 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1466 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1470 /* END of edit_distance() stuff
1471 * ========================================================= */
1473 /* Mark that we cannot extend a found fixed substring at this point.
1474 Update the longest found anchored substring or the longest found
1475 floating substrings if needed. */
1478 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1479 SSize_t *minlenp, int is_inf)
1481 const STRLEN l = CHR_SVLEN(data->last_found);
1482 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1483 const STRLEN old_l = CHR_SVLEN(longest_sv);
1484 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1486 PERL_ARGS_ASSERT_SCAN_COMMIT;
1488 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1489 const U8 i = data->cur_is_floating;
1490 SvSetMagicSV(longest_sv, data->last_found);
1491 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1494 data->substrs[0].max_offset = data->substrs[0].min_offset;
1496 data->substrs[1].max_offset =
1500 ? data->last_start_max
1501 /* temporary underflow guard for 5.32 */
1502 : data->pos_delta < 0 ? OPTIMIZE_INFTY
1503 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1505 : data->pos_min + data->pos_delta));
1508 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1509 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1510 data->substrs[i].minlenp = minlenp;
1511 data->substrs[i].lookbehind = 0;
1514 SvCUR_set(data->last_found, 0);
1516 SV * const sv = data->last_found;
1517 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1518 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1523 data->last_end = -1;
1524 data->flags &= ~SF_BEFORE_EOL;
1525 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1528 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1529 * list that describes which code points it matches */
1532 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1534 /* Set the SSC 'ssc' to match an empty string or any code point */
1536 PERL_ARGS_ASSERT_SSC_ANYTHING;
1538 assert(is_ANYOF_SYNTHETIC(ssc));
1540 /* mortalize so won't leak */
1541 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1542 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1546 S_ssc_is_anything(const regnode_ssc *ssc)
1548 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1549 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1550 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1551 * in any way, so there's no point in using it */
1556 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1558 assert(is_ANYOF_SYNTHETIC(ssc));
1560 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1564 /* See if the list consists solely of the range 0 - Infinity */
1565 invlist_iterinit(ssc->invlist);
1566 ret = invlist_iternext(ssc->invlist, &start, &end)
1570 invlist_iterfinish(ssc->invlist);
1576 /* If e.g., both \w and \W are set, matches everything */
1577 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1579 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1580 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1590 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1592 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1593 * string, any code point, or any posix class under locale */
1595 PERL_ARGS_ASSERT_SSC_INIT;
1597 Zero(ssc, 1, regnode_ssc);
1598 set_ANYOF_SYNTHETIC(ssc);
1599 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1602 /* If any portion of the regex is to operate under locale rules that aren't
1603 * fully known at compile time, initialization includes it. The reason
1604 * this isn't done for all regexes is that the optimizer was written under
1605 * the assumption that locale was all-or-nothing. Given the complexity and
1606 * lack of documentation in the optimizer, and that there are inadequate
1607 * test cases for locale, many parts of it may not work properly, it is
1608 * safest to avoid locale unless necessary. */
1609 if (RExC_contains_locale) {
1610 ANYOF_POSIXL_SETALL(ssc);
1613 ANYOF_POSIXL_ZERO(ssc);
1618 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1619 const regnode_ssc *ssc)
1621 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1622 * to the list of code points matched, and locale posix classes; hence does
1623 * not check its flags) */
1628 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1630 assert(is_ANYOF_SYNTHETIC(ssc));
1632 invlist_iterinit(ssc->invlist);
1633 ret = invlist_iternext(ssc->invlist, &start, &end)
1637 invlist_iterfinish(ssc->invlist);
1643 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1650 #define INVLIST_INDEX 0
1651 #define ONLY_LOCALE_MATCHES_INDEX 1
1652 #define DEFERRED_USER_DEFINED_INDEX 2
1655 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1656 const regnode_charclass* const node)
1658 /* Returns a mortal inversion list defining which code points are matched
1659 * by 'node', which is of type ANYOF. Handles complementing the result if
1660 * appropriate. If some code points aren't knowable at this time, the
1661 * returned list must, and will, contain every code point that is a
1666 SV* only_utf8_locale_invlist = NULL;
1668 const U32 n = ARG(node);
1669 bool new_node_has_latin1 = FALSE;
1670 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1672 : ANYOF_FLAGS(node);
1674 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1676 /* Look at the data structure created by S_set_ANYOF_arg() */
1677 if (n != ANYOF_ONLY_HAS_BITMAP) {
1678 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1679 AV * const av = MUTABLE_AV(SvRV(rv));
1680 SV **const ary = AvARRAY(av);
1681 assert(RExC_rxi->data->what[n] == 's');
1683 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1685 /* Here there are things that won't be known until runtime -- we
1686 * have to assume it could be anything */
1687 invlist = sv_2mortal(_new_invlist(1));
1688 return _add_range_to_invlist(invlist, 0, UV_MAX);
1690 else if (ary[INVLIST_INDEX]) {
1692 /* Use the node's inversion list */
1693 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1696 /* Get the code points valid only under UTF-8 locales */
1697 if ( (flags & ANYOFL_FOLD)
1698 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1700 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1705 invlist = sv_2mortal(_new_invlist(0));
1708 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1709 * code points, and an inversion list for the others, but if there are code
1710 * points that should match only conditionally on the target string being
1711 * UTF-8, those are placed in the inversion list, and not the bitmap.
1712 * Since there are circumstances under which they could match, they are
1713 * included in the SSC. But if the ANYOF node is to be inverted, we have
1714 * to exclude them here, so that when we invert below, the end result
1715 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1716 * have to do this here before we add the unconditionally matched code
1718 if (flags & ANYOF_INVERT) {
1719 _invlist_intersection_complement_2nd(invlist,
1724 /* Add in the points from the bit map */
1725 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1726 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1727 if (ANYOF_BITMAP_TEST(node, i)) {
1728 unsigned int start = i++;
1730 for (; i < NUM_ANYOF_CODE_POINTS
1731 && ANYOF_BITMAP_TEST(node, i); ++i)
1735 invlist = _add_range_to_invlist(invlist, start, i-1);
1736 new_node_has_latin1 = TRUE;
1741 /* If this can match all upper Latin1 code points, have to add them
1742 * as well. But don't add them if inverting, as when that gets done below,
1743 * it would exclude all these characters, including the ones it shouldn't
1744 * that were added just above */
1745 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1746 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1748 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1751 /* Similarly for these */
1752 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1753 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1756 if (flags & ANYOF_INVERT) {
1757 _invlist_invert(invlist);
1759 else if (flags & ANYOFL_FOLD) {
1760 if (new_node_has_latin1) {
1762 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1763 * the locale. We can skip this if there are no 0-255 at all. */
1764 _invlist_union(invlist, PL_Latin1, &invlist);
1766 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1767 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1770 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1771 invlist = add_cp_to_invlist(invlist, 'I');
1773 if (_invlist_contains_cp(invlist,
1774 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1776 invlist = add_cp_to_invlist(invlist, 'i');
1781 /* Similarly add the UTF-8 locale possible matches. These have to be
1782 * deferred until after the non-UTF-8 locale ones are taken care of just
1783 * above, or it leads to wrong results under ANYOF_INVERT */
1784 if (only_utf8_locale_invlist) {
1785 _invlist_union_maybe_complement_2nd(invlist,
1786 only_utf8_locale_invlist,
1787 flags & ANYOF_INVERT,
1794 /* These two functions currently do the exact same thing */
1795 #define ssc_init_zero ssc_init
1797 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1798 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1800 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1801 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1802 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1805 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1806 const regnode_charclass *and_with)
1808 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1809 * another SSC or a regular ANYOF class. Can create false positives. */
1812 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1814 : ANYOF_FLAGS(and_with);
1817 PERL_ARGS_ASSERT_SSC_AND;
1819 assert(is_ANYOF_SYNTHETIC(ssc));
1821 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1822 * the code point inversion list and just the relevant flags */
1823 if (is_ANYOF_SYNTHETIC(and_with)) {
1824 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1825 anded_flags = and_with_flags;
1827 /* XXX This is a kludge around what appears to be deficiencies in the
1828 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1829 * there are paths through the optimizer where it doesn't get weeded
1830 * out when it should. And if we don't make some extra provision for
1831 * it like the code just below, it doesn't get added when it should.
1832 * This solution is to add it only when AND'ing, which is here, and
1833 * only when what is being AND'ed is the pristine, original node
1834 * matching anything. Thus it is like adding it to ssc_anything() but
1835 * only when the result is to be AND'ed. Probably the same solution
1836 * could be adopted for the same problem we have with /l matching,
1837 * which is solved differently in S_ssc_init(), and that would lead to
1838 * fewer false positives than that solution has. But if this solution
1839 * creates bugs, the consequences are only that a warning isn't raised
1840 * that should be; while the consequences for having /l bugs is
1841 * incorrect matches */
1842 if (ssc_is_anything((regnode_ssc *)and_with)) {
1843 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1847 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1848 if (OP(and_with) == ANYOFD) {
1849 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1852 anded_flags = and_with_flags
1853 &( ANYOF_COMMON_FLAGS
1854 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1855 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1856 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1858 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1863 ANYOF_FLAGS(ssc) &= anded_flags;
1865 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1866 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1867 * 'and_with' may be inverted. When not inverted, we have the situation of
1869 * (C1 | P1) & (C2 | P2)
1870 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1871 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1872 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1873 * <= ((C1 & C2) | P1 | P2)
1874 * Alternatively, the last few steps could be:
1875 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1876 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1877 * <= (C1 | C2 | (P1 & P2))
1878 * We favor the second approach if either P1 or P2 is non-empty. This is
1879 * because these components are a barrier to doing optimizations, as what
1880 * they match cannot be known until the moment of matching as they are
1881 * dependent on the current locale, 'AND"ing them likely will reduce or
1883 * But we can do better if we know that C1,P1 are in their initial state (a
1884 * frequent occurrence), each matching everything:
1885 * (<everything>) & (C2 | P2) = C2 | P2
1886 * Similarly, if C2,P2 are in their initial state (again a frequent
1887 * occurrence), the result is a no-op
1888 * (C1 | P1) & (<everything>) = C1 | P1
1891 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1892 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1893 * <= (C1 & ~C2) | (P1 & ~P2)
1896 if ((and_with_flags & ANYOF_INVERT)
1897 && ! is_ANYOF_SYNTHETIC(and_with))
1901 ssc_intersection(ssc,
1903 FALSE /* Has already been inverted */
1906 /* If either P1 or P2 is empty, the intersection will be also; can skip
1908 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1909 ANYOF_POSIXL_ZERO(ssc);
1911 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1913 /* Note that the Posix class component P from 'and_with' actually
1915 * P = Pa | Pb | ... | Pn
1916 * where each component is one posix class, such as in [\w\s].
1918 * ~P = ~(Pa | Pb | ... | Pn)
1919 * = ~Pa & ~Pb & ... & ~Pn
1920 * <= ~Pa | ~Pb | ... | ~Pn
1921 * The last is something we can easily calculate, but unfortunately
1922 * is likely to have many false positives. We could do better
1923 * in some (but certainly not all) instances if two classes in
1924 * P have known relationships. For example
1925 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1927 * :lower: & :print: = :lower:
1928 * And similarly for classes that must be disjoint. For example,
1929 * since \s and \w can have no elements in common based on rules in
1930 * the POSIX standard,
1931 * \w & ^\S = nothing
1932 * Unfortunately, some vendor locales do not meet the Posix
1933 * standard, in particular almost everything by Microsoft.
1934 * The loop below just changes e.g., \w into \W and vice versa */
1936 regnode_charclass_posixl temp;
1937 int add = 1; /* To calculate the index of the complement */
1939 Zero(&temp, 1, regnode_charclass_posixl);
1940 ANYOF_POSIXL_ZERO(&temp);
1941 for (i = 0; i < ANYOF_MAX; i++) {
1943 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1944 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1946 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1947 ANYOF_POSIXL_SET(&temp, i + add);
1949 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1951 ANYOF_POSIXL_AND(&temp, ssc);
1953 } /* else ssc already has no posixes */
1954 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1955 in its initial state */
1956 else if (! is_ANYOF_SYNTHETIC(and_with)
1957 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1959 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1960 * copy it over 'ssc' */
1961 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1962 if (is_ANYOF_SYNTHETIC(and_with)) {
1963 StructCopy(and_with, ssc, regnode_ssc);
1966 ssc->invlist = anded_cp_list;
1967 ANYOF_POSIXL_ZERO(ssc);
1968 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1969 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1973 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1974 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1976 /* One or the other of P1, P2 is non-empty. */
1977 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1978 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1980 ssc_union(ssc, anded_cp_list, FALSE);
1982 else { /* P1 = P2 = empty */
1983 ssc_intersection(ssc, anded_cp_list, FALSE);
1989 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1990 const regnode_charclass *or_with)
1992 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1993 * another SSC or a regular ANYOF class. Can create false positives if
1994 * 'or_with' is to be inverted. */
1998 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
2000 : ANYOF_FLAGS(or_with);
2002 PERL_ARGS_ASSERT_SSC_OR;
2004 assert(is_ANYOF_SYNTHETIC(ssc));
2006 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2007 * the code point inversion list and just the relevant flags */
2008 if (is_ANYOF_SYNTHETIC(or_with)) {
2009 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2010 ored_flags = or_with_flags;
2013 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2014 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2015 if (OP(or_with) != ANYOFD) {
2018 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2019 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2020 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2022 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2027 ANYOF_FLAGS(ssc) |= ored_flags;
2029 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2030 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2031 * 'or_with' may be inverted. When not inverted, we have the simple
2032 * situation of computing:
2033 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2034 * If P1|P2 yields a situation with both a class and its complement are
2035 * set, like having both \w and \W, this matches all code points, and we
2036 * can delete these from the P component of the ssc going forward. XXX We
2037 * might be able to delete all the P components, but I (khw) am not certain
2038 * about this, and it is better to be safe.
2041 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2042 * <= (C1 | P1) | ~C2
2043 * <= (C1 | ~C2) | P1
2044 * (which results in actually simpler code than the non-inverted case)
2047 if ((or_with_flags & ANYOF_INVERT)
2048 && ! is_ANYOF_SYNTHETIC(or_with))
2050 /* We ignore P2, leaving P1 going forward */
2051 } /* else Not inverted */
2052 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2053 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2054 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2056 for (i = 0; i < ANYOF_MAX; i += 2) {
2057 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2059 ssc_match_all_cp(ssc);
2060 ANYOF_POSIXL_CLEAR(ssc, i);
2061 ANYOF_POSIXL_CLEAR(ssc, i+1);
2069 FALSE /* Already has been inverted */
2074 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2076 PERL_ARGS_ASSERT_SSC_UNION;
2078 assert(is_ANYOF_SYNTHETIC(ssc));
2080 _invlist_union_maybe_complement_2nd(ssc->invlist,
2087 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2089 const bool invert2nd)
2091 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2093 assert(is_ANYOF_SYNTHETIC(ssc));
2095 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2102 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2104 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2106 assert(is_ANYOF_SYNTHETIC(ssc));
2108 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2112 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2114 /* AND just the single code point 'cp' into the SSC 'ssc' */
2116 SV* cp_list = _new_invlist(2);
2118 PERL_ARGS_ASSERT_SSC_CP_AND;
2120 assert(is_ANYOF_SYNTHETIC(ssc));
2122 cp_list = add_cp_to_invlist(cp_list, cp);
2123 ssc_intersection(ssc, cp_list,
2124 FALSE /* Not inverted */
2126 SvREFCNT_dec_NN(cp_list);
2130 S_ssc_clear_locale(regnode_ssc *ssc)
2132 /* Set the SSC 'ssc' to not match any locale things */
2133 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2135 assert(is_ANYOF_SYNTHETIC(ssc));
2137 ANYOF_POSIXL_ZERO(ssc);
2138 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2141 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2144 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2146 /* The synthetic start class is used to hopefully quickly winnow down
2147 * places where a pattern could start a match in the target string. If it
2148 * doesn't really narrow things down that much, there isn't much point to
2149 * having the overhead of using it. This function uses some very crude
2150 * heuristics to decide if to use the ssc or not.
2152 * It returns TRUE if 'ssc' rules out more than half what it considers to
2153 * be the "likely" possible matches, but of course it doesn't know what the
2154 * actual things being matched are going to be; these are only guesses
2156 * For /l matches, it assumes that the only likely matches are going to be
2157 * in the 0-255 range, uniformly distributed, so half of that is 127
2158 * For /a and /d matches, it assumes that the likely matches will be just
2159 * the ASCII range, so half of that is 63
2160 * For /u and there isn't anything matching above the Latin1 range, it
2161 * assumes that that is the only range likely to be matched, and uses
2162 * half that as the cut-off: 127. If anything matches above Latin1,
2163 * it assumes that all of Unicode could match (uniformly), except for
2164 * non-Unicode code points and things in the General Category "Other"
2165 * (unassigned, private use, surrogates, controls and formats). This
2166 * is a much large number. */
2168 U32 count = 0; /* Running total of number of code points matched by
2170 UV start, end; /* Start and end points of current range in inversion
2171 XXX outdated. UTF-8 locales are common, what about invert? list */
2172 const U32 max_code_points = (LOC)
2174 : (( ! UNI_SEMANTICS
2175 || invlist_highest(ssc->invlist) < 256)
2178 const U32 max_match = max_code_points / 2;
2180 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2182 invlist_iterinit(ssc->invlist);
2183 while (invlist_iternext(ssc->invlist, &start, &end)) {
2184 if (start >= max_code_points) {
2187 end = MIN(end, max_code_points - 1);
2188 count += end - start + 1;
2189 if (count >= max_match) {
2190 invlist_iterfinish(ssc->invlist);
2200 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2202 /* The inversion list in the SSC is marked mortal; now we need a more
2203 * permanent copy, which is stored the same way that is done in a regular
2204 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2207 SV* invlist = invlist_clone(ssc->invlist, NULL);
2209 PERL_ARGS_ASSERT_SSC_FINALIZE;
2211 assert(is_ANYOF_SYNTHETIC(ssc));
2213 /* The code in this file assumes that all but these flags aren't relevant
2214 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2215 * by the time we reach here */
2216 assert(! (ANYOF_FLAGS(ssc)
2217 & ~( ANYOF_COMMON_FLAGS
2218 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2219 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2221 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2223 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2224 SvREFCNT_dec(invlist);
2226 /* Make sure is clone-safe */
2227 ssc->invlist = NULL;
2229 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2230 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2231 OP(ssc) = ANYOFPOSIXL;
2233 else if (RExC_contains_locale) {
2237 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2240 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2241 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2242 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2243 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2244 ? (TRIE_LIST_CUR( idx ) - 1) \
2250 dump_trie(trie,widecharmap,revcharmap)
2251 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2252 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2254 These routines dump out a trie in a somewhat readable format.
2255 The _interim_ variants are used for debugging the interim
2256 tables that are used to generate the final compressed
2257 representation which is what dump_trie expects.
2259 Part of the reason for their existence is to provide a form
2260 of documentation as to how the different representations function.
2265 Dumps the final compressed table form of the trie to Perl_debug_log.
2266 Used for debugging make_trie().
2270 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2271 AV *revcharmap, U32 depth)
2274 SV *sv=sv_newmortal();
2275 int colwidth= widecharmap ? 6 : 4;
2277 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2279 PERL_ARGS_ASSERT_DUMP_TRIE;
2281 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2282 depth+1, "Match","Base","Ofs" );
2284 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2285 SV ** const tmp = av_fetch( revcharmap, state, 0);
2287 Perl_re_printf( aTHX_ "%*s",
2289 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2290 PL_colors[0], PL_colors[1],
2291 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2292 PERL_PV_ESCAPE_FIRSTCHAR
2297 Perl_re_printf( aTHX_ "\n");
2298 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2300 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2301 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2302 Perl_re_printf( aTHX_ "\n");
2304 for( state = 1 ; state < trie->statecount ; state++ ) {
2305 const U32 base = trie->states[ state ].trans.base;
2307 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2309 if ( trie->states[ state ].wordnum ) {
2310 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2312 Perl_re_printf( aTHX_ "%6s", "" );
2315 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2320 while( ( base + ofs < trie->uniquecharcount ) ||
2321 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2322 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2326 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2328 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2329 if ( ( base + ofs >= trie->uniquecharcount )
2330 && ( base + ofs - trie->uniquecharcount
2332 && trie->trans[ base + ofs
2333 - trie->uniquecharcount ].check == state )
2335 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2336 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2339 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2343 Perl_re_printf( aTHX_ "]");
2346 Perl_re_printf( aTHX_ "\n" );
2348 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2350 for (word=1; word <= trie->wordcount; word++) {
2351 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2352 (int)word, (int)(trie->wordinfo[word].prev),
2353 (int)(trie->wordinfo[word].len));
2355 Perl_re_printf( aTHX_ "\n" );
2358 Dumps a fully constructed but uncompressed trie in list form.
2359 List tries normally only are used for construction when the number of
2360 possible chars (trie->uniquecharcount) is very high.
2361 Used for debugging make_trie().
2364 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2365 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2369 SV *sv=sv_newmortal();
2370 int colwidth= widecharmap ? 6 : 4;
2371 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2373 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2375 /* print out the table precompression. */
2376 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2378 Perl_re_indentf( aTHX_ "%s",
2379 depth+1, "------:-----+-----------------\n" );
2381 for( state=1 ; state < next_alloc ; state ++ ) {
2384 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2385 depth+1, (UV)state );
2386 if ( ! trie->states[ state ].wordnum ) {
2387 Perl_re_printf( aTHX_ "%5s| ","");
2389 Perl_re_printf( aTHX_ "W%4x| ",
2390 trie->states[ state ].wordnum
2393 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2394 SV ** const tmp = av_fetch( revcharmap,
2395 TRIE_LIST_ITEM(state, charid).forid, 0);
2397 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2399 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2401 PL_colors[0], PL_colors[1],
2402 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2403 | PERL_PV_ESCAPE_FIRSTCHAR
2405 TRIE_LIST_ITEM(state, charid).forid,
2406 (UV)TRIE_LIST_ITEM(state, charid).newstate
2409 Perl_re_printf( aTHX_ "\n%*s| ",
2410 (int)((depth * 2) + 14), "");
2413 Perl_re_printf( aTHX_ "\n");
2418 Dumps a fully constructed but uncompressed trie in table form.
2419 This is the normal DFA style state transition table, with a few
2420 twists to facilitate compression later.
2421 Used for debugging make_trie().
2424 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2425 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2430 SV *sv=sv_newmortal();
2431 int colwidth= widecharmap ? 6 : 4;
2432 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2434 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2437 print out the table precompression so that we can do a visual check
2438 that they are identical.
2441 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2443 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2444 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2446 Perl_re_printf( aTHX_ "%*s",
2448 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2449 PL_colors[0], PL_colors[1],
2450 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2451 PERL_PV_ESCAPE_FIRSTCHAR
2457 Perl_re_printf( aTHX_ "\n");
2458 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2460 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2461 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2464 Perl_re_printf( aTHX_ "\n" );
2466 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2468 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2470 (UV)TRIE_NODENUM( state ) );
2472 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2473 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2475 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2477 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2479 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2480 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2481 (UV)trie->trans[ state ].check );
2483 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2484 (UV)trie->trans[ state ].check,
2485 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2493 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2494 startbranch: the first branch in the whole branch sequence
2495 first : start branch of sequence of branch-exact nodes.
2496 May be the same as startbranch
2497 last : Thing following the last branch.
2498 May be the same as tail.
2499 tail : item following the branch sequence
2500 count : words in the sequence
2501 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2502 depth : indent depth
2504 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2506 A trie is an N'ary tree where the branches are determined by digital
2507 decomposition of the key. IE, at the root node you look up the 1st character and
2508 follow that branch repeat until you find the end of the branches. Nodes can be
2509 marked as "accepting" meaning they represent a complete word. Eg:
2513 would convert into the following structure. Numbers represent states, letters
2514 following numbers represent valid transitions on the letter from that state, if
2515 the number is in square brackets it represents an accepting state, otherwise it
2516 will be in parenthesis.
2518 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2522 (1) +-i->(6)-+-s->[7]
2524 +-s->(3)-+-h->(4)-+-e->[5]
2526 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2528 This shows that when matching against the string 'hers' we will begin at state 1
2529 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2530 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2531 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2532 single traverse. We store a mapping from accepting to state to which word was
2533 matched, and then when we have multiple possibilities we try to complete the
2534 rest of the regex in the order in which they occurred in the alternation.
2536 The only prior NFA like behaviour that would be changed by the TRIE support is
2537 the silent ignoring of duplicate alternations which are of the form:
2539 / (DUPE|DUPE) X? (?{ ... }) Y /x
2541 Thus EVAL blocks following a trie may be called a different number of times with
2542 and without the optimisation. With the optimisations dupes will be silently
2543 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2544 the following demonstrates:
2546 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2548 which prints out 'word' three times, but
2550 'words'=~/(word|word|word)(?{ print $1 })S/
2552 which doesnt print it out at all. This is due to other optimisations kicking in.
2554 Example of what happens on a structural level:
2556 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2558 1: CURLYM[1] {1,32767}(18)
2569 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2570 and should turn into:
2572 1: CURLYM[1] {1,32767}(18)
2574 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2582 Cases where tail != last would be like /(?foo|bar)baz/:
2592 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2593 and would end up looking like:
2596 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2603 d = uvchr_to_utf8_flags(d, uv, 0);
2605 is the recommended Unicode-aware way of saying
2610 #define TRIE_STORE_REVCHAR(val) \
2613 SV *zlopp = newSV(UTF8_MAXBYTES); \
2614 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2615 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2617 SvCUR_set(zlopp, kapow - flrbbbbb); \
2620 av_push(revcharmap, zlopp); \
2622 char ooooff = (char)val; \
2623 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2627 /* This gets the next character from the input, folding it if not already
2629 #define TRIE_READ_CHAR STMT_START { \
2632 /* if it is UTF then it is either already folded, or does not need \
2634 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2636 else if (folder == PL_fold_latin1) { \
2637 /* This folder implies Unicode rules, which in the range expressible \
2638 * by not UTF is the lower case, with the two exceptions, one of \
2639 * which should have been taken care of before calling this */ \
2640 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2641 uvc = toLOWER_L1(*uc); \
2642 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2645 /* raw data, will be folded later if needed */ \
2653 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2654 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2655 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2656 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2657 TRIE_LIST_LEN( state ) = ging; \
2659 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2660 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2661 TRIE_LIST_CUR( state )++; \
2664 #define TRIE_LIST_NEW(state) STMT_START { \
2665 Newx( trie->states[ state ].trans.list, \
2666 4, reg_trie_trans_le ); \
2667 TRIE_LIST_CUR( state ) = 1; \
2668 TRIE_LIST_LEN( state ) = 4; \
2671 #define TRIE_HANDLE_WORD(state) STMT_START { \
2672 U16 dupe= trie->states[ state ].wordnum; \
2673 regnode * const noper_next = regnext( noper ); \
2676 /* store the word for dumping */ \
2678 if (OP(noper) != NOTHING) \
2679 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2681 tmp = newSVpvn_utf8( "", 0, UTF ); \
2682 av_push( trie_words, tmp ); \
2686 trie->wordinfo[curword].prev = 0; \
2687 trie->wordinfo[curword].len = wordlen; \
2688 trie->wordinfo[curword].accept = state; \
2690 if ( noper_next < tail ) { \
2692 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2694 trie->jump[curword] = (U16)(noper_next - convert); \
2696 jumper = noper_next; \
2698 nextbranch= regnext(cur); \
2702 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2703 /* chain, so that when the bits of chain are later */\
2704 /* linked together, the dups appear in the chain */\
2705 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2706 trie->wordinfo[dupe].prev = curword; \
2708 /* we haven't inserted this word yet. */ \
2709 trie->states[ state ].wordnum = curword; \
2714 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2715 ( ( base + charid >= ucharcount \
2716 && base + charid < ubound \
2717 && state == trie->trans[ base - ucharcount + charid ].check \
2718 && trie->trans[ base - ucharcount + charid ].next ) \
2719 ? trie->trans[ base - ucharcount + charid ].next \
2720 : ( state==1 ? special : 0 ) \
2723 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2725 TRIE_BITMAP_SET(trie, uvc); \
2726 /* store the folded codepoint */ \
2728 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2731 /* store first byte of utf8 representation of */ \
2732 /* variant codepoints */ \
2733 if (! UVCHR_IS_INVARIANT(uvc)) { \
2734 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2739 #define MADE_JUMP_TRIE 2
2740 #define MADE_EXACT_TRIE 4
2743 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2744 regnode *first, regnode *last, regnode *tail,
2745 U32 word_count, U32 flags, U32 depth)
2747 /* first pass, loop through and scan words */
2748 reg_trie_data *trie;
2749 HV *widecharmap = NULL;
2750 AV *revcharmap = newAV();
2756 regnode *jumper = NULL;
2757 regnode *nextbranch = NULL;
2758 regnode *convert = NULL;
2759 U32 *prev_states; /* temp array mapping each state to previous one */
2760 /* we just use folder as a flag in utf8 */
2761 const U8 * folder = NULL;
2763 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2764 * which stands for one trie structure, one hash, optionally followed
2767 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2768 AV *trie_words = NULL;
2769 /* along with revcharmap, this only used during construction but both are
2770 * useful during debugging so we store them in the struct when debugging.
2773 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2774 STRLEN trie_charcount=0;
2776 SV *re_trie_maxbuff;
2777 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2779 PERL_ARGS_ASSERT_MAKE_TRIE;
2781 PERL_UNUSED_ARG(depth);
2785 case EXACT: case EXACT_REQ8: case EXACTL: break;
2789 case EXACTFLU8: folder = PL_fold_latin1; break;
2790 case EXACTF: folder = PL_fold; break;
2791 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2794 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2796 trie->startstate = 1;
2797 trie->wordcount = word_count;
2798 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2799 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2800 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2801 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2802 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2803 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2806 trie_words = newAV();
2809 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2810 assert(re_trie_maxbuff);
2811 if (!SvIOK(re_trie_maxbuff)) {
2812 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2814 DEBUG_TRIE_COMPILE_r({
2815 Perl_re_indentf( aTHX_
2816 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2818 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2819 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2822 /* Find the node we are going to overwrite */
2823 if ( first == startbranch && OP( last ) != BRANCH ) {
2824 /* whole branch chain */
2827 /* branch sub-chain */
2828 convert = NEXTOPER( first );
2831 /* -- First loop and Setup --
2833 We first traverse the branches and scan each word to determine if it
2834 contains widechars, and how many unique chars there are, this is
2835 important as we have to build a table with at least as many columns as we
2838 We use an array of integers to represent the character codes 0..255
2839 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2840 the native representation of the character value as the key and IV's for
2843 *TODO* If we keep track of how many times each character is used we can
2844 remap the columns so that the table compression later on is more
2845 efficient in terms of memory by ensuring the most common value is in the
2846 middle and the least common are on the outside. IMO this would be better
2847 than a most to least common mapping as theres a decent chance the most
2848 common letter will share a node with the least common, meaning the node
2849 will not be compressible. With a middle is most common approach the worst
2850 case is when we have the least common nodes twice.
2854 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2855 regnode *noper = NEXTOPER( cur );
2859 U32 wordlen = 0; /* required init */
2860 STRLEN minchars = 0;
2861 STRLEN maxchars = 0;
2862 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2865 if (OP(noper) == NOTHING) {
2866 /* skip past a NOTHING at the start of an alternation
2867 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2869 * If the next node is not something we are supposed to process
2870 * we will just ignore it due to the condition guarding the
2874 regnode *noper_next= regnext(noper);
2875 if (noper_next < tail)
2880 && ( OP(noper) == flags
2881 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2882 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2883 || OP(noper) == EXACTFUP))))
2885 uc= (U8*)STRING(noper);
2886 e= uc + STR_LEN(noper);
2893 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2894 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2895 regardless of encoding */
2896 if (OP( noper ) == EXACTFUP) {
2897 /* false positives are ok, so just set this */
2898 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2902 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2904 TRIE_CHARCOUNT(trie)++;
2907 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2908 * is in effect. Under /i, this character can match itself, or
2909 * anything that folds to it. If not under /i, it can match just
2910 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2911 * all fold to k, and all are single characters. But some folds
2912 * expand to more than one character, so for example LATIN SMALL
2913 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2914 * the string beginning at 'uc' is 'ffi', it could be matched by
2915 * three characters, or just by the one ligature character. (It
2916 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2917 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2918 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2919 * match.) The trie needs to know the minimum and maximum number
2920 * of characters that could match so that it can use size alone to
2921 * quickly reject many match attempts. The max is simple: it is
2922 * the number of folded characters in this branch (since a fold is
2923 * never shorter than what folds to it. */
2927 /* And the min is equal to the max if not under /i (indicated by
2928 * 'folder' being NULL), or there are no multi-character folds. If
2929 * there is a multi-character fold, the min is incremented just
2930 * once, for the character that folds to the sequence. Each
2931 * character in the sequence needs to be added to the list below of
2932 * characters in the trie, but we count only the first towards the
2933 * min number of characters needed. This is done through the
2934 * variable 'foldlen', which is returned by the macros that look
2935 * for these sequences as the number of bytes the sequence
2936 * occupies. Each time through the loop, we decrement 'foldlen' by
2937 * how many bytes the current char occupies. Only when it reaches
2938 * 0 do we increment 'minchars' or look for another multi-character
2940 if (folder == NULL) {
2943 else if (foldlen > 0) {
2944 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2949 /* See if *uc is the beginning of a multi-character fold. If
2950 * so, we decrement the length remaining to look at, to account
2951 * for the current character this iteration. (We can use 'uc'
2952 * instead of the fold returned by TRIE_READ_CHAR because for
2953 * non-UTF, the latin1_safe macro is smart enough to account
2954 * for all the unfolded characters, and because for UTF, the
2955 * string will already have been folded earlier in the
2956 * compilation process */
2958 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2959 foldlen -= UTF8SKIP(uc);
2962 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2967 /* The current character (and any potential folds) should be added
2968 * to the possible matching characters for this position in this
2972 U8 folded= folder[ (U8) uvc ];
2973 if ( !trie->charmap[ folded ] ) {
2974 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2975 TRIE_STORE_REVCHAR( folded );
2978 if ( !trie->charmap[ uvc ] ) {
2979 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2980 TRIE_STORE_REVCHAR( uvc );
2983 /* store the codepoint in the bitmap, and its folded
2985 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2986 set_bit = 0; /* We've done our bit :-) */
2990 /* XXX We could come up with the list of code points that fold
2991 * to this using PL_utf8_foldclosures, except not for
2992 * multi-char folds, as there may be multiple combinations
2993 * there that could work, which needs to wait until runtime to
2994 * resolve (The comment about LIGATURE FFI above is such an
2999 widecharmap = newHV();
3001 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
3004 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3006 if ( !SvTRUE( *svpp ) ) {
3007 sv_setiv( *svpp, ++trie->uniquecharcount );
3008 TRIE_STORE_REVCHAR(uvc);
3011 } /* end loop through characters in this branch of the trie */
3013 /* We take the min and max for this branch and combine to find the min
3014 * and max for all branches processed so far */
3015 if( cur == first ) {
3016 trie->minlen = minchars;
3017 trie->maxlen = maxchars;
3018 } else if (minchars < trie->minlen) {
3019 trie->minlen = minchars;
3020 } else if (maxchars > trie->maxlen) {
3021 trie->maxlen = maxchars;
3023 } /* end first pass */
3024 DEBUG_TRIE_COMPILE_r(
3025 Perl_re_indentf( aTHX_
3026 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3028 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3029 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3030 (int)trie->minlen, (int)trie->maxlen )
3034 We now know what we are dealing with in terms of unique chars and
3035 string sizes so we can calculate how much memory a naive
3036 representation using a flat table will take. If it's over a reasonable
3037 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3038 conservative but potentially much slower representation using an array
3041 At the end we convert both representations into the same compressed
3042 form that will be used in regexec.c for matching with. The latter
3043 is a form that cannot be used to construct with but has memory
3044 properties similar to the list form and access properties similar
3045 to the table form making it both suitable for fast searches and
3046 small enough that its feasable to store for the duration of a program.
3048 See the comment in the code where the compressed table is produced
3049 inplace from the flat tabe representation for an explanation of how
3050 the compression works.
3055 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3058 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3059 > SvIV(re_trie_maxbuff) )
3062 Second Pass -- Array Of Lists Representation
3064 Each state will be represented by a list of charid:state records
3065 (reg_trie_trans_le) the first such element holds the CUR and LEN
3066 points of the allocated array. (See defines above).
3068 We build the initial structure using the lists, and then convert
3069 it into the compressed table form which allows faster lookups
3070 (but cant be modified once converted).
3073 STRLEN transcount = 1;
3075 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3078 trie->states = (reg_trie_state *)
3079 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3080 sizeof(reg_trie_state) );
3084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3086 regnode *noper = NEXTOPER( cur );
3087 U32 state = 1; /* required init */
3088 U16 charid = 0; /* sanity init */
3089 U32 wordlen = 0; /* required init */
3091 if (OP(noper) == NOTHING) {
3092 regnode *noper_next= regnext(noper);
3093 if (noper_next < tail)
3095 /* we will undo this assignment if noper does not
3096 * point at a trieable type in the else clause of
3097 * the following statement. */
3101 && ( OP(noper) == flags
3102 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3103 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3104 || OP(noper) == EXACTFUP))))
3106 const U8 *uc= (U8*)STRING(noper);
3107 const U8 *e= uc + STR_LEN(noper);
3109 for ( ; uc < e ; uc += len ) {
3114 charid = trie->charmap[ uvc ];
3116 SV** const svpp = hv_fetch( widecharmap,
3123 charid=(U16)SvIV( *svpp );
3126 /* charid is now 0 if we dont know the char read, or
3127 * nonzero if we do */
3134 if ( !trie->states[ state ].trans.list ) {
3135 TRIE_LIST_NEW( state );
3138 check <= TRIE_LIST_USED( state );
3141 if ( TRIE_LIST_ITEM( state, check ).forid
3144 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3149 newstate = next_alloc++;
3150 prev_states[newstate] = state;
3151 TRIE_LIST_PUSH( state, charid, newstate );
3156 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3160 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3161 * on a trieable type. So we need to reset noper back to point at the first regop
3162 * in the branch before we call TRIE_HANDLE_WORD()
3164 noper= NEXTOPER(cur);
3166 TRIE_HANDLE_WORD(state);
3168 } /* end second pass */
3170 /* next alloc is the NEXT state to be allocated */
3171 trie->statecount = next_alloc;
3172 trie->states = (reg_trie_state *)
3173 PerlMemShared_realloc( trie->states,
3175 * sizeof(reg_trie_state) );
3177 /* and now dump it out before we compress it */
3178 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3179 revcharmap, next_alloc,
3183 trie->trans = (reg_trie_trans *)
3184 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3191 for( state=1 ; state < next_alloc ; state ++ ) {
3195 DEBUG_TRIE_COMPILE_MORE_r(
3196 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3200 if (trie->states[state].trans.list) {
3201 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3205 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3206 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3207 if ( forid < minid ) {
3209 } else if ( forid > maxid ) {
3213 if ( transcount < tp + maxid - minid + 1) {
3215 trie->trans = (reg_trie_trans *)
3216 PerlMemShared_realloc( trie->trans,
3218 * sizeof(reg_trie_trans) );
3219 Zero( trie->trans + (transcount / 2),
3223 base = trie->uniquecharcount + tp - minid;
3224 if ( maxid == minid ) {
3226 for ( ; zp < tp ; zp++ ) {
3227 if ( ! trie->trans[ zp ].next ) {
3228 base = trie->uniquecharcount + zp - minid;
3229 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3231 trie->trans[ zp ].check = state;
3237 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3239 trie->trans[ tp ].check = state;
3244 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3245 const U32 tid = base
3246 - trie->uniquecharcount
3247 + TRIE_LIST_ITEM( state, idx ).forid;
3248 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3250 trie->trans[ tid ].check = state;
3252 tp += ( maxid - minid + 1 );
3254 Safefree(trie->states[ state ].trans.list);
3257 DEBUG_TRIE_COMPILE_MORE_r(
3258 Perl_re_printf( aTHX_ " base: %d\n",base);
3261 trie->states[ state ].trans.base=base;
3263 trie->lasttrans = tp + 1;
3267 Second Pass -- Flat Table Representation.
3269 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3270 each. We know that we will need Charcount+1 trans at most to store
3271 the data (one row per char at worst case) So we preallocate both
3272 structures assuming worst case.
3274 We then construct the trie using only the .next slots of the entry
3277 We use the .check field of the first entry of the node temporarily
3278 to make compression both faster and easier by keeping track of how
3279 many non zero fields are in the node.
3281 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3284 There are two terms at use here: state as a TRIE_NODEIDX() which is
3285 a number representing the first entry of the node, and state as a
3286 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3287 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3288 if there are 2 entrys per node. eg:
3296 The table is internally in the right hand, idx form. However as we
3297 also have to deal with the states array which is indexed by nodenum
3298 we have to use TRIE_NODENUM() to convert.
3301 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3304 trie->trans = (reg_trie_trans *)
3305 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3306 * trie->uniquecharcount + 1,
3307 sizeof(reg_trie_trans) );
3308 trie->states = (reg_trie_state *)
3309 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3310 sizeof(reg_trie_state) );
3311 next_alloc = trie->uniquecharcount + 1;
3314 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3316 regnode *noper = NEXTOPER( cur );
3318 U32 state = 1; /* required init */
3320 U16 charid = 0; /* sanity init */
3321 U32 accept_state = 0; /* sanity init */
3323 U32 wordlen = 0; /* required init */
3325 if (OP(noper) == NOTHING) {
3326 regnode *noper_next= regnext(noper);
3327 if (noper_next < tail)
3329 /* we will undo this assignment if noper does not
3330 * point at a trieable type in the else clause of
3331 * the following statement. */
3335 && ( OP(noper) == flags
3336 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3337 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3338 || OP(noper) == EXACTFUP))))
3340 const U8 *uc= (U8*)STRING(noper);
3341 const U8 *e= uc + STR_LEN(noper);
3343 for ( ; uc < e ; uc += len ) {
3348 charid = trie->charmap[ uvc ];
3350 SV* const * const svpp = hv_fetch( widecharmap,
3354 charid = svpp ? (U16)SvIV(*svpp) : 0;
3358 if ( !trie->trans[ state + charid ].next ) {
3359 trie->trans[ state + charid ].next = next_alloc;
3360 trie->trans[ state ].check++;
3361 prev_states[TRIE_NODENUM(next_alloc)]
3362 = TRIE_NODENUM(state);
3363 next_alloc += trie->uniquecharcount;
3365 state = trie->trans[ state + charid ].next;
3367 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3369 /* charid is now 0 if we dont know the char read, or
3370 * nonzero if we do */
3373 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3374 * on a trieable type. So we need to reset noper back to point at the first regop
3375 * in the branch before we call TRIE_HANDLE_WORD().
3377 noper= NEXTOPER(cur);
3379 accept_state = TRIE_NODENUM( state );
3380 TRIE_HANDLE_WORD(accept_state);
3382 } /* end second pass */
3384 /* and now dump it out before we compress it */
3385 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3387 next_alloc, depth+1));
3391 * Inplace compress the table.*
3393 For sparse data sets the table constructed by the trie algorithm will
3394 be mostly 0/FAIL transitions or to put it another way mostly empty.
3395 (Note that leaf nodes will not contain any transitions.)
3397 This algorithm compresses the tables by eliminating most such
3398 transitions, at the cost of a modest bit of extra work during lookup:
3400 - Each states[] entry contains a .base field which indicates the
3401 index in the state[] array wheres its transition data is stored.
3403 - If .base is 0 there are no valid transitions from that node.
3405 - If .base is nonzero then charid is added to it to find an entry in
3408 -If trans[states[state].base+charid].check!=state then the
3409 transition is taken to be a 0/Fail transition. Thus if there are fail
3410 transitions at the front of the node then the .base offset will point
3411 somewhere inside the previous nodes data (or maybe even into a node
3412 even earlier), but the .check field determines if the transition is
3416 The following process inplace converts the table to the compressed
3417 table: We first do not compress the root node 1,and mark all its
3418 .check pointers as 1 and set its .base pointer as 1 as well. This
3419 allows us to do a DFA construction from the compressed table later,
3420 and ensures that any .base pointers we calculate later are greater
3423 - We set 'pos' to indicate the first entry of the second node.
3425 - We then iterate over the columns of the node, finding the first and
3426 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3427 and set the .check pointers accordingly, and advance pos
3428 appropriately and repreat for the next node. Note that when we copy
3429 the next pointers we have to convert them from the original
3430 NODEIDX form to NODENUM form as the former is not valid post
3433 - If a node has no transitions used we mark its base as 0 and do not
3434 advance the pos pointer.
3436 - If a node only has one transition we use a second pointer into the
3437 structure to fill in allocated fail transitions from other states.
3438 This pointer is independent of the main pointer and scans forward
3439 looking for null transitions that are allocated to a state. When it
3440 finds one it writes the single transition into the "hole". If the
3441 pointer doesnt find one the single transition is appended as normal.
3443 - Once compressed we can Renew/realloc the structures to release the
3446 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3447 specifically Fig 3.47 and the associated pseudocode.
3451 const U32 laststate = TRIE_NODENUM( next_alloc );
3454 trie->statecount = laststate;
3456 for ( state = 1 ; state < laststate ; state++ ) {
3458 const U32 stateidx = TRIE_NODEIDX( state );
3459 const U32 o_used = trie->trans[ stateidx ].check;
3460 U32 used = trie->trans[ stateidx ].check;
3461 trie->trans[ stateidx ].check = 0;
3464 used && charid < trie->uniquecharcount;
3467 if ( flag || trie->trans[ stateidx + charid ].next ) {
3468 if ( trie->trans[ stateidx + charid ].next ) {
3470 for ( ; zp < pos ; zp++ ) {
3471 if ( ! trie->trans[ zp ].next ) {
3475 trie->states[ state ].trans.base
3477 + trie->uniquecharcount
3479 trie->trans[ zp ].next
3480 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3482 trie->trans[ zp ].check = state;
3483 if ( ++zp > pos ) pos = zp;
3490 trie->states[ state ].trans.base
3491 = pos + trie->uniquecharcount - charid ;
3493 trie->trans[ pos ].next
3494 = SAFE_TRIE_NODENUM(
3495 trie->trans[ stateidx + charid ].next );
3496 trie->trans[ pos ].check = state;
3501 trie->lasttrans = pos + 1;
3502 trie->states = (reg_trie_state *)
3503 PerlMemShared_realloc( trie->states, laststate
3504 * sizeof(reg_trie_state) );
3505 DEBUG_TRIE_COMPILE_MORE_r(
3506 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3508 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3512 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3515 } /* end table compress */
3517 DEBUG_TRIE_COMPILE_MORE_r(
3518 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3520 (UV)trie->statecount,
3521 (UV)trie->lasttrans)
3523 /* resize the trans array to remove unused space */
3524 trie->trans = (reg_trie_trans *)
3525 PerlMemShared_realloc( trie->trans, trie->lasttrans
3526 * sizeof(reg_trie_trans) );
3528 { /* Modify the program and insert the new TRIE node */
3529 U8 nodetype =(U8)(flags & 0xFF);
3533 regnode *optimize = NULL;
3534 #ifdef RE_TRACK_PATTERN_OFFSETS
3537 U32 mjd_nodelen = 0;
3538 #endif /* RE_TRACK_PATTERN_OFFSETS */
3539 #endif /* DEBUGGING */
3541 This means we convert either the first branch or the first Exact,
3542 depending on whether the thing following (in 'last') is a branch
3543 or not and whther first is the startbranch (ie is it a sub part of
3544 the alternation or is it the whole thing.)
3545 Assuming its a sub part we convert the EXACT otherwise we convert
3546 the whole branch sequence, including the first.
3548 /* Find the node we are going to overwrite */
3549 if ( first != startbranch || OP( last ) == BRANCH ) {
3550 /* branch sub-chain */
3551 NEXT_OFF( first ) = (U16)(last - first);
3552 #ifdef RE_TRACK_PATTERN_OFFSETS
3554 mjd_offset= Node_Offset((convert));
3555 mjd_nodelen= Node_Length((convert));
3558 /* whole branch chain */
3560 #ifdef RE_TRACK_PATTERN_OFFSETS
3563 const regnode *nop = NEXTOPER( convert );
3564 mjd_offset= Node_Offset((nop));
3565 mjd_nodelen= Node_Length((nop));
3569 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3571 (UV)mjd_offset, (UV)mjd_nodelen)
3574 /* But first we check to see if there is a common prefix we can
3575 split out as an EXACT and put in front of the TRIE node. */
3576 trie->startstate= 1;
3577 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3578 /* we want to find the first state that has more than
3579 * one transition, if that state is not the first state
3580 * then we have a common prefix which we can remove.
3583 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3585 I32 first_ofs = -1; /* keeps track of the ofs of the first
3586 transition, -1 means none */
3588 const U32 base = trie->states[ state ].trans.base;
3590 /* does this state terminate an alternation? */
3591 if ( trie->states[state].wordnum )
3594 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3595 if ( ( base + ofs >= trie->uniquecharcount ) &&
3596 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3597 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3599 if ( ++count > 1 ) {
3600 /* we have more than one transition */
3603 /* if this is the first state there is no common prefix
3604 * to extract, so we can exit */
3605 if ( state == 1 ) break;
3606 tmp = av_fetch( revcharmap, ofs, 0);
3607 ch = (U8*)SvPV_nolen_const( *tmp );
3609 /* if we are on count 2 then we need to initialize the
3610 * bitmap, and store the previous char if there was one
3613 /* clear the bitmap */
3614 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3616 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3619 if (first_ofs >= 0) {
3620 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3621 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3623 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3625 Perl_re_printf( aTHX_ "%s", (char*)ch)
3629 /* store the current firstchar in the bitmap */
3630 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3631 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3637 /* This state has only one transition, its transition is part
3638 * of a common prefix - we need to concatenate the char it
3639 * represents to what we have so far. */
3640 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3642 char *ch = SvPV( *tmp, len );
3644 SV *sv=sv_newmortal();
3645 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3647 (UV)state, (UV)first_ofs,
3648 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3649 PL_colors[0], PL_colors[1],
3650 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3651 PERL_PV_ESCAPE_FIRSTCHAR
3656 OP( convert ) = nodetype;
3657 str=STRING(convert);
3658 setSTR_LEN(convert, 0);
3660 assert( ( STR_LEN(convert) + len ) < 256 );
3661 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3667 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3672 trie->prefixlen = (state-1);
3674 regnode *n = convert+NODE_SZ_STR(convert);
3675 assert( NODE_SZ_STR(convert) <= U16_MAX );
3676 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3677 trie->startstate = state;
3678 trie->minlen -= (state - 1);
3679 trie->maxlen -= (state - 1);
3681 /* At least the UNICOS C compiler choked on this
3682 * being argument to DEBUG_r(), so let's just have
3685 #ifdef PERL_EXT_RE_BUILD
3691 regnode *fix = convert;
3692 U32 word = trie->wordcount;
3693 #ifdef RE_TRACK_PATTERN_OFFSETS
3696 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3697 while( ++fix < n ) {
3698 Set_Node_Offset_Length(fix, 0, 0);
3701 SV ** const tmp = av_fetch( trie_words, word, 0 );
3703 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3704 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3706 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3714 NEXT_OFF(convert) = (U16)(tail - convert);
3715 DEBUG_r(optimize= n);
3721 if ( trie->maxlen ) {
3722 NEXT_OFF( convert ) = (U16)(tail - convert);
3723 ARG_SET( convert, data_slot );
3724 /* Store the offset to the first unabsorbed branch in
3725 jump[0], which is otherwise unused by the jump logic.
3726 We use this when dumping a trie and during optimisation. */
3728 trie->jump[0] = (U16)(nextbranch - convert);
3730 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3731 * and there is a bitmap
3732 * and the first "jump target" node we found leaves enough room
3733 * then convert the TRIE node into a TRIEC node, with the bitmap
3734 * embedded inline in the opcode - this is hypothetically faster.
3736 if ( !trie->states[trie->startstate].wordnum
3738 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3740 OP( convert ) = TRIEC;
3741 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3742 PerlMemShared_free(trie->bitmap);
3745 OP( convert ) = TRIE;
3747 /* store the type in the flags */
3748 convert->flags = nodetype;
3752 + regarglen[ OP( convert ) ];
3754 /* XXX We really should free up the resource in trie now,
3755 as we won't use them - (which resources?) dmq */
3757 /* needed for dumping*/
3758 DEBUG_r(if (optimize) {
3759 regnode *opt = convert;
3761 while ( ++opt < optimize) {
3762 Set_Node_Offset_Length(opt, 0, 0);
3765 Try to clean up some of the debris left after the
3768 while( optimize < jumper ) {
3769 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3770 OP( optimize ) = OPTIMIZED;
3771 Set_Node_Offset_Length(optimize, 0, 0);
3774 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3776 } /* end node insert */
3778 /* Finish populating the prev field of the wordinfo array. Walk back
3779 * from each accept state until we find another accept state, and if
3780 * so, point the first word's .prev field at the second word. If the
3781 * second already has a .prev field set, stop now. This will be the
3782 * case either if we've already processed that word's accept state,
3783 * or that state had multiple words, and the overspill words were
3784 * already linked up earlier.
3791 for (word=1; word <= trie->wordcount; word++) {
3793 if (trie->wordinfo[word].prev)
3795 state = trie->wordinfo[word].accept;
3797 state = prev_states[state];
3800 prev = trie->states[state].wordnum;
3804 trie->wordinfo[word].prev = prev;
3806 Safefree(prev_states);
3810 /* and now dump out the compressed format */
3811 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3813 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3815 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3816 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3818 SvREFCNT_dec_NN(revcharmap);
3822 : trie->startstate>1
3828 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3830 /* The Trie is constructed and compressed now so we can build a fail array if
3833 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3835 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3839 We find the fail state for each state in the trie, this state is the longest
3840 proper suffix of the current state's 'word' that is also a proper prefix of
3841 another word in our trie. State 1 represents the word '' and is thus the
3842 default fail state. This allows the DFA not to have to restart after its
3843 tried and failed a word at a given point, it simply continues as though it
3844 had been matching the other word in the first place.
3846 'abcdgu'=~/abcdefg|cdgu/
3847 When we get to 'd' we are still matching the first word, we would encounter
3848 'g' which would fail, which would bring us to the state representing 'd' in
3849 the second word where we would try 'g' and succeed, proceeding to match
3852 /* add a fail transition */
3853 const U32 trie_offset = ARG(source);
3854 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3856 const U32 ucharcount = trie->uniquecharcount;
3857 const U32 numstates = trie->statecount;
3858 const U32 ubound = trie->lasttrans + ucharcount;
3862 U32 base = trie->states[ 1 ].trans.base;
3865 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3867 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3869 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3870 PERL_UNUSED_CONTEXT;
3872 PERL_UNUSED_ARG(depth);
3875 if ( OP(source) == TRIE ) {
3876 struct regnode_1 *op = (struct regnode_1 *)
3877 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3878 StructCopy(source, op, struct regnode_1);
3879 stclass = (regnode *)op;
3881 struct regnode_charclass *op = (struct regnode_charclass *)
3882 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3883 StructCopy(source, op, struct regnode_charclass);
3884 stclass = (regnode *)op;
3886 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3888 ARG_SET( stclass, data_slot );
3889 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3890 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3891 aho->trie=trie_offset;
3892 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3893 Copy( trie->states, aho->states, numstates, reg_trie_state );
3894 Newx( q, numstates, U32);
3895 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3898 /* initialize fail[0..1] to be 1 so that we always have
3899 a valid final fail state */
3900 fail[ 0 ] = fail[ 1 ] = 1;
3902 for ( charid = 0; charid < ucharcount ; charid++ ) {
3903 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3905 q[ q_write ] = newstate;
3906 /* set to point at the root */
3907 fail[ q[ q_write++ ] ]=1;
3910 while ( q_read < q_write) {
3911 const U32 cur = q[ q_read++ % numstates ];
3912 base = trie->states[ cur ].trans.base;
3914 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3915 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3917 U32 fail_state = cur;
3920 fail_state = fail[ fail_state ];
3921 fail_base = aho->states[ fail_state ].trans.base;
3922 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3924 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3925 fail[ ch_state ] = fail_state;
3926 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3928 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3930 q[ q_write++ % numstates] = ch_state;
3934 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3935 when we fail in state 1, this allows us to use the
3936 charclass scan to find a valid start char. This is based on the principle
3937 that theres a good chance the string being searched contains lots of stuff
3938 that cant be a start char.
3940 fail[ 0 ] = fail[ 1 ] = 0;
3941 DEBUG_TRIE_COMPILE_r({
3942 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3943 depth, (UV)numstates
3945 for( q_read=1; q_read<numstates; q_read++ ) {
3946 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3948 Perl_re_printf( aTHX_ "\n");
3951 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3956 /* The below joins as many adjacent EXACTish nodes as possible into a single
3957 * one. The regop may be changed if the node(s) contain certain sequences that
3958 * require special handling. The joining is only done if:
3959 * 1) there is room in the current conglomerated node to entirely contain the
3961 * 2) they are compatible node types
3963 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3964 * these get optimized out
3966 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3967 * as possible, even if that means splitting an existing node so that its first
3968 * part is moved to the preceeding node. This would maximise the efficiency of
3969 * memEQ during matching.
3971 * If a node is to match under /i (folded), the number of characters it matches
3972 * can be different than its character length if it contains a multi-character
3973 * fold. *min_subtract is set to the total delta number of characters of the
3976 * And *unfolded_multi_char is set to indicate whether or not the node contains
3977 * an unfolded multi-char fold. This happens when it won't be known until
3978 * runtime whether the fold is valid or not; namely
3979 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3980 * target string being matched against turns out to be UTF-8 is that fold
3982 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3984 * (Multi-char folds whose components are all above the Latin1 range are not
3985 * run-time locale dependent, and have already been folded by the time this
3986 * function is called.)
3988 * This is as good a place as any to discuss the design of handling these
3989 * multi-character fold sequences. It's been wrong in Perl for a very long
3990 * time. There are three code points in Unicode whose multi-character folds
3991 * were long ago discovered to mess things up. The previous designs for
3992 * dealing with these involved assigning a special node for them. This
3993 * approach doesn't always work, as evidenced by this example:
3994 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3995 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3996 * would match just the \xDF, it won't be able to handle the case where a
3997 * successful match would have to cross the node's boundary. The new approach
3998 * that hopefully generally solves the problem generates an EXACTFUP node
3999 * that is "sss" in this case.
4001 * It turns out that there are problems with all multi-character folds, and not
4002 * just these three. Now the code is general, for all such cases. The
4003 * approach taken is:
4004 * 1) This routine examines each EXACTFish node that could contain multi-
4005 * character folded sequences. Since a single character can fold into
4006 * such a sequence, the minimum match length for this node is less than
4007 * the number of characters in the node. This routine returns in
4008 * *min_subtract how many characters to subtract from the the actual
4009 * length of the string to get a real minimum match length; it is 0 if
4010 * there are no multi-char foldeds. This delta is used by the caller to
4011 * adjust the min length of the match, and the delta between min and max,
4012 * so that the optimizer doesn't reject these possibilities based on size
4015 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4016 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4017 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4018 * EXACTFU nodes. The node type of such nodes is then changed to
4019 * EXACTFUP, indicating it is problematic, and needs careful handling.
4020 * (The procedures in step 1) above are sufficient to handle this case in
4021 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4022 * the only case where there is a possible fold length change in non-UTF-8
4023 * patterns. By reserving a special node type for problematic cases, the
4024 * far more common regular EXACTFU nodes can be processed faster.
4025 * regexec.c takes advantage of this.
4027 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4028 * problematic cases. These all only occur when the pattern is not
4029 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4030 * length change, it handles the situation where the string cannot be
4031 * entirely folded. The strings in an EXACTFish node are folded as much
4032 * as possible during compilation in regcomp.c. This saves effort in
4033 * regex matching. By using an EXACTFUP node when it is not possible to
4034 * fully fold at compile time, regexec.c can know that everything in an
4035 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4036 * case where folding in EXACTFU nodes can't be done at compile time is
4037 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4038 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4039 * handle two very different cases. Alternatively, there could have been
4040 * a node type where there are length changes, one for unfolded, and one
4041 * for both. If yet another special case needed to be created, the number
4042 * of required node types would have to go to 7. khw figures that even
4043 * though there are plenty of node types to spare, that the maintenance
4044 * cost wasn't worth the small speedup of doing it that way, especially
4045 * since he thinks the MICRO SIGN is rarely encountered in practice.
4047 * There are other cases where folding isn't done at compile time, but
4048 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4049 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4050 * changes. Some folds in EXACTF depend on if the runtime target string
4051 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4052 * when no fold in it depends on the UTF-8ness of the target string.)
4054 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4055 * validity of the fold won't be known until runtime, and so must remain
4056 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4057 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4058 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4059 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4060 * The reason this is a problem is that the optimizer part of regexec.c
4061 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4062 * that a character in the pattern corresponds to at most a single
4063 * character in the target string. (And I do mean character, and not byte
4064 * here, unlike other parts of the documentation that have never been
4065 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4066 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4067 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4068 * EXACTFL nodes, violate the assumption, and they are the only instances
4069 * where it is violated. I'm reluctant to try to change the assumption,
4070 * as the code involved is impenetrable to me (khw), so instead the code
4071 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4072 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4073 * boolean indicating whether or not the node contains such a fold. When
4074 * it is true, the caller sets a flag that later causes the optimizer in
4075 * this file to not set values for the floating and fixed string lengths,
4076 * and thus avoids the optimizer code in regexec.c that makes the invalid
4077 * assumption. Thus, there is no optimization based on string lengths for
4078 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4079 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4080 * assumption is wrong only in these cases is that all other non-UTF-8
4081 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4082 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4083 * EXACTF nodes because we don't know at compile time if it actually
4084 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4085 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4086 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4087 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4088 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4089 * string would require the pattern to be forced into UTF-8, the overhead
4090 * of which we want to avoid. Similarly the unfolded multi-char folds in
4091 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4094 * Similarly, the code that generates tries doesn't currently handle
4095 * not-already-folded multi-char folds, and it looks like a pain to change
4096 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4097 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4098 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4099 * using /iaa matching will be doing so almost entirely with ASCII
4100 * strings, so this should rarely be encountered in practice */
4103 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4104 UV *min_subtract, bool *unfolded_multi_char,
4105 U32 flags, regnode *val, U32 depth)
4107 /* Merge several consecutive EXACTish nodes into one. */
4109 regnode *n = regnext(scan);
4111 regnode *next = scan + NODE_SZ_STR(scan);
4115 regnode *stop = scan;
4116 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4118 PERL_UNUSED_ARG(depth);
4121 PERL_ARGS_ASSERT_JOIN_EXACT;
4122 #ifndef EXPERIMENTAL_INPLACESCAN
4123 PERL_UNUSED_ARG(flags);
4124 PERL_UNUSED_ARG(val);
4126 DEBUG_PEEP("join", scan, depth, 0);
4128 assert(PL_regkind[OP(scan)] == EXACT);
4130 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4131 * EXACT ones that are mergeable to the current one. */
4133 && ( PL_regkind[OP(n)] == NOTHING
4134 || (stringok && PL_regkind[OP(n)] == EXACT))
4136 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4139 if (OP(n) == TAIL || n > next)
4141 if (PL_regkind[OP(n)] == NOTHING) {
4142 DEBUG_PEEP("skip:", n, depth, 0);
4143 NEXT_OFF(scan) += NEXT_OFF(n);
4144 next = n + NODE_STEP_REGNODE;
4151 else if (stringok) {
4152 const unsigned int oldl = STR_LEN(scan);
4153 regnode * const nnext = regnext(n);
4155 /* XXX I (khw) kind of doubt that this works on platforms (should
4156 * Perl ever run on one) where U8_MAX is above 255 because of lots
4157 * of other assumptions */
4158 /* Don't join if the sum can't fit into a single node */
4159 if (oldl + STR_LEN(n) > U8_MAX)
4162 /* Joining something that requires UTF-8 with something that
4163 * doesn't, means the result requires UTF-8. */
4164 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4165 OP(scan) = EXACT_REQ8;
4167 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4168 ; /* join is compatible, no need to change OP */
4170 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4171 OP(scan) = EXACTFU_REQ8;
4173 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4174 ; /* join is compatible, no need to change OP */
4176 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4177 ; /* join is compatible, no need to change OP */
4179 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4181 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4182 * which can join with EXACTFU ones. We check for this case
4183 * here. These need to be resolved to either EXACTFU or
4184 * EXACTF at joining time. They have nothing in them that
4185 * would forbid them from being the more desirable EXACTFU
4186 * nodes except that they begin and/or end with a single [Ss].
4187 * The reason this is problematic is because they could be
4188 * joined in this loop with an adjacent node that ends and/or
4189 * begins with [Ss] which would then form the sequence 'ss',
4190 * which matches differently under /di than /ui, in which case
4191 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4192 * formed, the nodes get absorbed into any adjacent EXACTFU
4193 * node. And if the only adjacent node is EXACTF, they get
4194 * absorbed into that, under the theory that a longer node is
4195 * better than two shorter ones, even if one is EXACTFU. Note
4196 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4197 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4199 if (STRING(n)[STR_LEN(n)-1] == 's') {
4201 /* Here the joined node would end with 's'. If the node
4202 * following the combination is an EXACTF one, it's better to
4203 * join this trailing edge 's' node with that one, leaving the
4204 * current one in 'scan' be the more desirable EXACTFU */
4205 if (OP(nnext) == EXACTF) {
4209 OP(scan) = EXACTFU_S_EDGE;
4211 } /* Otherwise, the beginning 's' of the 2nd node just
4212 becomes an interior 's' in 'scan' */
4214 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4215 ; /* join is compatible, no need to change OP */
4217 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4219 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4220 * nodes. But the latter nodes can be also joined with EXACTFU
4221 * ones, and that is a better outcome, so if the node following
4222 * 'n' is EXACTFU, quit now so that those two can be joined
4224 if (OP(nnext) == EXACTFU) {
4228 /* The join is compatible, and the combined node will be
4229 * EXACTF. (These don't care if they begin or end with 's' */
4231 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4232 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4233 && STRING(n)[0] == 's')
4235 /* When combined, we have the sequence 'ss', which means we
4236 * have to remain /di */
4240 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4241 if (STRING(n)[0] == 's') {
4242 ; /* Here the join is compatible and the combined node
4243 starts with 's', no need to change OP */
4245 else { /* Now the trailing 's' is in the interior */
4249 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4251 /* The join is compatible, and the combined node will be
4252 * EXACTF. (These don't care if they begin or end with 's' */
4255 else if (OP(scan) != OP(n)) {
4257 /* The only other compatible joinings are the same node type */
4261 DEBUG_PEEP("merg", n, depth, 0);
4264 NEXT_OFF(scan) += NEXT_OFF(n);
4265 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4266 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4267 next = n + NODE_SZ_STR(n);
4268 /* Now we can overwrite *n : */
4269 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4277 #ifdef EXPERIMENTAL_INPLACESCAN
4278 if (flags && !NEXT_OFF(n)) {
4279 DEBUG_PEEP("atch", val, depth, 0);
4280 if (reg_off_by_arg[OP(n)]) {
4281 ARG_SET(n, val - n);
4284 NEXT_OFF(n) = val - n;
4291 /* This temporary node can now be turned into EXACTFU, and must, as
4292 * regexec.c doesn't handle it */
4293 if (OP(scan) == EXACTFU_S_EDGE) {
4298 *unfolded_multi_char = FALSE;
4300 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4301 * can now analyze for sequences of problematic code points. (Prior to
4302 * this final joining, sequences could have been split over boundaries, and
4303 * hence missed). The sequences only happen in folding, hence for any
4304 * non-EXACT EXACTish node */
4305 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4306 U8* s0 = (U8*) STRING(scan);
4308 U8* s_end = s0 + STR_LEN(scan);
4310 int total_count_delta = 0; /* Total delta number of characters that
4311 multi-char folds expand to */
4313 /* One pass is made over the node's string looking for all the
4314 * possibilities. To avoid some tests in the loop, there are two main
4315 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4320 if (OP(scan) == EXACTFL) {
4323 /* An EXACTFL node would already have been changed to another
4324 * node type unless there is at least one character in it that
4325 * is problematic; likely a character whose fold definition
4326 * won't be known until runtime, and so has yet to be folded.
4327 * For all but the UTF-8 locale, folds are 1-1 in length, but
4328 * to handle the UTF-8 case, we need to create a temporary
4329 * folded copy using UTF-8 locale rules in order to analyze it.
4330 * This is because our macros that look to see if a sequence is
4331 * a multi-char fold assume everything is folded (otherwise the
4332 * tests in those macros would be too complicated and slow).
4333 * Note that here, the non-problematic folds will have already
4334 * been done, so we can just copy such characters. We actually
4335 * don't completely fold the EXACTFL string. We skip the
4336 * unfolded multi-char folds, as that would just create work
4337 * below to figure out the size they already are */
4339 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4342 STRLEN s_len = UTF8SKIP(s);
4343 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4344 Copy(s, d, s_len, U8);
4347 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4348 *unfolded_multi_char = TRUE;
4349 Copy(s, d, s_len, U8);
4352 else if (isASCII(*s)) {
4353 *(d++) = toFOLD(*s);
4357 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4363 /* Point the remainder of the routine to look at our temporary
4367 } /* End of creating folded copy of EXACTFL string */
4369 /* Examine the string for a multi-character fold sequence. UTF-8
4370 * patterns have all characters pre-folded by the time this code is
4372 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4373 length sequence we are looking for is 2 */
4375 int count = 0; /* How many characters in a multi-char fold */
4376 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4377 if (! len) { /* Not a multi-char fold: get next char */
4382 { /* Here is a generic multi-char fold. */
4383 U8* multi_end = s + len;
4385 /* Count how many characters are in it. In the case of
4386 * /aa, no folds which contain ASCII code points are
4387 * allowed, so check for those, and skip if found. */
4388 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4389 count = utf8_length(s, multi_end);
4393 while (s < multi_end) {
4396 goto next_iteration;
4406 /* The delta is how long the sequence is minus 1 (1 is how long
4407 * the character that folds to the sequence is) */
4408 total_count_delta += count - 1;
4412 /* We created a temporary folded copy of the string in EXACTFL
4413 * nodes. Therefore we need to be sure it doesn't go below zero,
4414 * as the real string could be shorter */
4415 if (OP(scan) == EXACTFL) {
4416 int total_chars = utf8_length((U8*) STRING(scan),
4417 (U8*) STRING(scan) + STR_LEN(scan));
4418 if (total_count_delta > total_chars) {
4419 total_count_delta = total_chars;
4423 *min_subtract += total_count_delta;
4426 else if (OP(scan) == EXACTFAA) {
4428 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4429 * fold to the ASCII range (and there are no existing ones in the
4430 * upper latin1 range). But, as outlined in the comments preceding
4431 * this function, we need to flag any occurrences of the sharp s.
4432 * This character forbids trie formation (because of added
4434 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4435 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4436 || UNICODE_DOT_DOT_VERSION > 0)
4438 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4439 OP(scan) = EXACTFAA_NO_TRIE;
4440 *unfolded_multi_char = TRUE;
4446 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4448 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4449 * folds that are all Latin1. As explained in the comments
4450 * preceding this function, we look also for the sharp s in EXACTF
4451 * and EXACTFL nodes; it can be in the final position. Otherwise
4452 * we can stop looking 1 byte earlier because have to find at least
4453 * two characters for a multi-fold */
4454 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4459 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4460 if (! len) { /* Not a multi-char fold. */
4461 if (*s == LATIN_SMALL_LETTER_SHARP_S
4462 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4464 *unfolded_multi_char = TRUE;
4471 && isALPHA_FOLD_EQ(*s, 's')
4472 && isALPHA_FOLD_EQ(*(s+1), 's'))
4475 /* EXACTF nodes need to know that the minimum length
4476 * changed so that a sharp s in the string can match this
4477 * ss in the pattern, but they remain EXACTF nodes, as they
4478 * won't match this unless the target string is is UTF-8,
4479 * which we don't know until runtime. EXACTFL nodes can't
4480 * transform into EXACTFU nodes */
4481 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4482 OP(scan) = EXACTFUP;
4486 *min_subtract += len - 1;
4494 /* Allow dumping but overwriting the collection of skipped
4495 * ops and/or strings with fake optimized ops */
4496 n = scan + NODE_SZ_STR(scan);
4504 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4508 /* REx optimizer. Converts nodes into quicker variants "in place".
4509 Finds fixed substrings. */
4511 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4512 to the position after last scanned or to NULL. */
4514 #define INIT_AND_WITHP \
4515 assert(!and_withp); \
4516 Newx(and_withp, 1, regnode_ssc); \
4517 SAVEFREEPV(and_withp)
4521 S_unwind_scan_frames(pTHX_ const void *p)
4523 scan_frame *f= (scan_frame *)p;
4525 scan_frame *n= f->next_frame;
4531 /* the return from this sub is the minimum length that could possibly match */
4533 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4534 SSize_t *minlenp, SSize_t *deltap,
4539 regnode_ssc *and_withp,
4540 U32 flags, U32 depth)
4541 /* scanp: Start here (read-write). */
4542 /* deltap: Write maxlen-minlen here. */
4543 /* last: Stop before this one. */
4544 /* data: string data about the pattern */
4545 /* stopparen: treat close N as END */
4546 /* recursed: which subroutines have we recursed into */
4547 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4550 SSize_t final_minlen;
4551 /* There must be at least this number of characters to match */
4554 regnode *scan = *scanp, *next;
4556 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4557 int is_inf_internal = 0; /* The studied chunk is infinite */
4558 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4559 scan_data_t data_fake;
4560 SV *re_trie_maxbuff = NULL;
4561 regnode *first_non_open = scan;
4562 SSize_t stopmin = OPTIMIZE_INFTY;
4563 scan_frame *frame = NULL;
4564 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4566 PERL_ARGS_ASSERT_STUDY_CHUNK;
4567 RExC_study_started= 1;
4569 Zero(&data_fake, 1, scan_data_t);
4572 while (first_non_open && OP(first_non_open) == OPEN)
4573 first_non_open=regnext(first_non_open);
4579 RExC_study_chunk_recursed_count++;
4581 DEBUG_OPTIMISE_MORE_r(
4583 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4584 depth, (long)stopparen,
4585 (unsigned long)RExC_study_chunk_recursed_count,
4586 (unsigned long)depth, (unsigned long)recursed_depth,
4589 if (recursed_depth) {
4592 for ( j = 0 ; j < recursed_depth ; j++ ) {
4593 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4594 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4595 Perl_re_printf( aTHX_ " %d",(int)i);
4599 if ( j + 1 < recursed_depth ) {
4600 Perl_re_printf( aTHX_ ",");
4604 Perl_re_printf( aTHX_ "\n");
4607 while ( scan && OP(scan) != END && scan < last ){
4608 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4609 node length to get a real minimum (because
4610 the folded version may be shorter) */
4611 bool unfolded_multi_char = FALSE;
4612 bool mutate_ok = (frame && frame->in_gosub) ? 0 : 1;
4613 /* Peephole optimizer: */
4614 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4615 DEBUG_PEEP("Peep", scan, depth, flags);
4618 /* The reason we do this here is that we need to deal with things like
4619 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4620 * parsing code, as each (?:..) is handled by a different invocation of
4623 if (PL_regkind[OP(scan)] == EXACT
4624 && OP(scan) != LEXACT
4625 && OP(scan) != LEXACT_REQ8
4628 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4629 0, NULL, depth + 1);
4632 /* Follow the next-chain of the current node and optimize
4633 away all the NOTHINGs from it. */
4634 if (OP(scan) != CURLYX) {
4635 const int max = (reg_off_by_arg[OP(scan)]
4637 /* I32 may be smaller than U16 on CRAYs! */
4638 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4639 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4643 /* Skip NOTHING and LONGJMP. */
4644 while ( (n = regnext(n))
4645 && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4646 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4647 && off + noff < max)
4649 if (reg_off_by_arg[OP(scan)])
4652 NEXT_OFF(scan) = off;
4655 /* The principal pseudo-switch. Cannot be a switch, since we look into
4656 * several different things. */
4657 if ( OP(scan) == DEFINEP ) {
4659 SSize_t deltanext = 0;
4660 SSize_t fake_last_close = 0;
4661 I32 f = SCF_IN_DEFINE;
4663 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4664 scan = regnext(scan);
4665 assert( OP(scan) == IFTHEN );
4666 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4668 data_fake.last_closep= &fake_last_close;
4670 next = regnext(scan);
4671 scan = NEXTOPER(NEXTOPER(scan));
4672 DEBUG_PEEP("scan", scan, depth, flags);
4673 DEBUG_PEEP("next", next, depth, flags);
4675 /* we suppose the run is continuous, last=next...
4676 * NOTE we dont use the return here! */
4677 /* DEFINEP study_chunk() recursion */
4678 (void)study_chunk(pRExC_state, &scan, &minlen,
4679 &deltanext, next, &data_fake, stopparen,
4680 recursed_depth, NULL, f, depth+1);
4685 OP(scan) == BRANCH ||
4686 OP(scan) == BRANCHJ ||
4689 next = regnext(scan);
4692 /* The op(next)==code check below is to see if we
4693 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4694 * IFTHEN is special as it might not appear in pairs.
4695 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4696 * we dont handle it cleanly. */
4697 if (OP(next) == code || code == IFTHEN) {
4698 /* NOTE - There is similar code to this block below for
4699 * handling TRIE nodes on a re-study. If you change stuff here
4700 * check there too. */
4701 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4703 regnode * const startbranch=scan;
4705 if (flags & SCF_DO_SUBSTR) {
4706 /* Cannot merge strings after this. */
4707 scan_commit(pRExC_state, data, minlenp, is_inf);
4710 if (flags & SCF_DO_STCLASS)
4711 ssc_init_zero(pRExC_state, &accum);
4713 while (OP(scan) == code) {
4714 SSize_t deltanext, minnext, fake;
4716 regnode_ssc this_class;
4718 DEBUG_PEEP("Branch", scan, depth, flags);
4721 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4723 data_fake.whilem_c = data->whilem_c;
4724 data_fake.last_closep = data->last_closep;
4727 data_fake.last_closep = &fake;
4729 data_fake.pos_delta = delta;
4730 next = regnext(scan);
4732 scan = NEXTOPER(scan); /* everything */
4733 if (code != BRANCH) /* everything but BRANCH */
4734 scan = NEXTOPER(scan);
4736 if (flags & SCF_DO_STCLASS) {
4737 ssc_init(pRExC_state, &this_class);
4738 data_fake.start_class = &this_class;
4739 f = SCF_DO_STCLASS_AND;
4741 if (flags & SCF_WHILEM_VISITED_POS)
4742 f |= SCF_WHILEM_VISITED_POS;
4744 /* we suppose the run is continuous, last=next...*/
4745 /* recurse study_chunk() for each BRANCH in an alternation */
4746 minnext = study_chunk(pRExC_state, &scan, minlenp,
4747 &deltanext, next, &data_fake, stopparen,
4748 recursed_depth, NULL, f, depth+1);
4752 if (deltanext == OPTIMIZE_INFTY) {
4753 is_inf = is_inf_internal = 1;
4754 max1 = OPTIMIZE_INFTY;
4755 } else if (max1 < minnext + deltanext)
4756 max1 = minnext + deltanext;
4758 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4760 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4761 if ( stopmin > minnext)
4762 stopmin = min + min1;
4763 flags &= ~SCF_DO_SUBSTR;
4765 data->flags |= SCF_SEEN_ACCEPT;
4768 if (data_fake.flags & SF_HAS_EVAL)
4769 data->flags |= SF_HAS_EVAL;
4770 data->whilem_c = data_fake.whilem_c;
4772 if (flags & SCF_DO_STCLASS)
4773 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4775 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4777 if (flags & SCF_DO_SUBSTR) {
4778 data->pos_min += min1;
4779 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4780 data->pos_delta = OPTIMIZE_INFTY;
4782 data->pos_delta += max1 - min1;
4783 if (max1 != min1 || is_inf)
4784 data->cur_is_floating = 1;
4787 if (delta == OPTIMIZE_INFTY
4788 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4789 delta = OPTIMIZE_INFTY;
4791 delta += max1 - min1;
4792 if (flags & SCF_DO_STCLASS_OR) {
4793 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4795 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4796 flags &= ~SCF_DO_STCLASS;
4799 else if (flags & SCF_DO_STCLASS_AND) {
4801 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4802 flags &= ~SCF_DO_STCLASS;
4805 /* Switch to OR mode: cache the old value of
4806 * data->start_class */
4808 StructCopy(data->start_class, and_withp, regnode_ssc);
4809 flags &= ~SCF_DO_STCLASS_AND;
4810 StructCopy(&accum, data->start_class, regnode_ssc);
4811 flags |= SCF_DO_STCLASS_OR;
4815 if (PERL_ENABLE_TRIE_OPTIMISATION
4816 && OP(startbranch) == BRANCH
4821 Assuming this was/is a branch we are dealing with: 'scan'
4822 now points at the item that follows the branch sequence,
4823 whatever it is. We now start at the beginning of the
4824 sequence and look for subsequences of
4830 which would be constructed from a pattern like
4833 If we can find such a subsequence we need to turn the first
4834 element into a trie and then add the subsequent branch exact
4835 strings to the trie.
4839 1. patterns where the whole set of branches can be
4842 2. patterns where only a subset can be converted.
4844 In case 1 we can replace the whole set with a single regop
4845 for the trie. In case 2 we need to keep the start and end
4848 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4849 becomes BRANCH TRIE; BRANCH X;
4851 There is an additional case, that being where there is a
4852 common prefix, which gets split out into an EXACT like node
4853 preceding the TRIE node.
4855 If x(1..n)==tail then we can do a simple trie, if not we make
4856 a "jump" trie, such that when we match the appropriate word
4857 we "jump" to the appropriate tail node. Essentially we turn
4858 a nested if into a case structure of sorts.
4863 if (!re_trie_maxbuff) {
4864 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4865 if (!SvIOK(re_trie_maxbuff))
4866 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4868 if ( SvIV(re_trie_maxbuff)>=0 ) {
4870 regnode *first = (regnode *)NULL;
4871 regnode *prev = (regnode *)NULL;
4872 regnode *tail = scan;
4876 /* var tail is used because there may be a TAIL
4877 regop in the way. Ie, the exacts will point to the
4878 thing following the TAIL, but the last branch will
4879 point at the TAIL. So we advance tail. If we
4880 have nested (?:) we may have to move through several
4884 while ( OP( tail ) == TAIL ) {
4885 /* this is the TAIL generated by (?:) */
4886 tail = regnext( tail );
4890 DEBUG_TRIE_COMPILE_r({
4891 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4892 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4894 "Looking for TRIE'able sequences. Tail node is ",
4895 (UV) REGNODE_OFFSET(tail),
4896 SvPV_nolen_const( RExC_mysv )
4902 Step through the branches
4903 cur represents each branch,
4904 noper is the first thing to be matched as part
4906 noper_next is the regnext() of that node.
4908 We normally handle a case like this
4909 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4910 support building with NOJUMPTRIE, which restricts
4911 the trie logic to structures like /FOO|BAR/.
4913 If noper is a trieable nodetype then the branch is
4914 a possible optimization target. If we are building
4915 under NOJUMPTRIE then we require that noper_next is
4916 the same as scan (our current position in the regex
4919 Once we have two or more consecutive such branches
4920 we can create a trie of the EXACT's contents and
4921 stitch it in place into the program.
4923 If the sequence represents all of the branches in
4924 the alternation we replace the entire thing with a
4927 Otherwise when it is a subsequence we need to
4928 stitch it in place and replace only the relevant
4929 branches. This means the first branch has to remain
4930 as it is used by the alternation logic, and its
4931 next pointer, and needs to be repointed at the item
4932 on the branch chain following the last branch we
4933 have optimized away.
4935 This could be either a BRANCH, in which case the
4936 subsequence is internal, or it could be the item
4937 following the branch sequence in which case the
4938 subsequence is at the end (which does not
4939 necessarily mean the first node is the start of the
4942 TRIE_TYPE(X) is a define which maps the optype to a
4946 ----------------+-----------
4951 EXACTFU_REQ8 | EXACTFU
4955 EXACTFLU8 | EXACTFLU8
4959 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4961 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4963 : ( EXACTFU == (X) \
4964 || EXACTFU_REQ8 == (X) \
4965 || EXACTFUP == (X) ) \
4967 : ( EXACTFAA == (X) ) \
4969 : ( EXACTL == (X) ) \
4971 : ( EXACTFLU8 == (X) ) \
4975 /* dont use tail as the end marker for this traverse */
4976 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4977 regnode * const noper = NEXTOPER( cur );
4978 U8 noper_type = OP( noper );
4979 U8 noper_trietype = TRIE_TYPE( noper_type );
4980 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4981 regnode * const noper_next = regnext( noper );
4982 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4983 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4986 DEBUG_TRIE_COMPILE_r({
4987 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4988 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4990 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4992 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4993 Perl_re_printf( aTHX_ " -> %d:%s",
4994 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4997 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4998 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4999 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5001 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5002 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5003 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5007 /* Is noper a trieable nodetype that can be merged
5008 * with the current trie (if there is one)? */
5012 ( noper_trietype == NOTHING )
5013 || ( trietype == NOTHING )
5014 || ( trietype == noper_trietype )
5017 && noper_next >= tail
5021 /* Handle mergable triable node Either we are
5022 * the first node in a new trieable sequence,
5023 * in which case we do some bookkeeping,
5024 * otherwise we update the end pointer. */
5027 if ( noper_trietype == NOTHING ) {
5028 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5029 regnode * const noper_next = regnext( noper );
5030 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5031 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5034 if ( noper_next_trietype ) {
5035 trietype = noper_next_trietype;
5036 } else if (noper_next_type) {
5037 /* a NOTHING regop is 1 regop wide.
5038 * We need at least two for a trie
5039 * so we can't merge this in */
5043 trietype = noper_trietype;
5046 if ( trietype == NOTHING )
5047 trietype = noper_trietype;
5052 } /* end handle mergable triable node */
5054 /* handle unmergable node -
5055 * noper may either be a triable node which can
5056 * not be tried together with the current trie,
5057 * or a non triable node */
5059 /* If last is set and trietype is not
5060 * NOTHING then we have found at least two
5061 * triable branch sequences in a row of a
5062 * similar trietype so we can turn them
5063 * into a trie. If/when we allow NOTHING to
5064 * start a trie sequence this condition
5065 * will be required, and it isn't expensive
5066 * so we leave it in for now. */
5067 if ( trietype && trietype != NOTHING )
5068 make_trie( pRExC_state,
5069 startbranch, first, cur, tail,
5070 count, trietype, depth+1 );
5071 prev = NULL; /* note: we clear/update
5072 first, trietype etc below,
5073 so we dont do it here */
5077 && noper_next >= tail
5080 /* noper is triable, so we can start a new
5084 trietype = noper_trietype;
5086 /* if we already saw a first but the
5087 * current node is not triable then we have
5088 * to reset the first information. */
5093 } /* end handle unmergable node */
5094 } /* loop over branches */
5095 DEBUG_TRIE_COMPILE_r({
5096 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5097 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5098 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5099 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5100 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5101 PL_reg_name[trietype]
5105 if ( prev && trietype ) {
5106 if ( trietype != NOTHING ) {
5107 /* the last branch of the sequence was part of
5108 * a trie, so we have to construct it here
5109 * outside of the loop */
5110 made= make_trie( pRExC_state, startbranch,
5111 first, scan, tail, count,
5112 trietype, depth+1 );
5113 #ifdef TRIE_STUDY_OPT
5114 if ( ((made == MADE_EXACT_TRIE &&
5115 startbranch == first)
5116 || ( first_non_open == first )) &&
5118 flags |= SCF_TRIE_RESTUDY;
5119 if ( startbranch == first
5122 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5127 /* at this point we know whatever we have is a
5128 * NOTHING sequence/branch AND if 'startbranch'
5129 * is 'first' then we can turn the whole thing
5132 if ( startbranch == first ) {
5134 /* the entire thing is a NOTHING sequence,
5135 * something like this: (?:|) So we can
5136 * turn it into a plain NOTHING op. */
5137 DEBUG_TRIE_COMPILE_r({
5138 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5139 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5141 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5144 OP(startbranch)= NOTHING;
5145 NEXT_OFF(startbranch)= tail - startbranch;
5146 for ( opt= startbranch + 1; opt < tail ; opt++ )
5150 } /* end if ( prev) */
5151 } /* TRIE_MAXBUF is non zero */
5155 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5156 scan = NEXTOPER(NEXTOPER(scan));
5157 } else /* single branch is optimized. */
5158 scan = NEXTOPER(scan);
5160 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5162 regnode *start = NULL;
5163 regnode *end = NULL;
5164 U32 my_recursed_depth= recursed_depth;
5166 if (OP(scan) != SUSPEND) { /* GOSUB */
5167 /* Do setup, note this code has side effects beyond
5168 * the rest of this block. Specifically setting
5169 * RExC_recurse[] must happen at least once during
5172 RExC_recurse[ARG2L(scan)] = scan;
5173 start = REGNODE_p(RExC_open_parens[paren]);
5174 end = REGNODE_p(RExC_close_parens[paren]);
5176 /* NOTE we MUST always execute the above code, even
5177 * if we do nothing with a GOSUB */
5179 ( flags & SCF_IN_DEFINE )
5182 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5184 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5187 /* no need to do anything here if we are in a define. */
5188 /* or we are after some kind of infinite construct
5189 * so we can skip recursing into this item.
5190 * Since it is infinite we will not change the maxlen
5191 * or delta, and if we miss something that might raise
5192 * the minlen it will merely pessimise a little.
5194 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5195 * might result in a minlen of 1 and not of 4,
5196 * but this doesn't make us mismatch, just try a bit
5197 * harder than we should.
5199 scan= regnext(scan);
5205 || !PAREN_TEST(recursed_depth - 1, paren)
5207 /* it is quite possible that there are more efficient ways
5208 * to do this. We maintain a bitmap per level of recursion
5209 * of which patterns we have entered so we can detect if a
5210 * pattern creates a possible infinite loop. When we
5211 * recurse down a level we copy the previous levels bitmap
5212 * down. When we are at recursion level 0 we zero the top
5213 * level bitmap. It would be nice to implement a different
5214 * more efficient way of doing this. In particular the top
5215 * level bitmap may be unnecessary.
5217 if (!recursed_depth) {
5218 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5220 Copy(PAREN_OFFSET(recursed_depth - 1),
5221 PAREN_OFFSET(recursed_depth),
5222 RExC_study_chunk_recursed_bytes, U8);
5224 /* we havent recursed into this paren yet, so recurse into it */
5225 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5226 PAREN_SET(recursed_depth, paren);
5227 my_recursed_depth= recursed_depth + 1;
5229 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5230 /* some form of infinite recursion, assume infinite length
5232 if (flags & SCF_DO_SUBSTR) {
5233 scan_commit(pRExC_state, data, minlenp, is_inf);
5234 data->cur_is_floating = 1;
5236 is_inf = is_inf_internal = 1;
5237 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5238 ssc_anything(data->start_class);
5239 flags &= ~SCF_DO_STCLASS;
5241 start= NULL; /* reset start so we dont recurse later on. */
5246 end = regnext(scan);
5249 scan_frame *newframe;
5251 if (!RExC_frame_last) {
5252 Newxz(newframe, 1, scan_frame);
5253 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5254 RExC_frame_head= newframe;
5256 } else if (!RExC_frame_last->next_frame) {
5257 Newxz(newframe, 1, scan_frame);
5258 RExC_frame_last->next_frame= newframe;
5259 newframe->prev_frame= RExC_frame_last;
5262 newframe= RExC_frame_last->next_frame;
5264 RExC_frame_last= newframe;
5266 newframe->next_regnode = regnext(scan);
5267 newframe->last_regnode = last;
5268 newframe->stopparen = stopparen;
5269 newframe->prev_recursed_depth = recursed_depth;
5270 newframe->this_prev_frame= frame;
5271 newframe->in_gosub = (
5272 (frame && frame->in_gosub) || OP(scan) == GOSUB
5275 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5276 DEBUG_PEEP("fnew", scan, depth, flags);
5283 recursed_depth= my_recursed_depth;
5288 else if ( OP(scan) == EXACT
5289 || OP(scan) == LEXACT
5290 || OP(scan) == EXACT_REQ8
5291 || OP(scan) == LEXACT_REQ8
5292 || OP(scan) == EXACTL)
5294 SSize_t bytelen = STR_LEN(scan), charlen;
5298 const U8 * const s = (U8*)STRING(scan);
5299 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5300 charlen = utf8_length(s, s + bytelen);
5302 uc = *((U8*)STRING(scan));
5306 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5307 /* The code below prefers earlier match for fixed
5308 offset, later match for variable offset. */
5309 if (data->last_end == -1) { /* Update the start info. */
5310 data->last_start_min = data->pos_min;
5311 data->last_start_max =
5312 is_inf ? OPTIMIZE_INFTY
5313 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5314 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5316 sv_catpvn(data->last_found, STRING(scan), bytelen);
5318 SvUTF8_on(data->last_found);
5320 SV * const sv = data->last_found;
5321 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5322 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5323 if (mg && mg->mg_len >= 0)
5324 mg->mg_len += charlen;
5326 data->last_end = data->pos_min + charlen;
5327 data->pos_min += charlen; /* As in the first entry. */
5328 data->flags &= ~SF_BEFORE_EOL;
5331 /* ANDing the code point leaves at most it, and not in locale, and
5332 * can't match null string */
5333 if (flags & SCF_DO_STCLASS_AND) {
5334 ssc_cp_and(data->start_class, uc);
5335 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5336 ssc_clear_locale(data->start_class);
5338 else if (flags & SCF_DO_STCLASS_OR) {
5339 ssc_add_cp(data->start_class, uc);
5340 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5342 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5343 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5345 flags &= ~SCF_DO_STCLASS;
5347 else if (PL_regkind[OP(scan)] == EXACT) {
5348 /* But OP != EXACT!, so is EXACTFish */
5349 SSize_t bytelen = STR_LEN(scan), charlen;
5350 const U8 * s = (U8*)STRING(scan);
5352 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5353 * with the mask set to the complement of the bit that differs
5354 * between upper and lower case, and the lowest code point of the
5355 * pair (which the '&' forces) */
5358 && ( OP(scan) == EXACTFAA
5359 || ( OP(scan) == EXACTFU
5360 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5363 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5366 ARG_SET(scan, *s & mask);
5368 /* we're not EXACTFish any more, so restudy */
5372 /* Search for fixed substrings supports EXACT only. */
5373 if (flags & SCF_DO_SUBSTR) {
5375 scan_commit(pRExC_state, data, minlenp, is_inf);
5377 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5378 if (unfolded_multi_char) {
5379 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5381 min += charlen - min_subtract;
5383 delta += min_subtract;
5384 if (flags & SCF_DO_SUBSTR) {
5385 data->pos_min += charlen - min_subtract;
5386 if (data->pos_min < 0) {
5389 data->pos_delta += min_subtract;
5391 data->cur_is_floating = 1; /* float */
5395 if (flags & SCF_DO_STCLASS) {
5396 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5398 assert(EXACTF_invlist);
5399 if (flags & SCF_DO_STCLASS_AND) {
5400 if (OP(scan) != EXACTFL)
5401 ssc_clear_locale(data->start_class);
5402 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5403 ANYOF_POSIXL_ZERO(data->start_class);
5404 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5406 else { /* SCF_DO_STCLASS_OR */
5407 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5408 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5410 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5411 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5413 flags &= ~SCF_DO_STCLASS;
5414 SvREFCNT_dec(EXACTF_invlist);
5417 else if (REGNODE_VARIES(OP(scan))) {
5418 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5419 I32 fl = 0, f = flags;
5420 regnode * const oscan = scan;
5421 regnode_ssc this_class;
5422 regnode_ssc *oclass = NULL;
5423 I32 next_is_eval = 0;
5425 switch (PL_regkind[OP(scan)]) {
5426 case WHILEM: /* End of (?:...)* . */
5427 scan = NEXTOPER(scan);
5430 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5431 next = NEXTOPER(scan);
5432 if ( OP(next) == EXACT
5433 || OP(next) == LEXACT
5434 || OP(next) == EXACT_REQ8
5435 || OP(next) == LEXACT_REQ8
5436 || OP(next) == EXACTL
5437 || (flags & SCF_DO_STCLASS))
5440 maxcount = REG_INFTY;
5441 next = regnext(scan);
5442 scan = NEXTOPER(scan);
5446 if (flags & SCF_DO_SUBSTR)
5451 next = NEXTOPER(scan);
5453 /* This temporary node can now be turned into EXACTFU, and
5454 * must, as regexec.c doesn't handle it */
5455 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5459 if ( STR_LEN(next) == 1
5460 && isALPHA_A(* STRING(next))
5461 && ( OP(next) == EXACTFAA
5462 || ( OP(next) == EXACTFU
5463 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5466 /* These differ in just one bit */
5467 U8 mask = ~ ('A' ^ 'a');
5469 assert(isALPHA_A(* STRING(next)));
5471 /* Then replace it by an ANYOFM node, with
5472 * the mask set to the complement of the
5473 * bit that differs between upper and lower
5474 * case, and the lowest code point of the
5475 * pair (which the '&' forces) */
5477 ARG_SET(next, *STRING(next) & mask);
5481 if (flags & SCF_DO_STCLASS) {
5483 maxcount = REG_INFTY;
5484 next = regnext(scan);
5485 scan = NEXTOPER(scan);
5488 if (flags & SCF_DO_SUBSTR) {
5489 scan_commit(pRExC_state, data, minlenp, is_inf);
5490 /* Cannot extend fixed substrings */
5491 data->cur_is_floating = 1; /* float */
5493 is_inf = is_inf_internal = 1;
5494 scan = regnext(scan);
5495 goto optimize_curly_tail;
5497 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5498 && (scan->flags == stopparen))
5503 mincount = ARG1(scan);
5504 maxcount = ARG2(scan);
5506 next = regnext(scan);
5507 if (OP(scan) == CURLYX) {
5508 I32 lp = (data ? *(data->last_closep) : 0);
5509 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5511 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5512 next_is_eval = (OP(scan) == EVAL);
5514 if (flags & SCF_DO_SUBSTR) {
5516 scan_commit(pRExC_state, data, minlenp, is_inf);
5517 /* Cannot extend fixed substrings */
5518 pos_before = data->pos_min;
5522 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5524 data->flags |= SF_IS_INF;
5526 if (flags & SCF_DO_STCLASS) {
5527 ssc_init(pRExC_state, &this_class);
5528 oclass = data->start_class;
5529 data->start_class = &this_class;
5530 f |= SCF_DO_STCLASS_AND;
5531 f &= ~SCF_DO_STCLASS_OR;
5533 /* Exclude from super-linear cache processing any {n,m}
5534 regops for which the combination of input pos and regex
5535 pos is not enough information to determine if a match
5538 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5539 regex pos at the \s*, the prospects for a match depend not
5540 only on the input position but also on how many (bar\s*)
5541 repeats into the {4,8} we are. */
5542 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5543 f &= ~SCF_WHILEM_VISITED_POS;
5545 /* This will finish on WHILEM, setting scan, or on NULL: */
5546 /* recurse study_chunk() on loop bodies */
5547 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5548 last, data, stopparen, recursed_depth, NULL,
5550 ? (f & ~SCF_DO_SUBSTR)
5554 if (flags & SCF_DO_STCLASS)
5555 data->start_class = oclass;
5556 if (mincount == 0 || minnext == 0) {
5557 if (flags & SCF_DO_STCLASS_OR) {
5558 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5560 else if (flags & SCF_DO_STCLASS_AND) {
5561 /* Switch to OR mode: cache the old value of
5562 * data->start_class */
5564 StructCopy(data->start_class, and_withp, regnode_ssc);
5565 flags &= ~SCF_DO_STCLASS_AND;
5566 StructCopy(&this_class, data->start_class, regnode_ssc);
5567 flags |= SCF_DO_STCLASS_OR;
5568 ANYOF_FLAGS(data->start_class)
5569 |= SSC_MATCHES_EMPTY_STRING;
5571 } else { /* Non-zero len */
5572 if (flags & SCF_DO_STCLASS_OR) {
5573 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5574 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5576 else if (flags & SCF_DO_STCLASS_AND)
5577 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5578 flags &= ~SCF_DO_STCLASS;
5580 if (!scan) /* It was not CURLYX, but CURLY. */
5582 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5583 /* ? quantifier ok, except for (?{ ... }) */
5584 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5585 && (minnext == 0) && (deltanext == 0)
5586 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5587 && maxcount <= REG_INFTY/3) /* Complement check for big
5590 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5591 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5592 "Quantifier unexpected on zero-length expression "
5593 "in regex m/%" UTF8f "/",
5594 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5598 min += minnext * mincount;
5599 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5600 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5601 is_inf |= is_inf_internal;
5603 delta = OPTIMIZE_INFTY;
5605 delta += (minnext + deltanext) * maxcount
5606 - minnext * mincount;
5608 /* Try powerful optimization CURLYX => CURLYN. */
5609 if ( OP(oscan) == CURLYX && data
5610 && data->flags & SF_IN_PAR
5611 && !(data->flags & SF_HAS_EVAL)
5612 && !deltanext && minnext == 1
5615 /* Try to optimize to CURLYN. */
5616 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5617 regnode * const nxt1 = nxt;
5624 if (!REGNODE_SIMPLE(OP(nxt))
5625 && !(PL_regkind[OP(nxt)] == EXACT
5626 && STR_LEN(nxt) == 1))
5632 if (OP(nxt) != CLOSE)
5634 if (RExC_open_parens) {
5637 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5640 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5642 /* Now we know that nxt2 is the only contents: */
5643 oscan->flags = (U8)ARG(nxt);
5645 OP(nxt1) = NOTHING; /* was OPEN. */
5648 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5649 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5650 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5651 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5652 OP(nxt + 1) = OPTIMIZED; /* was count. */
5653 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5658 /* Try optimization CURLYX => CURLYM. */
5659 if ( OP(oscan) == CURLYX && data
5660 && !(data->flags & SF_HAS_PAR)
5661 && !(data->flags & SF_HAS_EVAL)
5662 && !deltanext /* atom is fixed width */
5663 && minnext != 0 /* CURLYM can't handle zero width */
5664 /* Nor characters whose fold at run-time may be
5665 * multi-character */
5666 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5669 /* XXXX How to optimize if data == 0? */
5670 /* Optimize to a simpler form. */
5671 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5675 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5676 && (OP(nxt2) != WHILEM))
5678 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5679 /* Need to optimize away parenths. */
5680 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5681 /* Set the parenth number. */
5682 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5684 oscan->flags = (U8)ARG(nxt);
5685 if (RExC_open_parens) {
5687 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5690 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5693 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5694 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5697 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5698 OP(nxt + 1) = OPTIMIZED; /* was count. */
5699 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5700 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5703 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5704 regnode *nnxt = regnext(nxt1);
5706 if (reg_off_by_arg[OP(nxt1)])
5707 ARG_SET(nxt1, nxt2 - nxt1);
5708 else if (nxt2 - nxt1 < U16_MAX)
5709 NEXT_OFF(nxt1) = nxt2 - nxt1;
5711 OP(nxt) = NOTHING; /* Cannot beautify */
5716 /* Optimize again: */
5717 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5718 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5719 NULL, stopparen, recursed_depth, NULL, 0,
5725 else if ((OP(oscan) == CURLYX)
5726 && (flags & SCF_WHILEM_VISITED_POS)
5727 /* See the comment on a similar expression above.
5728 However, this time it's not a subexpression
5729 we care about, but the expression itself. */
5730 && (maxcount == REG_INFTY)
5732 /* This stays as CURLYX, we can put the count/of pair. */
5733 /* Find WHILEM (as in regexec.c) */
5734 regnode *nxt = oscan + NEXT_OFF(oscan);
5736 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5738 nxt = PREVOPER(nxt);
5739 if (nxt->flags & 0xf) {
5740 /* we've already set whilem count on this node */
5741 } else if (++data->whilem_c < 16) {
5742 assert(data->whilem_c <= RExC_whilem_seen);
5743 nxt->flags = (U8)(data->whilem_c
5744 | (RExC_whilem_seen << 4)); /* On WHILEM */
5747 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5749 if (flags & SCF_DO_SUBSTR) {
5750 SV *last_str = NULL;
5751 STRLEN last_chrs = 0;
5752 int counted = mincount != 0;
5754 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5756 SSize_t b = pos_before >= data->last_start_min
5757 ? pos_before : data->last_start_min;
5759 const char * const s = SvPV_const(data->last_found, l);
5760 SSize_t old = b - data->last_start_min;
5764 old = utf8_hop_forward((U8*)s, old,
5765 (U8 *) SvEND(data->last_found))
5768 /* Get the added string: */
5769 last_str = newSVpvn_utf8(s + old, l, UTF);
5770 last_chrs = UTF ? utf8_length((U8*)(s + old),
5771 (U8*)(s + old + l)) : l;
5772 if (deltanext == 0 && pos_before == b) {
5773 /* What was added is a constant string */
5776 SvGROW(last_str, (mincount * l) + 1);
5777 repeatcpy(SvPVX(last_str) + l,
5778 SvPVX_const(last_str), l,
5780 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5781 /* Add additional parts. */
5782 SvCUR_set(data->last_found,
5783 SvCUR(data->last_found) - l);
5784 sv_catsv(data->last_found, last_str);
5786 SV * sv = data->last_found;
5788 SvUTF8(sv) && SvMAGICAL(sv) ?
5789 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5790 if (mg && mg->mg_len >= 0)
5791 mg->mg_len += last_chrs * (mincount-1);
5793 last_chrs *= mincount;
5794 data->last_end += l * (mincount - 1);
5797 /* start offset must point into the last copy */
5798 data->last_start_min += minnext * (mincount - 1);
5799 data->last_start_max =
5802 : data->last_start_max +
5803 (maxcount - 1) * (minnext + data->pos_delta);
5806 /* It is counted once already... */
5807 data->pos_min += minnext * (mincount - counted);
5809 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5810 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5811 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5812 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5814 if (deltanext != OPTIMIZE_INFTY)
5815 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5816 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5817 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5819 if (deltanext == OPTIMIZE_INFTY
5820 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5821 data->pos_delta = OPTIMIZE_INFTY;
5823 data->pos_delta += - counted * deltanext +
5824 (minnext + deltanext) * maxcount - minnext * mincount;
5825 if (mincount != maxcount) {
5826 /* Cannot extend fixed substrings found inside
5828 scan_commit(pRExC_state, data, minlenp, is_inf);
5829 if (mincount && last_str) {
5830 SV * const sv = data->last_found;
5831 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5832 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5836 sv_setsv(sv, last_str);
5837 data->last_end = data->pos_min;
5838 data->last_start_min = data->pos_min - last_chrs;
5839 data->last_start_max = is_inf
5841 : data->pos_min + data->pos_delta - last_chrs;
5843 data->cur_is_floating = 1; /* float */
5845 SvREFCNT_dec(last_str);
5847 if (data && (fl & SF_HAS_EVAL))
5848 data->flags |= SF_HAS_EVAL;
5849 optimize_curly_tail:
5850 if (OP(oscan) != CURLYX) {
5851 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5853 NEXT_OFF(oscan) += NEXT_OFF(next);
5858 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5862 if (flags & SCF_DO_SUBSTR) {
5863 /* Cannot expect anything... */
5864 scan_commit(pRExC_state, data, minlenp, is_inf);
5865 data->cur_is_floating = 1; /* float */
5867 is_inf = is_inf_internal = 1;
5868 if (flags & SCF_DO_STCLASS_OR) {
5869 if (OP(scan) == CLUMP) {
5870 /* Actually is any start char, but very few code points
5871 * aren't start characters */
5872 ssc_match_all_cp(data->start_class);
5875 ssc_anything(data->start_class);
5878 flags &= ~SCF_DO_STCLASS;
5882 else if (OP(scan) == LNBREAK) {
5883 if (flags & SCF_DO_STCLASS) {
5884 if (flags & SCF_DO_STCLASS_AND) {
5885 ssc_intersection(data->start_class,
5886 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5887 ssc_clear_locale(data->start_class);
5888 ANYOF_FLAGS(data->start_class)
5889 &= ~SSC_MATCHES_EMPTY_STRING;
5891 else if (flags & SCF_DO_STCLASS_OR) {
5892 ssc_union(data->start_class,
5893 PL_XPosix_ptrs[_CC_VERTSPACE],
5895 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5897 /* See commit msg for
5898 * 749e076fceedeb708a624933726e7989f2302f6a */
5899 ANYOF_FLAGS(data->start_class)
5900 &= ~SSC_MATCHES_EMPTY_STRING;
5902 flags &= ~SCF_DO_STCLASS;
5905 if (delta != OPTIMIZE_INFTY)
5906 delta++; /* Because of the 2 char string cr-lf */
5907 if (flags & SCF_DO_SUBSTR) {
5908 /* Cannot expect anything... */
5909 scan_commit(pRExC_state, data, minlenp, is_inf);
5911 if (data->pos_delta != OPTIMIZE_INFTY) {
5912 data->pos_delta += 1;
5914 data->cur_is_floating = 1; /* float */
5917 else if (REGNODE_SIMPLE(OP(scan))) {
5919 if (flags & SCF_DO_SUBSTR) {
5920 scan_commit(pRExC_state, data, minlenp, is_inf);
5924 if (flags & SCF_DO_STCLASS) {
5926 SV* my_invlist = NULL;
5929 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5930 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5932 /* Some of the logic below assumes that switching
5933 locale on will only add false positives. */
5938 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5942 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5943 ssc_match_all_cp(data->start_class);
5948 SV* REG_ANY_invlist = _new_invlist(2);
5949 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5951 if (flags & SCF_DO_STCLASS_OR) {
5952 ssc_union(data->start_class,
5954 TRUE /* TRUE => invert, hence all but \n
5958 else if (flags & SCF_DO_STCLASS_AND) {
5959 ssc_intersection(data->start_class,
5961 TRUE /* TRUE => invert */
5963 ssc_clear_locale(data->start_class);
5965 SvREFCNT_dec_NN(REG_ANY_invlist);
5977 if (flags & SCF_DO_STCLASS_AND)
5978 ssc_and(pRExC_state, data->start_class,
5979 (regnode_charclass *) scan);
5981 ssc_or(pRExC_state, data->start_class,
5982 (regnode_charclass *) scan);
5985 case NANYOFM: /* NANYOFM already contains the inversion of the
5986 input ANYOF data, so, unlike things like
5987 NPOSIXA, don't change 'invert' to TRUE */
5991 SV* cp_list = get_ANYOFM_contents(scan);
5993 if (flags & SCF_DO_STCLASS_OR) {
5994 ssc_union(data->start_class, cp_list, invert);
5996 else if (flags & SCF_DO_STCLASS_AND) {
5997 ssc_intersection(data->start_class, cp_list, invert);
6000 SvREFCNT_dec_NN(cp_list);
6009 cp_list = _add_range_to_invlist(cp_list,
6011 ANYOFRbase(scan) + ANYOFRdelta(scan));
6013 if (flags & SCF_DO_STCLASS_OR) {
6014 ssc_union(data->start_class, cp_list, invert);
6016 else if (flags & SCF_DO_STCLASS_AND) {
6017 ssc_intersection(data->start_class, cp_list, invert);
6020 SvREFCNT_dec_NN(cp_list);
6029 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6030 if (flags & SCF_DO_STCLASS_AND) {
6031 bool was_there = cBOOL(
6032 ANYOF_POSIXL_TEST(data->start_class,
6034 ANYOF_POSIXL_ZERO(data->start_class);
6035 if (was_there) { /* Do an AND */
6036 ANYOF_POSIXL_SET(data->start_class, namedclass);
6038 /* No individual code points can now match */
6039 data->start_class->invlist
6040 = sv_2mortal(_new_invlist(0));
6043 int complement = namedclass + ((invert) ? -1 : 1);
6045 assert(flags & SCF_DO_STCLASS_OR);
6047 /* If the complement of this class was already there,
6048 * the result is that they match all code points,
6049 * (\d + \D == everything). Remove the classes from
6050 * future consideration. Locale is not relevant in
6052 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6053 ssc_match_all_cp(data->start_class);
6054 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6055 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6057 else { /* The usual case; just add this class to the
6059 ANYOF_POSIXL_SET(data->start_class, namedclass);
6064 case NPOSIXA: /* For these, we always know the exact set of
6069 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6070 goto join_posix_and_ascii;
6078 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6080 /* NPOSIXD matches all upper Latin1 code points unless the
6081 * target string being matched is UTF-8, which is
6082 * unknowable until match time. Since we are going to
6083 * invert, we want to get rid of all of them so that the
6084 * inversion will match all */
6085 if (OP(scan) == NPOSIXD) {
6086 _invlist_subtract(my_invlist, PL_UpperLatin1,
6090 join_posix_and_ascii:
6092 if (flags & SCF_DO_STCLASS_AND) {
6093 ssc_intersection(data->start_class, my_invlist, invert);
6094 ssc_clear_locale(data->start_class);
6097 assert(flags & SCF_DO_STCLASS_OR);
6098 ssc_union(data->start_class, my_invlist, invert);
6100 SvREFCNT_dec(my_invlist);
6102 if (flags & SCF_DO_STCLASS_OR)
6103 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6104 flags &= ~SCF_DO_STCLASS;
6107 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6108 data->flags |= (OP(scan) == MEOL
6111 scan_commit(pRExC_state, data, minlenp, is_inf);
6114 else if ( PL_regkind[OP(scan)] == BRANCHJ
6115 /* Lookbehind, or need to calculate parens/evals/stclass: */
6116 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6117 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6119 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6120 || OP(scan) == UNLESSM )
6122 /* Negative Lookahead/lookbehind
6123 In this case we can't do fixed string optimisation.
6126 SSize_t deltanext, minnext, fake = 0;
6131 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6133 data_fake.whilem_c = data->whilem_c;
6134 data_fake.last_closep = data->last_closep;
6137 data_fake.last_closep = &fake;
6138 data_fake.pos_delta = delta;
6139 if ( flags & SCF_DO_STCLASS && !scan->flags
6140 && OP(scan) == IFMATCH ) { /* Lookahead */
6141 ssc_init(pRExC_state, &intrnl);
6142 data_fake.start_class = &intrnl;
6143 f |= SCF_DO_STCLASS_AND;
6145 if (flags & SCF_WHILEM_VISITED_POS)
6146 f |= SCF_WHILEM_VISITED_POS;
6147 next = regnext(scan);
6148 nscan = NEXTOPER(NEXTOPER(scan));
6150 /* recurse study_chunk() for lookahead body */
6151 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6152 last, &data_fake, stopparen,
6153 recursed_depth, NULL, f, depth+1);
6156 || deltanext > (I32) U8_MAX
6157 || minnext > (I32)U8_MAX
6158 || minnext + deltanext > (I32)U8_MAX)
6160 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6164 /* The 'next_off' field has been repurposed to count the
6165 * additional starting positions to try beyond the initial
6166 * one. (This leaves it at 0 for non-variable length
6167 * matches to avoid breakage for those not using this
6170 scan->next_off = deltanext;
6171 ckWARNexperimental(RExC_parse,
6172 WARN_EXPERIMENTAL__VLB,
6173 "Variable length lookbehind is experimental");
6175 scan->flags = (U8)minnext + deltanext;
6178 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6180 if (data_fake.flags & SF_HAS_EVAL)
6181 data->flags |= SF_HAS_EVAL;
6182 data->whilem_c = data_fake.whilem_c;
6184 if (f & SCF_DO_STCLASS_AND) {
6185 if (flags & SCF_DO_STCLASS_OR) {
6186 /* OR before, AND after: ideally we would recurse with
6187 * data_fake to get the AND applied by study of the
6188 * remainder of the pattern, and then derecurse;
6189 * *** HACK *** for now just treat as "no information".
6190 * See [perl #56690].
6192 ssc_init(pRExC_state, data->start_class);
6194 /* AND before and after: combine and continue. These
6195 * assertions are zero-length, so can match an EMPTY
6197 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6198 ANYOF_FLAGS(data->start_class)
6199 |= SSC_MATCHES_EMPTY_STRING;
6203 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6205 /* Positive Lookahead/lookbehind
6206 In this case we can do fixed string optimisation,
6207 but we must be careful about it. Note in the case of
6208 lookbehind the positions will be offset by the minimum
6209 length of the pattern, something we won't know about
6210 until after the recurse.
6212 SSize_t deltanext, fake = 0;
6216 /* We use SAVEFREEPV so that when the full compile
6217 is finished perl will clean up the allocated
6218 minlens when it's all done. This way we don't
6219 have to worry about freeing them when we know
6220 they wont be used, which would be a pain.
6223 Newx( minnextp, 1, SSize_t );
6224 SAVEFREEPV(minnextp);
6227 StructCopy(data, &data_fake, scan_data_t);
6228 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6231 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6232 data_fake.last_found=newSVsv(data->last_found);
6236 data_fake.last_closep = &fake;
6237 data_fake.flags = 0;
6238 data_fake.substrs[0].flags = 0;
6239 data_fake.substrs[1].flags = 0;
6240 data_fake.pos_delta = delta;
6242 data_fake.flags |= SF_IS_INF;
6243 if ( flags & SCF_DO_STCLASS && !scan->flags
6244 && OP(scan) == IFMATCH ) { /* Lookahead */
6245 ssc_init(pRExC_state, &intrnl);
6246 data_fake.start_class = &intrnl;
6247 f |= SCF_DO_STCLASS_AND;
6249 if (flags & SCF_WHILEM_VISITED_POS)
6250 f |= SCF_WHILEM_VISITED_POS;
6251 next = regnext(scan);
6252 nscan = NEXTOPER(NEXTOPER(scan));
6254 /* positive lookahead study_chunk() recursion */
6255 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6256 &deltanext, last, &data_fake,
6257 stopparen, recursed_depth, NULL,
6260 assert(0); /* This code has never been tested since this
6261 is normally not compiled */
6263 || deltanext > (I32) U8_MAX
6264 || *minnextp > (I32)U8_MAX
6265 || *minnextp + deltanext > (I32)U8_MAX)
6267 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6272 scan->next_off = deltanext;
6274 scan->flags = (U8)*minnextp + deltanext;
6279 if (f & SCF_DO_STCLASS_AND) {
6280 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6281 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6286 if (data_fake.flags & SF_HAS_EVAL)
6287 data->flags |= SF_HAS_EVAL;
6288 data->whilem_c = data_fake.whilem_c;
6289 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6291 if (RExC_rx->minlen<*minnextp)
6292 RExC_rx->minlen=*minnextp;
6293 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6294 SvREFCNT_dec_NN(data_fake.last_found);
6296 for (i = 0; i < 2; i++) {
6297 if (data_fake.substrs[i].minlenp != minlenp) {
6298 data->substrs[i].min_offset =
6299 data_fake.substrs[i].min_offset;
6300 data->substrs[i].max_offset =
6301 data_fake.substrs[i].max_offset;
6302 data->substrs[i].minlenp =
6303 data_fake.substrs[i].minlenp;
6304 data->substrs[i].lookbehind += scan->flags;
6312 else if (OP(scan) == OPEN) {
6313 if (stopparen != (I32)ARG(scan))
6316 else if (OP(scan) == CLOSE) {
6317 if (stopparen == (I32)ARG(scan)) {
6320 if ((I32)ARG(scan) == is_par) {
6321 next = regnext(scan);
6323 if ( next && (OP(next) != WHILEM) && next < last)
6324 is_par = 0; /* Disable optimization */
6327 *(data->last_closep) = ARG(scan);
6329 else if (OP(scan) == EVAL) {
6331 data->flags |= SF_HAS_EVAL;
6333 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6334 if (flags & SCF_DO_SUBSTR) {
6335 scan_commit(pRExC_state, data, minlenp, is_inf);
6336 flags &= ~SCF_DO_SUBSTR;
6338 if (data && OP(scan)==ACCEPT) {
6339 data->flags |= SCF_SEEN_ACCEPT;
6344 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6346 if (flags & SCF_DO_SUBSTR) {
6347 scan_commit(pRExC_state, data, minlenp, is_inf);
6348 data->cur_is_floating = 1; /* float */
6350 is_inf = is_inf_internal = 1;
6351 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6352 ssc_anything(data->start_class);
6353 flags &= ~SCF_DO_STCLASS;
6355 else if (OP(scan) == GPOS) {
6356 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6357 !(delta || is_inf || (data && data->pos_delta)))
6359 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6360 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6361 if (RExC_rx->gofs < (STRLEN)min)
6362 RExC_rx->gofs = min;
6364 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6368 #ifdef TRIE_STUDY_OPT
6369 #ifdef FULL_TRIE_STUDY
6370 else if (PL_regkind[OP(scan)] == TRIE) {
6371 /* NOTE - There is similar code to this block above for handling
6372 BRANCH nodes on the initial study. If you change stuff here
6374 regnode *trie_node= scan;
6375 regnode *tail= regnext(scan);
6376 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6377 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6380 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6381 /* Cannot merge strings after this. */
6382 scan_commit(pRExC_state, data, minlenp, is_inf);
6384 if (flags & SCF_DO_STCLASS)
6385 ssc_init_zero(pRExC_state, &accum);
6391 const regnode *nextbranch= NULL;
6394 for ( word=1 ; word <= trie->wordcount ; word++)
6396 SSize_t deltanext=0, minnext=0, f = 0, fake;
6397 regnode_ssc this_class;
6399 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6401 data_fake.whilem_c = data->whilem_c;
6402 data_fake.last_closep = data->last_closep;
6405 data_fake.last_closep = &fake;
6406 data_fake.pos_delta = delta;
6407 if (flags & SCF_DO_STCLASS) {
6408 ssc_init(pRExC_state, &this_class);
6409 data_fake.start_class = &this_class;
6410 f = SCF_DO_STCLASS_AND;
6412 if (flags & SCF_WHILEM_VISITED_POS)
6413 f |= SCF_WHILEM_VISITED_POS;
6415 if (trie->jump[word]) {
6417 nextbranch = trie_node + trie->jump[0];
6418 scan= trie_node + trie->jump[word];
6419 /* We go from the jump point to the branch that follows
6420 it. Note this means we need the vestigal unused
6421 branches even though they arent otherwise used. */
6422 /* optimise study_chunk() for TRIE */
6423 minnext = study_chunk(pRExC_state, &scan, minlenp,
6424 &deltanext, (regnode *)nextbranch, &data_fake,
6425 stopparen, recursed_depth, NULL, f, depth+1);
6427 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6428 nextbranch= regnext((regnode*)nextbranch);
6430 if (min1 > (SSize_t)(minnext + trie->minlen))
6431 min1 = minnext + trie->minlen;
6432 if (deltanext == OPTIMIZE_INFTY) {
6433 is_inf = is_inf_internal = 1;
6434 max1 = OPTIMIZE_INFTY;
6435 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6436 max1 = minnext + deltanext + trie->maxlen;
6438 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6440 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6441 if ( stopmin > min + min1)
6442 stopmin = min + min1;
6443 flags &= ~SCF_DO_SUBSTR;
6445 data->flags |= SCF_SEEN_ACCEPT;
6448 if (data_fake.flags & SF_HAS_EVAL)
6449 data->flags |= SF_HAS_EVAL;
6450 data->whilem_c = data_fake.whilem_c;
6452 if (flags & SCF_DO_STCLASS)
6453 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6456 if (flags & SCF_DO_SUBSTR) {
6457 data->pos_min += min1;
6458 data->pos_delta += max1 - min1;
6459 if (max1 != min1 || is_inf)
6460 data->cur_is_floating = 1; /* float */
6463 if (delta != OPTIMIZE_INFTY) {
6464 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6465 delta += max1 - min1;
6467 delta = OPTIMIZE_INFTY;
6469 if (flags & SCF_DO_STCLASS_OR) {
6470 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6472 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6473 flags &= ~SCF_DO_STCLASS;
6476 else if (flags & SCF_DO_STCLASS_AND) {
6478 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6479 flags &= ~SCF_DO_STCLASS;
6482 /* Switch to OR mode: cache the old value of
6483 * data->start_class */
6485 StructCopy(data->start_class, and_withp, regnode_ssc);
6486 flags &= ~SCF_DO_STCLASS_AND;
6487 StructCopy(&accum, data->start_class, regnode_ssc);
6488 flags |= SCF_DO_STCLASS_OR;
6495 else if (PL_regkind[OP(scan)] == TRIE) {
6496 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6499 min += trie->minlen;
6500 delta += (trie->maxlen - trie->minlen);
6501 flags &= ~SCF_DO_STCLASS; /* xxx */
6502 if (flags & SCF_DO_SUBSTR) {
6503 /* Cannot expect anything... */
6504 scan_commit(pRExC_state, data, minlenp, is_inf);
6505 data->pos_min += trie->minlen;
6506 data->pos_delta += (trie->maxlen - trie->minlen);
6507 if (trie->maxlen != trie->minlen)
6508 data->cur_is_floating = 1; /* float */
6510 if (trie->jump) /* no more substrings -- for now /grr*/
6511 flags &= ~SCF_DO_SUBSTR;
6513 else if (OP(scan) == REGEX_SET) {
6514 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6515 " before optimization", reg_name[REGEX_SET]);
6518 #endif /* old or new */
6519 #endif /* TRIE_STUDY_OPT */
6521 /* Else: zero-length, ignore. */
6522 scan = regnext(scan);
6527 /* we need to unwind recursion. */
6530 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6531 DEBUG_PEEP("fend", scan, depth, flags);
6533 /* restore previous context */
6534 last = frame->last_regnode;
6535 scan = frame->next_regnode;
6536 stopparen = frame->stopparen;
6537 recursed_depth = frame->prev_recursed_depth;
6539 RExC_frame_last = frame->prev_frame;
6540 frame = frame->this_prev_frame;
6541 goto fake_study_recurse;
6545 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6548 *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6550 if (flags & SCF_DO_SUBSTR && is_inf)
6551 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6552 if (is_par > (I32)U8_MAX)
6554 if (is_par && pars==1 && data) {
6555 data->flags |= SF_IN_PAR;
6556 data->flags &= ~SF_HAS_PAR;
6558 else if (pars && data) {
6559 data->flags |= SF_HAS_PAR;
6560 data->flags &= ~SF_IN_PAR;
6562 if (flags & SCF_DO_STCLASS_OR)
6563 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6564 if (flags & SCF_TRIE_RESTUDY)
6565 data->flags |= SCF_TRIE_RESTUDY;
6567 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6569 final_minlen = min < stopmin
6572 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6573 if (final_minlen > OPTIMIZE_INFTY - delta)
6574 RExC_maxlen = OPTIMIZE_INFTY;
6575 else if (RExC_maxlen < final_minlen + delta)
6576 RExC_maxlen = final_minlen + delta;
6578 return final_minlen;
6582 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6584 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6586 PERL_ARGS_ASSERT_ADD_DATA;
6588 Renewc(RExC_rxi->data,
6589 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6590 char, struct reg_data);
6592 Renew(RExC_rxi->data->what, count + n, U8);
6594 Newx(RExC_rxi->data->what, n, U8);
6595 RExC_rxi->data->count = count + n;
6596 Copy(s, RExC_rxi->data->what + count, n, U8);
6600 /*XXX: todo make this not included in a non debugging perl, but appears to be
6601 * used anyway there, in 'use re' */
6602 #ifndef PERL_IN_XSUB_RE
6604 Perl_reginitcolors(pTHX)
6606 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6608 char *t = savepv(s);
6612 t = strchr(t, '\t');
6618 PL_colors[i] = t = (char *)"";
6623 PL_colors[i++] = (char *)"";
6630 #ifdef TRIE_STUDY_OPT
6631 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6634 (data.flags & SCF_TRIE_RESTUDY) \
6642 #define CHECK_RESTUDY_GOTO_butfirst
6646 * pregcomp - compile a regular expression into internal code
6648 * Decides which engine's compiler to call based on the hint currently in
6652 #ifndef PERL_IN_XSUB_RE
6654 /* return the currently in-scope regex engine (or the default if none) */
6656 regexp_engine const *
6657 Perl_current_re_engine(pTHX)
6659 if (IN_PERL_COMPILETIME) {
6660 HV * const table = GvHV(PL_hintgv);
6663 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6664 return &PL_core_reg_engine;
6665 ptr = hv_fetchs(table, "regcomp", FALSE);
6666 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6667 return &PL_core_reg_engine;
6668 return INT2PTR(regexp_engine*, SvIV(*ptr));
6672 if (!PL_curcop->cop_hints_hash)
6673 return &PL_core_reg_engine;
6674 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6675 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6676 return &PL_core_reg_engine;
6677 return INT2PTR(regexp_engine*, SvIV(ptr));
6683 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6685 regexp_engine const *eng = current_re_engine();
6686 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6688 PERL_ARGS_ASSERT_PREGCOMP;
6690 /* Dispatch a request to compile a regexp to correct regexp engine. */
6692 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6695 return CALLREGCOMP_ENG(eng, pattern, flags);
6699 /* public(ish) entry point for the perl core's own regex compiling code.
6700 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6701 * pattern rather than a list of OPs, and uses the internal engine rather
6702 * than the current one */
6705 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6707 SV *pat = pattern; /* defeat constness! */
6709 PERL_ARGS_ASSERT_RE_COMPILE;
6711 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6712 #ifdef PERL_IN_XSUB_RE
6715 &PL_core_reg_engine,
6717 NULL, NULL, rx_flags, 0);
6721 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6725 if (--cbs->refcnt > 0)
6727 for (n = 0; n < cbs->count; n++) {
6728 REGEXP *rx = cbs->cb[n].src_regex;
6730 cbs->cb[n].src_regex = NULL;
6731 SvREFCNT_dec_NN(rx);
6739 static struct reg_code_blocks *
6740 S_alloc_code_blocks(pTHX_ int ncode)
6742 struct reg_code_blocks *cbs;
6743 Newx(cbs, 1, struct reg_code_blocks);
6746 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6748 Newx(cbs->cb, ncode, struct reg_code_block);
6755 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6756 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6757 * point to the realloced string and length.
6759 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6763 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6764 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6766 U8 *const src = (U8*)*pat_p;
6771 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6773 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6774 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6776 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6777 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6780 while (s < *plen_p) {
6781 append_utf8_from_native_byte(src[s], &d);
6783 if (n < num_code_blocks) {
6784 assert(pRExC_state->code_blocks);
6785 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6786 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6787 assert(*(d - 1) == '(');
6790 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6791 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6792 assert(*(d - 1) == ')');
6801 *pat_p = (char*) dst;
6803 RExC_orig_utf8 = RExC_utf8 = 1;
6808 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6809 * while recording any code block indices, and handling overloading,
6810 * nested qr// objects etc. If pat is null, it will allocate a new
6811 * string, or just return the first arg, if there's only one.
6813 * Returns the malloced/updated pat.
6814 * patternp and pat_count is the array of SVs to be concatted;
6815 * oplist is the optional list of ops that generated the SVs;
6816 * recompile_p is a pointer to a boolean that will be set if
6817 * the regex will need to be recompiled.
6818 * delim, if non-null is an SV that will be inserted between each element
6822 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6823 SV *pat, SV ** const patternp, int pat_count,
6824 OP *oplist, bool *recompile_p, SV *delim)
6828 bool use_delim = FALSE;
6829 bool alloced = FALSE;
6831 /* if we know we have at least two args, create an empty string,
6832 * then concatenate args to that. For no args, return an empty string */
6833 if (!pat && pat_count != 1) {
6839 for (svp = patternp; svp < patternp + pat_count; svp++) {
6842 STRLEN orig_patlen = 0;
6844 SV *msv = use_delim ? delim : *svp;
6845 if (!msv) msv = &PL_sv_undef;
6847 /* if we've got a delimiter, we go round the loop twice for each
6848 * svp slot (except the last), using the delimiter the second
6857 if (SvTYPE(msv) == SVt_PVAV) {
6858 /* we've encountered an interpolated array within
6859 * the pattern, e.g. /...@a..../. Expand the list of elements,
6860 * then recursively append elements.
6861 * The code in this block is based on S_pushav() */
6863 AV *const av = (AV*)msv;
6864 const SSize_t maxarg = AvFILL(av) + 1;
6868 assert(oplist->op_type == OP_PADAV
6869 || oplist->op_type == OP_RV2AV);
6870 oplist = OpSIBLING(oplist);
6873 if (SvRMAGICAL(av)) {
6876 Newx(array, maxarg, SV*);
6878 for (i=0; i < maxarg; i++) {
6879 SV ** const svp = av_fetch(av, i, FALSE);
6880 array[i] = svp ? *svp : &PL_sv_undef;
6884 array = AvARRAY(av);
6886 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6887 array, maxarg, NULL, recompile_p,
6889 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6895 /* we make the assumption here that each op in the list of
6896 * op_siblings maps to one SV pushed onto the stack,
6897 * except for code blocks, with have both an OP_NULL and
6899 * This allows us to match up the list of SVs against the
6900 * list of OPs to find the next code block.
6902 * Note that PUSHMARK PADSV PADSV ..
6904 * PADRANGE PADSV PADSV ..
6905 * so the alignment still works. */
6908 if (oplist->op_type == OP_NULL
6909 && (oplist->op_flags & OPf_SPECIAL))
6911 assert(n < pRExC_state->code_blocks->count);
6912 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6913 pRExC_state->code_blocks->cb[n].block = oplist;
6914 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6917 oplist = OpSIBLING(oplist); /* skip CONST */
6920 oplist = OpSIBLING(oplist);;
6923 /* apply magic and QR overloading to arg */
6926 if (SvROK(msv) && SvAMAGIC(msv)) {
6927 SV *sv = AMG_CALLunary(msv, regexp_amg);
6931 if (SvTYPE(sv) != SVt_REGEXP)
6932 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6937 /* try concatenation overload ... */
6938 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6939 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6942 /* overloading involved: all bets are off over literal
6943 * code. Pretend we haven't seen it */
6945 pRExC_state->code_blocks->count -= n;
6949 /* ... or failing that, try "" overload */
6950 while (SvAMAGIC(msv)
6951 && (sv = AMG_CALLunary(msv, string_amg))
6955 && SvRV(msv) == SvRV(sv))
6960 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6964 /* this is a partially unrolled
6965 * sv_catsv_nomg(pat, msv);
6966 * that allows us to adjust code block indices if
6969 char *dst = SvPV_force_nomg(pat, dlen);
6971 if (SvUTF8(msv) && !SvUTF8(pat)) {
6972 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6973 sv_setpvn(pat, dst, dlen);
6976 sv_catsv_nomg(pat, msv);
6980 /* We have only one SV to process, but we need to verify
6981 * it is properly null terminated or we will fail asserts
6982 * later. In theory we probably shouldn't get such SV's,
6983 * but if we do we should handle it gracefully. */
6984 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6985 /* not a string, or a string with a trailing null */
6988 /* a string with no trailing null, we need to copy it
6989 * so it has a trailing null */
6990 pat = sv_2mortal(newSVsv(msv));
6995 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6998 /* extract any code blocks within any embedded qr//'s */
6999 if (rx && SvTYPE(rx) == SVt_REGEXP
7000 && RX_ENGINE((REGEXP*)rx)->op_comp)
7003 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7004 if (ri->code_blocks && ri->code_blocks->count) {
7006 /* the presence of an embedded qr// with code means
7007 * we should always recompile: the text of the
7008 * qr// may not have changed, but it may be a
7009 * different closure than last time */
7011 if (pRExC_state->code_blocks) {
7012 int new_count = pRExC_state->code_blocks->count
7013 + ri->code_blocks->count;
7014 Renew(pRExC_state->code_blocks->cb,
7015 new_count, struct reg_code_block);
7016 pRExC_state->code_blocks->count = new_count;
7019 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7020 ri->code_blocks->count);
7022 for (i=0; i < ri->code_blocks->count; i++) {
7023 struct reg_code_block *src, *dst;
7024 STRLEN offset = orig_patlen
7025 + ReANY((REGEXP *)rx)->pre_prefix;
7026 assert(n < pRExC_state->code_blocks->count);
7027 src = &ri->code_blocks->cb[i];
7028 dst = &pRExC_state->code_blocks->cb[n];
7029 dst->start = src->start + offset;
7030 dst->end = src->end + offset;
7031 dst->block = src->block;
7032 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7041 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7050 /* see if there are any run-time code blocks in the pattern.
7051 * False positives are allowed */
7054 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7055 char *pat, STRLEN plen)
7060 PERL_UNUSED_CONTEXT;
7062 for (s = 0; s < plen; s++) {
7063 if ( pRExC_state->code_blocks
7064 && n < pRExC_state->code_blocks->count
7065 && s == pRExC_state->code_blocks->cb[n].start)
7067 s = pRExC_state->code_blocks->cb[n].end;
7071 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7073 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7075 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7082 /* Handle run-time code blocks. We will already have compiled any direct
7083 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7084 * copy of it, but with any literal code blocks blanked out and
7085 * appropriate chars escaped; then feed it into
7087 * eval "qr'modified_pattern'"
7091 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7095 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7097 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7098 * and merge them with any code blocks of the original regexp.
7100 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7101 * instead, just save the qr and return FALSE; this tells our caller that
7102 * the original pattern needs upgrading to utf8.
7106 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7107 char *pat, STRLEN plen)
7111 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7113 if (pRExC_state->runtime_code_qr) {
7114 /* this is the second time we've been called; this should
7115 * only happen if the main pattern got upgraded to utf8
7116 * during compilation; re-use the qr we compiled first time
7117 * round (which should be utf8 too)
7119 qr = pRExC_state->runtime_code_qr;
7120 pRExC_state->runtime_code_qr = NULL;
7121 assert(RExC_utf8 && SvUTF8(qr));
7127 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7131 /* determine how many extra chars we need for ' and \ escaping */
7132 for (s = 0; s < plen; s++) {
7133 if (pat[s] == '\'' || pat[s] == '\\')
7137 Newx(newpat, newlen, char);
7139 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7141 for (s = 0; s < plen; s++) {
7142 if ( pRExC_state->code_blocks
7143 && n < pRExC_state->code_blocks->count
7144 && s == pRExC_state->code_blocks->cb[n].start)
7146 /* blank out literal code block so that they aren't
7147 * recompiled: eg change from/to:
7157 assert(pat[s] == '(');
7158 assert(pat[s+1] == '?');
7162 while (s < pRExC_state->code_blocks->cb[n].end) {
7170 if (pat[s] == '\'' || pat[s] == '\\')
7175 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7177 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7183 Perl_re_printf( aTHX_
7184 "%sre-parsing pattern for runtime code:%s %s\n",
7185 PL_colors[4], PL_colors[5], newpat);
7188 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7194 PUSHSTACKi(PERLSI_REQUIRE);
7195 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7196 * parsing qr''; normally only q'' does this. It also alters
7198 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7199 SvREFCNT_dec_NN(sv);
7204 SV * const errsv = ERRSV;
7205 if (SvTRUE_NN(errsv))
7206 /* use croak_sv ? */
7207 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7209 assert(SvROK(qr_ref));
7211 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7212 /* the leaving below frees the tmp qr_ref.
7213 * Give qr a life of its own */
7221 if (!RExC_utf8 && SvUTF8(qr)) {
7222 /* first time through; the pattern got upgraded; save the
7223 * qr for the next time through */
7224 assert(!pRExC_state->runtime_code_qr);
7225 pRExC_state->runtime_code_qr = qr;
7230 /* extract any code blocks within the returned qr// */
7233 /* merge the main (r1) and run-time (r2) code blocks into one */
7235 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7236 struct reg_code_block *new_block, *dst;
7237 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7241 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7243 SvREFCNT_dec_NN(qr);
7247 if (!r1->code_blocks)
7248 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7250 r1c = r1->code_blocks->count;
7251 r2c = r2->code_blocks->count;
7253 Newx(new_block, r1c + r2c, struct reg_code_block);
7257 while (i1 < r1c || i2 < r2c) {
7258 struct reg_code_block *src;
7262 src = &r2->code_blocks->cb[i2++];
7266 src = &r1->code_blocks->cb[i1++];
7267 else if ( r1->code_blocks->cb[i1].start
7268 < r2->code_blocks->cb[i2].start)
7270 src = &r1->code_blocks->cb[i1++];
7271 assert(src->end < r2->code_blocks->cb[i2].start);
7274 assert( r1->code_blocks->cb[i1].start
7275 > r2->code_blocks->cb[i2].start);
7276 src = &r2->code_blocks->cb[i2++];
7278 assert(src->end < r1->code_blocks->cb[i1].start);
7281 assert(pat[src->start] == '(');
7282 assert(pat[src->end] == ')');
7283 dst->start = src->start;
7284 dst->end = src->end;
7285 dst->block = src->block;
7286 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7290 r1->code_blocks->count += r2c;
7291 Safefree(r1->code_blocks->cb);
7292 r1->code_blocks->cb = new_block;
7295 SvREFCNT_dec_NN(qr);
7301 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7302 struct reg_substr_datum *rsd,
7303 struct scan_data_substrs *sub,
7304 STRLEN longest_length)
7306 /* This is the common code for setting up the floating and fixed length
7307 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7308 * as to whether succeeded or not */
7312 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7313 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7315 if (! (longest_length
7316 || (eol /* Can't have SEOL and MULTI */
7317 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7319 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7320 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7325 /* copy the information about the longest from the reg_scan_data
7326 over to the program. */
7327 if (SvUTF8(sub->str)) {
7329 rsd->utf8_substr = sub->str;
7331 rsd->substr = sub->str;
7332 rsd->utf8_substr = NULL;
7334 /* end_shift is how many chars that must be matched that
7335 follow this item. We calculate it ahead of time as once the
7336 lookbehind offset is added in we lose the ability to correctly
7338 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7339 rsd->end_shift = ml - sub->min_offset
7341 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7343 + (SvTAIL(sub->str) != 0)
7347 t = (eol/* Can't have SEOL and MULTI */
7348 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7349 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7355 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7357 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7358 * properly wrapped with the right modifiers */
7360 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7361 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7362 != REGEX_DEPENDS_CHARSET);
7364 /* The caret is output if there are any defaults: if not all the STD
7365 * flags are set, or if no character set specifier is needed */
7367 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7369 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7370 == REG_RUN_ON_COMMENT_SEEN);
7371 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7372 >> RXf_PMf_STD_PMMOD_SHIFT);
7373 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7375 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7377 /* We output all the necessary flags; we never output a minus, as all
7378 * those are defaults, so are
7379 * covered by the caret */
7380 const STRLEN wraplen = pat_len + has_p + has_runon
7381 + has_default /* If needs a caret */
7382 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7384 /* If needs a character set specifier */
7385 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7386 + (sizeof("(?:)") - 1);
7388 PERL_ARGS_ASSERT_SET_REGEX_PV;
7390 /* make sure PL_bitcount bounds not exceeded */
7391 assert(sizeof(STD_PAT_MODS) <= 8);
7393 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7396 SvFLAGS(Rx) |= SVf_UTF8;
7399 /* If a default, cover it using the caret */
7401 *p++= DEFAULT_PAT_MOD;
7407 name = get_regex_charset_name(RExC_rx->extflags, &len);
7408 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7410 name = UNICODE_PAT_MODS;
7411 len = sizeof(UNICODE_PAT_MODS) - 1;
7413 Copy(name, p, len, char);
7417 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7420 while((ch = *fptr++)) {
7428 Copy(RExC_precomp, p, pat_len, char);
7429 assert ((RX_WRAPPED(Rx) - p) < 16);
7430 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7433 /* Adding a trailing \n causes this to compile properly:
7434 my $R = qr / A B C # D E/x; /($R)/
7435 Otherwise the parens are considered part of the comment */
7440 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7444 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7445 * regular expression into internal code.
7446 * The pattern may be passed either as:
7447 * a list of SVs (patternp plus pat_count)
7448 * a list of OPs (expr)
7449 * If both are passed, the SV list is used, but the OP list indicates
7450 * which SVs are actually pre-compiled code blocks
7452 * The SVs in the list have magic and qr overloading applied to them (and
7453 * the list may be modified in-place with replacement SVs in the latter
7456 * If the pattern hasn't changed from old_re, then old_re will be
7459 * eng is the current engine. If that engine has an op_comp method, then
7460 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7461 * do the initial concatenation of arguments and pass on to the external
7464 * If is_bare_re is not null, set it to a boolean indicating whether the
7465 * arg list reduced (after overloading) to a single bare regex which has
7466 * been returned (i.e. /$qr/).
7468 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7470 * pm_flags contains the PMf_* flags, typically based on those from the
7471 * pm_flags field of the related PMOP. Currently we're only interested in
7472 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7474 * For many years this code had an initial sizing pass that calculated
7475 * (sometimes incorrectly, leading to security holes) the size needed for the
7476 * compiled pattern. That was changed by commit
7477 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7478 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7479 * references to this sizing pass.
7481 * Now, an initial crude guess as to the size needed is made, based on the
7482 * length of the pattern. Patches welcome to improve that guess. That amount
7483 * of space is malloc'd and then immediately freed, and then clawed back node
7484 * by node. This design is to minimze, to the extent possible, memory churn
7485 * when doing the the reallocs.
7487 * A separate parentheses counting pass may be needed in some cases.
7488 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7491 * The existence of a sizing pass necessitated design decisions that are no
7492 * longer needed. There are potential areas of simplification.
7494 * Beware that the optimization-preparation code in here knows about some
7495 * of the structure of the compiled regexp. [I'll say.]
7499 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7500 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7501 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7504 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7512 SV** new_patternp = patternp;
7514 /* these are all flags - maybe they should be turned
7515 * into a single int with different bit masks */
7516 I32 sawlookahead = 0;
7521 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7523 bool runtime_code = 0;
7525 RExC_state_t RExC_state;
7526 RExC_state_t * const pRExC_state = &RExC_state;
7527 #ifdef TRIE_STUDY_OPT
7529 RExC_state_t copyRExC_state;
7531 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7533 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7535 DEBUG_r(if (!PL_colorset) reginitcolors());
7538 pRExC_state->warn_text = NULL;
7539 pRExC_state->unlexed_names = NULL;
7540 pRExC_state->code_blocks = NULL;
7543 *is_bare_re = FALSE;
7545 if (expr && (expr->op_type == OP_LIST ||
7546 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7547 /* allocate code_blocks if needed */
7551 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7552 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7553 ncode++; /* count of DO blocks */
7556 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7560 /* compile-time pattern with just OP_CONSTs and DO blocks */
7565 /* find how many CONSTs there are */
7568 if (expr->op_type == OP_CONST)
7571 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7572 if (o->op_type == OP_CONST)
7576 /* fake up an SV array */
7578 assert(!new_patternp);
7579 Newx(new_patternp, n, SV*);
7580 SAVEFREEPV(new_patternp);
7584 if (expr->op_type == OP_CONST)
7585 new_patternp[n] = cSVOPx_sv(expr);
7587 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7588 if (o->op_type == OP_CONST)
7589 new_patternp[n++] = cSVOPo_sv;
7594 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7595 "Assembling pattern from %d elements%s\n", pat_count,
7596 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7598 /* set expr to the first arg op */
7600 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7601 && expr->op_type != OP_CONST)
7603 expr = cLISTOPx(expr)->op_first;
7604 assert( expr->op_type == OP_PUSHMARK
7605 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7606 || expr->op_type == OP_PADRANGE);
7607 expr = OpSIBLING(expr);
7610 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7611 expr, &recompile, NULL);
7613 /* handle bare (possibly after overloading) regex: foo =~ $re */
7618 if (SvTYPE(re) == SVt_REGEXP) {
7622 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7623 "Precompiled pattern%s\n",
7624 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7630 exp = SvPV_nomg(pat, plen);
7632 if (!eng->op_comp) {
7633 if ((SvUTF8(pat) && IN_BYTES)
7634 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7636 /* make a temporary copy; either to convert to bytes,
7637 * or to avoid repeating get-magic / overloaded stringify */
7638 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7639 (IN_BYTES ? 0 : SvUTF8(pat)));
7641 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7644 /* ignore the utf8ness if the pattern is 0 length */
7645 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7646 RExC_uni_semantics = 0;
7647 RExC_contains_locale = 0;
7648 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7649 RExC_in_script_run = 0;
7650 RExC_study_started = 0;
7651 pRExC_state->runtime_code_qr = NULL;
7652 RExC_frame_head= NULL;
7653 RExC_frame_last= NULL;
7654 RExC_frame_count= 0;
7655 RExC_latest_warn_offset = 0;
7656 RExC_use_BRANCHJ = 0;
7657 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7658 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7659 RExC_total_parens = 0;
7660 RExC_open_parens = NULL;
7661 RExC_close_parens = NULL;
7662 RExC_paren_names = NULL;
7664 RExC_seen_d_op = FALSE;
7666 RExC_paren_name_list = NULL;
7670 RExC_mysv1= sv_newmortal();
7671 RExC_mysv2= sv_newmortal();
7675 SV *dsv= sv_newmortal();
7676 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7677 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7678 PL_colors[4], PL_colors[5], s);
7681 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7684 if ((pm_flags & PMf_USE_RE_EVAL)
7685 /* this second condition covers the non-regex literal case,
7686 * i.e. $foo =~ '(?{})'. */
7687 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7689 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7692 /* return old regex if pattern hasn't changed */
7693 /* XXX: note in the below we have to check the flags as well as the
7696 * Things get a touch tricky as we have to compare the utf8 flag
7697 * independently from the compile flags. */
7701 && !!RX_UTF8(old_re) == !!RExC_utf8
7702 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7703 && RX_PRECOMP(old_re)
7704 && RX_PRELEN(old_re) == plen
7705 && memEQ(RX_PRECOMP(old_re), exp, plen)
7706 && !runtime_code /* with runtime code, always recompile */ )
7709 SV *dsv= sv_newmortal();
7710 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7711 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7712 PL_colors[4], PL_colors[5], s);
7717 /* Allocate the pattern's SV */
7718 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7719 RExC_rx = ReANY(Rx);
7720 if ( RExC_rx == NULL )
7721 FAIL("Regexp out of space");
7723 rx_flags = orig_rx_flags;
7725 if ( (UTF || RExC_uni_semantics)
7726 && initial_charset == REGEX_DEPENDS_CHARSET)
7729 /* Set to use unicode semantics if the pattern is in utf8 and has the
7730 * 'depends' charset specified, as it means unicode when utf8 */
7731 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7732 RExC_uni_semantics = 1;
7735 RExC_pm_flags = pm_flags;
7738 assert(TAINTING_get || !TAINT_get);
7740 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7742 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7743 /* whoops, we have a non-utf8 pattern, whilst run-time code
7744 * got compiled as utf8. Try again with a utf8 pattern */
7745 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7746 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7750 assert(!pRExC_state->runtime_code_qr);
7756 RExC_in_lookbehind = 0;
7757 RExC_in_lookahead = 0;
7758 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7759 RExC_recode_x_to_native = 0;
7760 RExC_in_multi_char_class = 0;
7762 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7763 RExC_precomp_end = RExC_end = exp + plen;
7765 RExC_whilem_seen = 0;
7767 RExC_recurse = NULL;
7768 RExC_study_chunk_recursed = NULL;
7769 RExC_study_chunk_recursed_bytes= 0;
7770 RExC_recurse_count = 0;
7771 RExC_sets_depth = 0;
7772 pRExC_state->code_index = 0;
7774 /* Initialize the string in the compiled pattern. This is so that there is
7775 * something to output if necessary */
7776 set_regex_pv(pRExC_state, Rx);
7779 Perl_re_printf( aTHX_
7780 "Starting parse and generation\n");
7782 RExC_lastparse=NULL;
7785 /* Allocate space and zero-initialize. Note, the two step process
7786 of zeroing when in debug mode, thus anything assigned has to
7787 happen after that */
7790 /* On the first pass of the parse, we guess how big this will be. Then
7791 * we grow in one operation to that amount and then give it back. As
7792 * we go along, we re-allocate what we need.
7794 * XXX Currently the guess is essentially that the pattern will be an
7795 * EXACT node with one byte input, one byte output. This is crude, and
7796 * better heuristics are welcome.
7798 * On any subsequent passes, we guess what we actually computed in the
7799 * latest earlier pass. Such a pass probably didn't complete so is
7800 * missing stuff. We could improve those guesses by knowing where the
7801 * parse stopped, and use the length so far plus apply the above
7802 * assumption to what's left. */
7803 RExC_size = STR_SZ(RExC_end - RExC_start);
7806 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7807 if ( RExC_rxi == NULL )
7808 FAIL("Regexp out of space");
7810 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7811 RXi_SET( RExC_rx, RExC_rxi );
7813 /* We start from 0 (over from 0 in the case this is a reparse. The first
7814 * node parsed will give back any excess memory we have allocated so far).
7818 /* non-zero initialization begins here */
7819 RExC_rx->engine= eng;
7820 RExC_rx->extflags = rx_flags;
7821 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7823 if (pm_flags & PMf_IS_QR) {
7824 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7825 if (RExC_rxi->code_blocks) {
7826 RExC_rxi->code_blocks->refcnt++;
7830 RExC_rx->intflags = 0;
7832 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7835 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7836 * code makes sure the final byte is an uncounted NUL. But should this
7837 * ever not be the case, lots of things could read beyond the end of the
7838 * buffer: loops like
7839 * while(isFOO(*RExC_parse)) RExC_parse++;
7840 * strchr(RExC_parse, "foo");
7841 * etc. So it is worth noting. */
7842 assert(*RExC_end == '\0');
7846 RExC_parens_buf_size = 0;
7847 RExC_emit_start = RExC_rxi->program;
7848 pRExC_state->code_index = 0;
7850 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7854 if (reg(pRExC_state, 0, &flags, 1)) {
7856 /* Success!, But we may need to redo the parse knowing how many parens
7857 * there actually are */
7858 if (IN_PARENS_PASS) {
7859 flags |= RESTART_PARSE;
7862 /* We have that number in RExC_npar */
7863 RExC_total_parens = RExC_npar;
7865 else if (! MUST_RESTART(flags)) {
7867 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7870 /* Here, we either have success, or we have to redo the parse for some reason */
7871 if (MUST_RESTART(flags)) {
7873 /* It's possible to write a regexp in ascii that represents Unicode
7874 codepoints outside of the byte range, such as via \x{100}. If we
7875 detect such a sequence we have to convert the entire pattern to utf8
7876 and then recompile, as our sizing calculation will have been based
7877 on 1 byte == 1 character, but we will need to use utf8 to encode
7878 at least some part of the pattern, and therefore must convert the whole
7881 if (flags & NEED_UTF8) {
7883 /* We have stored the offset of the final warning output so far.
7884 * That must be adjusted. Any variant characters between the start
7885 * of the pattern and this warning count for 2 bytes in the final,
7886 * so just add them again */
7887 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7888 RExC_latest_warn_offset +=
7889 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7890 + RExC_latest_warn_offset);
7892 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7893 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7894 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7897 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7900 if (ALL_PARENS_COUNTED) {
7901 /* Make enough room for all the known parens, and zero it */
7902 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7903 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7904 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7906 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7907 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7909 else { /* Parse did not complete. Reinitialize the parentheses
7911 RExC_total_parens = 0;
7912 if (RExC_open_parens) {
7913 Safefree(RExC_open_parens);
7914 RExC_open_parens = NULL;
7916 if (RExC_close_parens) {
7917 Safefree(RExC_close_parens);
7918 RExC_close_parens = NULL;
7922 /* Clean up what we did in this parse */
7923 SvREFCNT_dec_NN(RExC_rx_sv);
7928 /* Here, we have successfully parsed and generated the pattern's program
7929 * for the regex engine. We are ready to finish things up and look for
7932 /* Update the string to compile, with correct modifiers, etc */
7933 set_regex_pv(pRExC_state, Rx);
7935 RExC_rx->nparens = RExC_total_parens - 1;
7937 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7938 if (RExC_whilem_seen > 15)
7939 RExC_whilem_seen = 15;
7942 Perl_re_printf( aTHX_
7943 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7945 RExC_lastparse=NULL;
7948 #ifdef RE_TRACK_PATTERN_OFFSETS
7949 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7950 "%s %" UVuf " bytes for offset annotations.\n",
7951 RExC_offsets ? "Got" : "Couldn't get",
7952 (UV)((RExC_offsets[0] * 2 + 1))));
7953 DEBUG_OFFSETS_r(if (RExC_offsets) {
7954 const STRLEN len = RExC_offsets[0];
7956 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7957 Perl_re_printf( aTHX_
7958 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7959 for (i = 1; i <= len; i++) {
7960 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7961 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7962 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7964 Perl_re_printf( aTHX_ "\n");
7968 SetProgLen(RExC_rxi,RExC_size);
7971 DEBUG_DUMP_PRE_OPTIMIZE_r({
7972 SV * const sv = sv_newmortal();
7973 RXi_GET_DECL(RExC_rx, ri);
7975 Perl_re_printf( aTHX_ "Program before optimization:\n");
7977 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7982 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7985 /* XXXX To minimize changes to RE engine we always allocate
7986 3-units-long substrs field. */
7987 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7988 if (RExC_recurse_count) {
7989 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7990 SAVEFREEPV(RExC_recurse);
7993 if (RExC_seen & REG_RECURSE_SEEN) {
7994 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7995 * So its 1 if there are no parens. */
7996 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7997 ((RExC_total_parens & 0x07) != 0);
7998 Newx(RExC_study_chunk_recursed,
7999 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8000 SAVEFREEPV(RExC_study_chunk_recursed);
8004 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8006 RExC_study_chunk_recursed_count= 0;
8008 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8009 if (RExC_study_chunk_recursed) {
8010 Zero(RExC_study_chunk_recursed,
8011 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8015 #ifdef TRIE_STUDY_OPT
8017 StructCopy(&zero_scan_data, &data, scan_data_t);
8018 copyRExC_state = RExC_state;
8021 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8023 RExC_state = copyRExC_state;
8024 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8025 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8027 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8028 StructCopy(&zero_scan_data, &data, scan_data_t);
8031 StructCopy(&zero_scan_data, &data, scan_data_t);
8034 /* Dig out information for optimizations. */
8035 RExC_rx->extflags = RExC_flags; /* was pm_op */
8036 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8039 SvUTF8_on(Rx); /* Unicode in it? */
8040 RExC_rxi->regstclass = NULL;
8041 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8042 RExC_rx->intflags |= PREGf_NAUGHTY;
8043 scan = RExC_rxi->program + 1; /* First BRANCH. */
8045 /* testing for BRANCH here tells us whether there is "must appear"
8046 data in the pattern. If there is then we can use it for optimisations */
8047 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8050 STRLEN longest_length[2];
8051 regnode_ssc ch_class; /* pointed to by data */
8053 SSize_t last_close = 0; /* pointed to by data */
8054 regnode *first= scan;
8055 regnode *first_next= regnext(first);
8059 * Skip introductions and multiplicators >= 1
8060 * so that we can extract the 'meat' of the pattern that must
8061 * match in the large if() sequence following.
8062 * NOTE that EXACT is NOT covered here, as it is normally
8063 * picked up by the optimiser separately.
8065 * This is unfortunate as the optimiser isnt handling lookahead
8066 * properly currently.
8069 while ((OP(first) == OPEN && (sawopen = 1)) ||
8070 /* An OR of *one* alternative - should not happen now. */
8071 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8072 /* for now we can't handle lookbehind IFMATCH*/
8073 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8074 (OP(first) == PLUS) ||
8075 (OP(first) == MINMOD) ||
8076 /* An {n,m} with n>0 */
8077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8078 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8081 * the only op that could be a regnode is PLUS, all the rest
8082 * will be regnode_1 or regnode_2.
8084 * (yves doesn't think this is true)
8086 if (OP(first) == PLUS)
8089 if (OP(first) == MINMOD)
8091 first += regarglen[OP(first)];
8093 first = NEXTOPER(first);
8094 first_next= regnext(first);
8097 /* Starting-point info. */
8099 DEBUG_PEEP("first:", first, 0, 0);
8100 /* Ignore EXACT as we deal with it later. */
8101 if (PL_regkind[OP(first)] == EXACT) {
8102 if ( OP(first) == EXACT
8103 || OP(first) == LEXACT
8104 || OP(first) == EXACT_REQ8
8105 || OP(first) == LEXACT_REQ8
8106 || OP(first) == EXACTL)
8108 NOOP; /* Empty, get anchored substr later. */
8111 RExC_rxi->regstclass = first;
8114 else if (PL_regkind[OP(first)] == TRIE &&
8115 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8117 /* this can happen only on restudy */
8118 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8121 else if (REGNODE_SIMPLE(OP(first)))
8122 RExC_rxi->regstclass = first;
8123 else if (PL_regkind[OP(first)] == BOUND ||
8124 PL_regkind[OP(first)] == NBOUND)
8125 RExC_rxi->regstclass = first;
8126 else if (PL_regkind[OP(first)] == BOL) {
8127 RExC_rx->intflags |= (OP(first) == MBOL
8130 first = NEXTOPER(first);
8133 else if (OP(first) == GPOS) {
8134 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8135 first = NEXTOPER(first);
8138 else if ((!sawopen || !RExC_sawback) &&
8140 (OP(first) == STAR &&
8141 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8142 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8144 /* turn .* into ^.* with an implied $*=1 */
8146 (OP(NEXTOPER(first)) == REG_ANY)
8149 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8150 first = NEXTOPER(first);
8153 if (sawplus && !sawminmod && !sawlookahead
8154 && (!sawopen || !RExC_sawback)
8155 && !pRExC_state->code_blocks) /* May examine pos and $& */
8156 /* x+ must match at the 1st pos of run of x's */
8157 RExC_rx->intflags |= PREGf_SKIP;
8159 /* Scan is after the zeroth branch, first is atomic matcher. */
8160 #ifdef TRIE_STUDY_OPT
8163 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8164 (IV)(first - scan + 1))
8168 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8169 (IV)(first - scan + 1))
8175 * If there's something expensive in the r.e., find the
8176 * longest literal string that must appear and make it the
8177 * regmust. Resolve ties in favor of later strings, since
8178 * the regstart check works with the beginning of the r.e.
8179 * and avoiding duplication strengthens checking. Not a
8180 * strong reason, but sufficient in the absence of others.
8181 * [Now we resolve ties in favor of the earlier string if
8182 * it happens that c_offset_min has been invalidated, since the
8183 * earlier string may buy us something the later one won't.]
8186 data.substrs[0].str = newSVpvs("");
8187 data.substrs[1].str = newSVpvs("");
8188 data.last_found = newSVpvs("");
8189 data.cur_is_floating = 0; /* initially any found substring is fixed */
8190 ENTER_with_name("study_chunk");
8191 SAVEFREESV(data.substrs[0].str);
8192 SAVEFREESV(data.substrs[1].str);
8193 SAVEFREESV(data.last_found);
8195 if (!RExC_rxi->regstclass) {
8196 ssc_init(pRExC_state, &ch_class);
8197 data.start_class = &ch_class;
8198 stclass_flag = SCF_DO_STCLASS_AND;
8199 } else /* XXXX Check for BOUND? */
8201 data.last_closep = &last_close;
8205 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8206 * (NO top level branches)
8208 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8209 scan + RExC_size, /* Up to end */
8211 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8212 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8216 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8219 if ( RExC_total_parens == 1 && !data.cur_is_floating
8220 && data.last_start_min == 0 && data.last_end > 0
8221 && !RExC_seen_zerolen
8222 && !(RExC_seen & REG_VERBARG_SEEN)
8223 && !(RExC_seen & REG_GPOS_SEEN)
8225 RExC_rx->extflags |= RXf_CHECK_ALL;
8227 scan_commit(pRExC_state, &data,&minlen, 0);
8230 /* XXX this is done in reverse order because that's the way the
8231 * code was before it was parameterised. Don't know whether it
8232 * actually needs doing in reverse order. DAPM */
8233 for (i = 1; i >= 0; i--) {
8234 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8237 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8238 && data.substrs[0].min_offset
8239 == data.substrs[1].min_offset
8240 && SvCUR(data.substrs[0].str)
8241 == SvCUR(data.substrs[1].str)
8243 && S_setup_longest (aTHX_ pRExC_state,
8244 &(RExC_rx->substrs->data[i]),
8248 RExC_rx->substrs->data[i].min_offset =
8249 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8251 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8252 /* Don't offset infinity */
8253 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8254 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8255 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8258 RExC_rx->substrs->data[i].substr = NULL;
8259 RExC_rx->substrs->data[i].utf8_substr = NULL;
8260 longest_length[i] = 0;
8264 LEAVE_with_name("study_chunk");
8266 if (RExC_rxi->regstclass
8267 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8268 RExC_rxi->regstclass = NULL;
8270 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8271 || RExC_rx->substrs->data[0].min_offset)
8273 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8274 && is_ssc_worth_it(pRExC_state, data.start_class))
8276 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8278 ssc_finalize(pRExC_state, data.start_class);
8280 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8281 StructCopy(data.start_class,
8282 (regnode_ssc*)RExC_rxi->data->data[n],
8284 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8285 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8286 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8287 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8288 Perl_re_printf( aTHX_
8289 "synthetic stclass \"%s\".\n",
8290 SvPVX_const(sv));});
8291 data.start_class = NULL;
8294 /* A temporary algorithm prefers floated substr to fixed one of
8295 * same length to dig more info. */
8296 i = (longest_length[0] <= longest_length[1]);
8297 RExC_rx->substrs->check_ix = i;
8298 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8299 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8300 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8301 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8302 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8303 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8304 RExC_rx->intflags |= PREGf_NOSCAN;
8306 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8307 RExC_rx->extflags |= RXf_USE_INTUIT;
8308 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8309 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8312 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8313 if ( (STRLEN)minlen < longest_length[1] )
8314 minlen= longest_length[1];
8315 if ( (STRLEN)minlen < longest_length[0] )
8316 minlen= longest_length[0];
8320 /* Several toplevels. Best we can is to set minlen. */
8322 regnode_ssc ch_class;
8323 SSize_t last_close = 0;
8325 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8327 scan = RExC_rxi->program + 1;
8328 ssc_init(pRExC_state, &ch_class);
8329 data.start_class = &ch_class;
8330 data.last_closep = &last_close;
8334 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8335 * (patterns WITH top level branches)
8337 minlen = study_chunk(pRExC_state,
8338 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8339 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8340 ? SCF_TRIE_DOING_RESTUDY
8344 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8346 RExC_rx->check_substr = NULL;
8347 RExC_rx->check_utf8 = NULL;
8348 RExC_rx->substrs->data[0].substr = NULL;
8349 RExC_rx->substrs->data[0].utf8_substr = NULL;
8350 RExC_rx->substrs->data[1].substr = NULL;
8351 RExC_rx->substrs->data[1].utf8_substr = NULL;
8353 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8354 && is_ssc_worth_it(pRExC_state, data.start_class))
8356 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8358 ssc_finalize(pRExC_state, data.start_class);
8360 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8361 StructCopy(data.start_class,
8362 (regnode_ssc*)RExC_rxi->data->data[n],
8364 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8365 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8366 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8367 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8368 Perl_re_printf( aTHX_
8369 "synthetic stclass \"%s\".\n",
8370 SvPVX_const(sv));});
8371 data.start_class = NULL;
8375 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8376 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8377 RExC_rx->maxlen = REG_INFTY;
8380 RExC_rx->maxlen = RExC_maxlen;
8383 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8384 the "real" pattern. */
8386 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8387 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8389 RExC_rx->minlenret = minlen;
8390 if (RExC_rx->minlen < minlen)
8391 RExC_rx->minlen = minlen;
8393 if (RExC_seen & REG_RECURSE_SEEN ) {
8394 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8395 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8397 if (RExC_seen & REG_GPOS_SEEN)
8398 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8399 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8400 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8402 if (pRExC_state->code_blocks)
8403 RExC_rx->extflags |= RXf_EVAL_SEEN;
8404 if (RExC_seen & REG_VERBARG_SEEN)
8406 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8407 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8409 if (RExC_seen & REG_CUTGROUP_SEEN)
8410 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8411 if (pm_flags & PMf_USE_RE_EVAL)
8412 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8413 if (RExC_paren_names)
8414 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8416 RXp_PAREN_NAMES(RExC_rx) = NULL;
8418 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8419 * so it can be used in pp.c */
8420 if (RExC_rx->intflags & PREGf_ANCH)
8421 RExC_rx->extflags |= RXf_IS_ANCHORED;
8425 /* this is used to identify "special" patterns that might result
8426 * in Perl NOT calling the regex engine and instead doing the match "itself",
8427 * particularly special cases in split//. By having the regex compiler
8428 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8429 * we avoid weird issues with equivalent patterns resulting in different behavior,
8430 * AND we allow non Perl engines to get the same optimizations by the setting the
8431 * flags appropriately - Yves */
8432 regnode *first = RExC_rxi->program + 1;
8434 regnode *next = regnext(first);
8437 if (PL_regkind[fop] == NOTHING && nop == END)
8438 RExC_rx->extflags |= RXf_NULL;
8439 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8440 /* when fop is SBOL first->flags will be true only when it was
8441 * produced by parsing /\A/, and not when parsing /^/. This is
8442 * very important for the split code as there we want to
8443 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8444 * See rt #122761 for more details. -- Yves */
8445 RExC_rx->extflags |= RXf_START_ONLY;
8446 else if (fop == PLUS
8447 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8449 RExC_rx->extflags |= RXf_WHITE;
8450 else if ( RExC_rx->extflags & RXf_SPLIT
8451 && ( fop == EXACT || fop == LEXACT
8452 || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8454 && STR_LEN(first) == 1
8455 && *(STRING(first)) == ' '
8457 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8461 if (RExC_contains_locale) {
8462 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8466 if (RExC_paren_names) {
8467 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8468 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8469 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8472 RExC_rxi->name_list_idx = 0;
8474 while ( RExC_recurse_count > 0 ) {
8475 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8477 * This data structure is set up in study_chunk() and is used
8478 * to calculate the distance between a GOSUB regopcode and
8479 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8482 * If for some reason someone writes code that optimises
8483 * away a GOSUB opcode then the assert should be changed to
8484 * an if(scan) to guard the ARG2L_SET() - Yves
8487 assert(scan && OP(scan) == GOSUB);
8488 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8491 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8492 /* assume we don't need to swap parens around before we match */
8494 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8495 (unsigned long)RExC_study_chunk_recursed_count);
8499 Perl_re_printf( aTHX_ "Final program:\n");
8503 if (RExC_open_parens) {
8504 Safefree(RExC_open_parens);
8505 RExC_open_parens = NULL;
8507 if (RExC_close_parens) {
8508 Safefree(RExC_close_parens);
8509 RExC_close_parens = NULL;
8513 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8514 * by setting the regexp SV to readonly-only instead. If the
8515 * pattern's been recompiled, the USEDness should remain. */
8516 if (old_re && SvREADONLY(old_re))
8524 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8527 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8529 PERL_UNUSED_ARG(value);
8531 if (flags & RXapif_FETCH) {
8532 return reg_named_buff_fetch(rx, key, flags);
8533 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8534 Perl_croak_no_modify();
8536 } else if (flags & RXapif_EXISTS) {
8537 return reg_named_buff_exists(rx, key, flags)
8540 } else if (flags & RXapif_REGNAMES) {
8541 return reg_named_buff_all(rx, flags);
8542 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8543 return reg_named_buff_scalar(rx, flags);
8545 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8551 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8554 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8555 PERL_UNUSED_ARG(lastkey);
8557 if (flags & RXapif_FIRSTKEY)
8558 return reg_named_buff_firstkey(rx, flags);
8559 else if (flags & RXapif_NEXTKEY)
8560 return reg_named_buff_nextkey(rx, flags);
8562 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8569 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8573 struct regexp *const rx = ReANY(r);
8575 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8577 if (rx && RXp_PAREN_NAMES(rx)) {
8578 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8581 SV* sv_dat=HeVAL(he_str);
8582 I32 *nums=(I32*)SvPVX(sv_dat);
8583 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8584 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8585 if ((I32)(rx->nparens) >= nums[i]
8586 && rx->offs[nums[i]].start != -1
8587 && rx->offs[nums[i]].end != -1)
8590 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8595 ret = newSVsv(&PL_sv_undef);
8598 av_push(retarray, ret);
8601 return newRV_noinc(MUTABLE_SV(retarray));
8608 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8611 struct regexp *const rx = ReANY(r);
8613 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8615 if (rx && RXp_PAREN_NAMES(rx)) {
8616 if (flags & RXapif_ALL) {
8617 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8619 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8621 SvREFCNT_dec_NN(sv);
8633 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8635 struct regexp *const rx = ReANY(r);
8637 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8639 if ( rx && RXp_PAREN_NAMES(rx) ) {
8640 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8642 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8649 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8651 struct regexp *const rx = ReANY(r);
8652 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8654 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8656 if (rx && RXp_PAREN_NAMES(rx)) {
8657 HV *hv = RXp_PAREN_NAMES(rx);
8659 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8662 SV* sv_dat = HeVAL(temphe);
8663 I32 *nums = (I32*)SvPVX(sv_dat);
8664 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8665 if ((I32)(rx->lastparen) >= nums[i] &&
8666 rx->offs[nums[i]].start != -1 &&
8667 rx->offs[nums[i]].end != -1)
8673 if (parno || flags & RXapif_ALL) {
8674 return newSVhek(HeKEY_hek(temphe));
8682 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8687 struct regexp *const rx = ReANY(r);
8689 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8691 if (rx && RXp_PAREN_NAMES(rx)) {
8692 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8693 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8694 } else if (flags & RXapif_ONE) {
8695 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8696 av = MUTABLE_AV(SvRV(ret));
8697 length = av_tindex(av);
8698 SvREFCNT_dec_NN(ret);
8699 return newSViv(length + 1);
8701 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8706 return &PL_sv_undef;
8710 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8712 struct regexp *const rx = ReANY(r);
8715 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8717 if (rx && RXp_PAREN_NAMES(rx)) {
8718 HV *hv= RXp_PAREN_NAMES(rx);
8720 (void)hv_iterinit(hv);
8721 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8724 SV* sv_dat = HeVAL(temphe);
8725 I32 *nums = (I32*)SvPVX(sv_dat);
8726 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8727 if ((I32)(rx->lastparen) >= nums[i] &&
8728 rx->offs[nums[i]].start != -1 &&
8729 rx->offs[nums[i]].end != -1)
8735 if (parno || flags & RXapif_ALL) {
8736 av_push(av, newSVhek(HeKEY_hek(temphe)));
8741 return newRV_noinc(MUTABLE_SV(av));
8745 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8748 struct regexp *const rx = ReANY(r);
8754 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8756 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8757 || n == RX_BUFF_IDX_CARET_FULLMATCH
8758 || n == RX_BUFF_IDX_CARET_POSTMATCH
8761 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8763 /* on something like
8766 * the KEEPCOPY is set on the PMOP rather than the regex */
8767 if (PL_curpm && r == PM_GETRE(PL_curpm))
8768 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8777 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8778 /* no need to distinguish between them any more */
8779 n = RX_BUFF_IDX_FULLMATCH;
8781 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8782 && rx->offs[0].start != -1)
8784 /* $`, ${^PREMATCH} */
8785 i = rx->offs[0].start;
8789 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8790 && rx->offs[0].end != -1)
8792 /* $', ${^POSTMATCH} */
8793 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8794 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8797 if (inRANGE(n, 0, (I32)rx->nparens) &&
8798 (s1 = rx->offs[n].start) != -1 &&
8799 (t1 = rx->offs[n].end) != -1)
8801 /* $&, ${^MATCH}, $1 ... */
8803 s = rx->subbeg + s1 - rx->suboffset;
8808 assert(s >= rx->subbeg);
8809 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8811 #ifdef NO_TAINT_SUPPORT
8812 sv_setpvn(sv, s, i);
8814 const int oldtainted = TAINT_get;
8816 sv_setpvn(sv, s, i);
8817 TAINT_set(oldtainted);
8819 if (RXp_MATCH_UTF8(rx))
8824 if (RXp_MATCH_TAINTED(rx)) {
8825 if (SvTYPE(sv) >= SVt_PVMG) {
8826 MAGIC* const mg = SvMAGIC(sv);
8829 SvMAGIC_set(sv, mg->mg_moremagic);
8831 if ((mgt = SvMAGIC(sv))) {
8832 mg->mg_moremagic = mgt;
8833 SvMAGIC_set(sv, mg);
8850 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8851 SV const * const value)
8853 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8855 PERL_UNUSED_ARG(rx);
8856 PERL_UNUSED_ARG(paren);
8857 PERL_UNUSED_ARG(value);
8860 Perl_croak_no_modify();
8864 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8867 struct regexp *const rx = ReANY(r);
8871 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8873 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8874 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8875 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8878 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8880 /* on something like
8883 * the KEEPCOPY is set on the PMOP rather than the regex */
8884 if (PL_curpm && r == PM_GETRE(PL_curpm))
8885 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8891 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8893 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8894 case RX_BUFF_IDX_PREMATCH: /* $` */
8895 if (rx->offs[0].start != -1) {
8896 i = rx->offs[0].start;
8905 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8906 case RX_BUFF_IDX_POSTMATCH: /* $' */
8907 if (rx->offs[0].end != -1) {
8908 i = rx->sublen - rx->offs[0].end;
8910 s1 = rx->offs[0].end;
8917 default: /* $& / ${^MATCH}, $1, $2, ... */
8918 if (paren <= (I32)rx->nparens &&
8919 (s1 = rx->offs[paren].start) != -1 &&
8920 (t1 = rx->offs[paren].end) != -1)
8926 if (ckWARN(WARN_UNINITIALIZED))
8927 report_uninit((const SV *)sv);
8932 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8933 const char * const s = rx->subbeg - rx->suboffset + s1;
8938 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8945 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8947 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8948 PERL_UNUSED_ARG(rx);
8952 return newSVpvs("Regexp");
8955 /* Scans the name of a named buffer from the pattern.
8956 * If flags is REG_RSN_RETURN_NULL returns null.
8957 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8958 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8959 * to the parsed name as looked up in the RExC_paren_names hash.
8960 * If there is an error throws a vFAIL().. type exception.
8963 #define REG_RSN_RETURN_NULL 0
8964 #define REG_RSN_RETURN_NAME 1
8965 #define REG_RSN_RETURN_DATA 2
8968 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8970 char *name_start = RExC_parse;
8973 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8975 assert (RExC_parse <= RExC_end);
8976 if (RExC_parse == RExC_end) NOOP;
8977 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8978 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8979 * using do...while */
8982 RExC_parse += UTF8SKIP(RExC_parse);
8983 } while ( RExC_parse < RExC_end
8984 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8988 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8990 RExC_parse++; /* so the <- from the vFAIL is after the offending
8992 vFAIL("Group name must start with a non-digit word character");
8994 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8995 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8996 if ( flags == REG_RSN_RETURN_NAME)
8998 else if (flags==REG_RSN_RETURN_DATA) {
9001 if ( ! sv_name ) /* should not happen*/
9002 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9003 if (RExC_paren_names)
9004 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9006 sv_dat = HeVAL(he_str);
9007 if ( ! sv_dat ) { /* Didn't find group */
9009 /* It might be a forward reference; we can't fail until we
9010 * know, by completing the parse to get all the groups, and
9012 if (ALL_PARENS_COUNTED) {
9013 vFAIL("Reference to nonexistent named group");
9016 REQUIRE_PARENS_PASS;
9022 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9023 (unsigned long) flags);
9026 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9027 if (RExC_lastparse!=RExC_parse) { \
9028 Perl_re_printf( aTHX_ "%s", \
9029 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9030 RExC_end - RExC_parse, 16, \
9032 PERL_PV_ESCAPE_UNI_DETECT | \
9033 PERL_PV_PRETTY_ELLIPSES | \
9034 PERL_PV_PRETTY_LTGT | \
9035 PERL_PV_ESCAPE_RE | \
9036 PERL_PV_PRETTY_EXACTSIZE \
9040 Perl_re_printf( aTHX_ "%16s",""); \
9042 if (RExC_lastnum!=RExC_emit) \
9043 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9045 Perl_re_printf( aTHX_ "|%4s",""); \
9046 Perl_re_printf( aTHX_ "|%*s%-4s", \
9047 (int)((depth*2)), "", \
9050 RExC_lastnum=RExC_emit; \
9051 RExC_lastparse=RExC_parse; \
9056 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9057 DEBUG_PARSE_MSG((funcname)); \
9058 Perl_re_printf( aTHX_ "%4s","\n"); \
9060 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9061 DEBUG_PARSE_MSG((funcname)); \
9062 Perl_re_printf( aTHX_ fmt "\n",args); \
9065 /* This section of code defines the inversion list object and its methods. The
9066 * interfaces are highly subject to change, so as much as possible is static to
9067 * this file. An inversion list is here implemented as a malloc'd C UV array
9068 * as an SVt_INVLIST scalar.
9070 * An inversion list for Unicode is an array of code points, sorted by ordinal
9071 * number. Each element gives the code point that begins a range that extends
9072 * up-to but not including the code point given by the next element. The final
9073 * element gives the first code point of a range that extends to the platform's
9074 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9075 * ...) give ranges whose code points are all in the inversion list. We say
9076 * that those ranges are in the set. The odd-numbered elements give ranges
9077 * whose code points are not in the inversion list, and hence not in the set.
9078 * Thus, element [0] is the first code point in the list. Element [1]
9079 * is the first code point beyond that not in the list; and element [2] is the
9080 * first code point beyond that that is in the list. In other words, the first
9081 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9082 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9083 * all code points in that range are not in the inversion list. The third
9084 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9085 * list, and so forth. Thus every element whose index is divisible by two
9086 * gives the beginning of a range that is in the list, and every element whose
9087 * index is not divisible by two gives the beginning of a range not in the
9088 * list. If the final element's index is divisible by two, the inversion list
9089 * extends to the platform's infinity; otherwise the highest code point in the
9090 * inversion list is the contents of that element minus 1.
9092 * A range that contains just a single code point N will look like
9094 * invlist[i+1] == N+1
9096 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9097 * impossible to represent, so element [i+1] is omitted. The single element
9099 * invlist[0] == UV_MAX
9100 * contains just UV_MAX, but is interpreted as matching to infinity.
9102 * Taking the complement (inverting) an inversion list is quite simple, if the
9103 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9104 * This implementation reserves an element at the beginning of each inversion
9105 * list to always contain 0; there is an additional flag in the header which
9106 * indicates if the list begins at the 0, or is offset to begin at the next
9107 * element. This means that the inversion list can be inverted without any
9108 * copying; just flip the flag.
9110 * More about inversion lists can be found in "Unicode Demystified"
9111 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9113 * The inversion list data structure is currently implemented as an SV pointing
9114 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9115 * array of UV whose memory management is automatically handled by the existing
9116 * facilities for SV's.
9118 * Some of the methods should always be private to the implementation, and some
9119 * should eventually be made public */
9121 /* The header definitions are in F<invlist_inline.h> */
9123 #ifndef PERL_IN_XSUB_RE
9125 PERL_STATIC_INLINE UV*
9126 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9128 /* Returns a pointer to the first element in the inversion list's array.
9129 * This is called upon initialization of an inversion list. Where the
9130 * array begins depends on whether the list has the code point U+0000 in it
9131 * or not. The other parameter tells it whether the code that follows this
9132 * call is about to put a 0 in the inversion list or not. The first
9133 * element is either the element reserved for 0, if TRUE, or the element
9134 * after it, if FALSE */
9136 bool* offset = get_invlist_offset_addr(invlist);
9137 UV* zero_addr = (UV *) SvPVX(invlist);
9139 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9142 assert(! _invlist_len(invlist));
9146 /* 1^1 = 0; 1^0 = 1 */
9147 *offset = 1 ^ will_have_0;
9148 return zero_addr + *offset;
9152 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9154 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9155 * steals the list from 'src', so 'src' is made to have a NULL list. This
9156 * is similar to what SvSetMagicSV() would do, if it were implemented on
9157 * inversion lists, though this routine avoids a copy */
9159 const UV src_len = _invlist_len(src);
9160 const bool src_offset = *get_invlist_offset_addr(src);
9161 const STRLEN src_byte_len = SvLEN(src);
9162 char * array = SvPVX(src);
9164 const int oldtainted = TAINT_get;
9166 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9168 assert(is_invlist(src));
9169 assert(is_invlist(dest));
9170 assert(! invlist_is_iterating(src));
9171 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9173 /* Make sure it ends in the right place with a NUL, as our inversion list
9174 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9176 array[src_byte_len - 1] = '\0';
9178 TAINT_NOT; /* Otherwise it breaks */
9179 sv_usepvn_flags(dest,
9183 /* This flag is documented to cause a copy to be avoided */
9184 SV_HAS_TRAILING_NUL);
9185 TAINT_set(oldtainted);
9190 /* Finish up copying over the other fields in an inversion list */
9191 *get_invlist_offset_addr(dest) = src_offset;
9192 invlist_set_len(dest, src_len, src_offset);
9193 *get_invlist_previous_index_addr(dest) = 0;
9194 invlist_iterfinish(dest);
9197 PERL_STATIC_INLINE IV*
9198 S_get_invlist_previous_index_addr(SV* invlist)
9200 /* Return the address of the IV that is reserved to hold the cached index
9202 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9204 assert(is_invlist(invlist));
9206 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9209 PERL_STATIC_INLINE IV
9210 S_invlist_previous_index(SV* const invlist)
9212 /* Returns cached index of previous search */
9214 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9216 return *get_invlist_previous_index_addr(invlist);
9219 PERL_STATIC_INLINE void
9220 S_invlist_set_previous_index(SV* const invlist, const IV index)
9222 /* Caches <index> for later retrieval */
9224 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9226 assert(index == 0 || index < (int) _invlist_len(invlist));
9228 *get_invlist_previous_index_addr(invlist) = index;
9231 PERL_STATIC_INLINE void
9232 S_invlist_trim(SV* invlist)
9234 /* Free the not currently-being-used space in an inversion list */
9236 /* But don't free up the space needed for the 0 UV that is always at the
9237 * beginning of the list, nor the trailing NUL */
9238 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9240 PERL_ARGS_ASSERT_INVLIST_TRIM;
9242 assert(is_invlist(invlist));
9244 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9247 PERL_STATIC_INLINE void
9248 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9250 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9252 assert(is_invlist(invlist));
9254 invlist_set_len(invlist, 0, 0);
9255 invlist_trim(invlist);
9258 #endif /* ifndef PERL_IN_XSUB_RE */
9260 PERL_STATIC_INLINE bool
9261 S_invlist_is_iterating(SV* const invlist)
9263 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9265 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9268 #ifndef PERL_IN_XSUB_RE
9270 PERL_STATIC_INLINE UV
9271 S_invlist_max(SV* const invlist)
9273 /* Returns the maximum number of elements storable in the inversion list's
9274 * array, without having to realloc() */
9276 PERL_ARGS_ASSERT_INVLIST_MAX;
9278 assert(is_invlist(invlist));
9280 /* Assumes worst case, in which the 0 element is not counted in the
9281 * inversion list, so subtracts 1 for that */
9282 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9283 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9284 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9288 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9290 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9292 /* First 1 is in case the zero element isn't in the list; second 1 is for
9294 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9295 invlist_set_len(invlist, 0, 0);
9297 /* Force iterinit() to be used to get iteration to work */
9298 invlist_iterfinish(invlist);
9300 *get_invlist_previous_index_addr(invlist) = 0;
9301 SvPOK_on(invlist); /* This allows B to extract the PV */
9305 Perl__new_invlist(pTHX_ IV initial_size)
9308 /* Return a pointer to a newly constructed inversion list, with enough
9309 * space to store 'initial_size' elements. If that number is negative, a
9310 * system default is used instead */
9314 if (initial_size < 0) {
9318 new_list = newSV_type(SVt_INVLIST);
9319 initialize_invlist_guts(new_list, initial_size);
9325 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9327 /* Return a pointer to a newly constructed inversion list, initialized to
9328 * point to <list>, which has to be in the exact correct inversion list
9329 * form, including internal fields. Thus this is a dangerous routine that
9330 * should not be used in the wrong hands. The passed in 'list' contains
9331 * several header fields at the beginning that are not part of the
9332 * inversion list body proper */
9334 const STRLEN length = (STRLEN) list[0];
9335 const UV version_id = list[1];
9336 const bool offset = cBOOL(list[2]);
9337 #define HEADER_LENGTH 3
9338 /* If any of the above changes in any way, you must change HEADER_LENGTH
9339 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9340 * perl -E 'say int(rand 2**31-1)'
9342 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9343 data structure type, so that one being
9344 passed in can be validated to be an
9345 inversion list of the correct vintage.
9348 SV* invlist = newSV_type(SVt_INVLIST);
9350 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9352 if (version_id != INVLIST_VERSION_ID) {
9353 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9356 /* The generated array passed in includes header elements that aren't part
9357 * of the list proper, so start it just after them */
9358 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9360 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9361 shouldn't touch it */
9363 *(get_invlist_offset_addr(invlist)) = offset;
9365 /* The 'length' passed to us is the physical number of elements in the
9366 * inversion list. But if there is an offset the logical number is one
9368 invlist_set_len(invlist, length - offset, offset);
9370 invlist_set_previous_index(invlist, 0);
9372 /* Initialize the iteration pointer. */
9373 invlist_iterfinish(invlist);
9375 SvREADONLY_on(invlist);
9382 S__append_range_to_invlist(pTHX_ SV* const invlist,
9383 const UV start, const UV end)
9385 /* Subject to change or removal. Append the range from 'start' to 'end' at
9386 * the end of the inversion list. The range must be above any existing
9390 UV max = invlist_max(invlist);
9391 UV len = _invlist_len(invlist);
9394 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9396 if (len == 0) { /* Empty lists must be initialized */
9397 offset = start != 0;
9398 array = _invlist_array_init(invlist, ! offset);
9401 /* Here, the existing list is non-empty. The current max entry in the
9402 * list is generally the first value not in the set, except when the
9403 * set extends to the end of permissible values, in which case it is
9404 * the first entry in that final set, and so this call is an attempt to
9405 * append out-of-order */
9407 UV final_element = len - 1;
9408 array = invlist_array(invlist);
9409 if ( array[final_element] > start
9410 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9412 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9413 array[final_element], start,
9414 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9417 /* Here, it is a legal append. If the new range begins 1 above the end
9418 * of the range below it, it is extending the range below it, so the
9419 * new first value not in the set is one greater than the newly
9420 * extended range. */
9421 offset = *get_invlist_offset_addr(invlist);
9422 if (array[final_element] == start) {
9423 if (end != UV_MAX) {
9424 array[final_element] = end + 1;
9427 /* But if the end is the maximum representable on the machine,
9428 * assume that infinity was actually what was meant. Just let
9429 * the range that this would extend to have no end */
9430 invlist_set_len(invlist, len - 1, offset);
9436 /* Here the new range doesn't extend any existing set. Add it */
9438 len += 2; /* Includes an element each for the start and end of range */
9440 /* If wll overflow the existing space, extend, which may cause the array to
9443 invlist_extend(invlist, len);
9445 /* Have to set len here to avoid assert failure in invlist_array() */
9446 invlist_set_len(invlist, len, offset);
9448 array = invlist_array(invlist);
9451 invlist_set_len(invlist, len, offset);
9454 /* The next item on the list starts the range, the one after that is
9455 * one past the new range. */
9456 array[len - 2] = start;
9457 if (end != UV_MAX) {
9458 array[len - 1] = end + 1;
9461 /* But if the end is the maximum representable on the machine, just let
9462 * the range have no end */
9463 invlist_set_len(invlist, len - 1, offset);
9468 Perl__invlist_search(SV* const invlist, const UV cp)
9470 /* Searches the inversion list for the entry that contains the input code
9471 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9472 * return value is the index into the list's array of the range that
9473 * contains <cp>, that is, 'i' such that
9474 * array[i] <= cp < array[i+1]
9479 IV high = _invlist_len(invlist);
9480 const IV highest_element = high - 1;
9483 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9485 /* If list is empty, return failure. */
9490 /* (We can't get the array unless we know the list is non-empty) */
9491 array = invlist_array(invlist);
9493 mid = invlist_previous_index(invlist);
9495 if (mid > highest_element) {
9496 mid = highest_element;
9499 /* <mid> contains the cache of the result of the previous call to this
9500 * function (0 the first time). See if this call is for the same result,
9501 * or if it is for mid-1. This is under the theory that calls to this
9502 * function will often be for related code points that are near each other.
9503 * And benchmarks show that caching gives better results. We also test
9504 * here if the code point is within the bounds of the list. These tests
9505 * replace others that would have had to be made anyway to make sure that
9506 * the array bounds were not exceeded, and these give us extra information
9507 * at the same time */
9508 if (cp >= array[mid]) {
9509 if (cp >= array[highest_element]) {
9510 return highest_element;
9513 /* Here, array[mid] <= cp < array[highest_element]. This means that
9514 * the final element is not the answer, so can exclude it; it also
9515 * means that <mid> is not the final element, so can refer to 'mid + 1'
9517 if (cp < array[mid + 1]) {
9523 else { /* cp < aray[mid] */
9524 if (cp < array[0]) { /* Fail if outside the array */
9528 if (cp >= array[mid - 1]) {
9533 /* Binary search. What we are looking for is <i> such that
9534 * array[i] <= cp < array[i+1]
9535 * The loop below converges on the i+1. Note that there may not be an
9536 * (i+1)th element in the array, and things work nonetheless */
9537 while (low < high) {
9538 mid = (low + high) / 2;
9539 assert(mid <= highest_element);
9540 if (array[mid] <= cp) { /* cp >= array[mid] */
9543 /* We could do this extra test to exit the loop early.
9544 if (cp < array[low]) {
9549 else { /* cp < array[mid] */
9556 invlist_set_previous_index(invlist, high);
9561 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9562 const bool complement_b, SV** output)
9564 /* Take the union of two inversion lists and point '*output' to it. On
9565 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9566 * even 'a' or 'b'). If to an inversion list, the contents of the original
9567 * list will be replaced by the union. The first list, 'a', may be
9568 * NULL, in which case a copy of the second list is placed in '*output'.
9569 * If 'complement_b' is TRUE, the union is taken of the complement
9570 * (inversion) of 'b' instead of b itself.
9572 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9573 * Richard Gillam, published by Addison-Wesley, and explained at some
9574 * length there. The preface says to incorporate its examples into your
9575 * code at your own risk.
9577 * The algorithm is like a merge sort. */
9579 const UV* array_a; /* a's array */
9581 UV len_a; /* length of a's array */
9584 SV* u; /* the resulting union */
9588 UV i_a = 0; /* current index into a's array */
9592 /* running count, as explained in the algorithm source book; items are
9593 * stopped accumulating and are output when the count changes to/from 0.
9594 * The count is incremented when we start a range that's in an input's set,
9595 * and decremented when we start a range that's not in a set. So this
9596 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9597 * and hence nothing goes into the union; 1, just one of the inputs is in
9598 * its set (and its current range gets added to the union); and 2 when both
9599 * inputs are in their sets. */
9602 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9604 assert(*output == NULL || is_invlist(*output));
9606 len_b = _invlist_len(b);
9609 /* Here, 'b' is empty, hence it's complement is all possible code
9610 * points. So if the union includes the complement of 'b', it includes
9611 * everything, and we need not even look at 'a'. It's easiest to
9612 * create a new inversion list that matches everything. */
9614 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9616 if (*output == NULL) { /* If the output didn't exist, just point it
9618 *output = everything;
9620 else { /* Otherwise, replace its contents with the new list */
9621 invlist_replace_list_destroys_src(*output, everything);
9622 SvREFCNT_dec_NN(everything);
9628 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9629 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9630 * output will be empty */
9632 if (a == NULL || _invlist_len(a) == 0) {
9633 if (*output == NULL) {
9634 *output = _new_invlist(0);
9637 invlist_clear(*output);
9642 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9643 * union. We can just return a copy of 'a' if '*output' doesn't point
9644 * to an existing list */
9645 if (*output == NULL) {
9646 *output = invlist_clone(a, NULL);
9650 /* If the output is to overwrite 'a', we have a no-op, as it's
9656 /* Here, '*output' is to be overwritten by 'a' */
9657 u = invlist_clone(a, NULL);
9658 invlist_replace_list_destroys_src(*output, u);
9664 /* Here 'b' is not empty. See about 'a' */
9666 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9668 /* Here, 'a' is empty (and b is not). That means the union will come
9669 * entirely from 'b'. If '*output' is NULL, we can directly return a
9670 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9673 SV ** dest = (*output == NULL) ? output : &u;
9674 *dest = invlist_clone(b, NULL);
9676 _invlist_invert(*dest);
9680 invlist_replace_list_destroys_src(*output, u);
9687 /* Here both lists exist and are non-empty */
9688 array_a = invlist_array(a);
9689 array_b = invlist_array(b);
9691 /* If are to take the union of 'a' with the complement of b, set it
9692 * up so are looking at b's complement. */
9695 /* To complement, we invert: if the first element is 0, remove it. To
9696 * do this, we just pretend the array starts one later */
9697 if (array_b[0] == 0) {
9703 /* But if the first element is not zero, we pretend the list starts
9704 * at the 0 that is always stored immediately before the array. */
9710 /* Size the union for the worst case: that the sets are completely
9712 u = _new_invlist(len_a + len_b);
9714 /* Will contain U+0000 if either component does */
9715 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9716 || (len_b > 0 && array_b[0] == 0));
9718 /* Go through each input list item by item, stopping when have exhausted
9720 while (i_a < len_a && i_b < len_b) {
9721 UV cp; /* The element to potentially add to the union's array */
9722 bool cp_in_set; /* is it in the the input list's set or not */
9724 /* We need to take one or the other of the two inputs for the union.
9725 * Since we are merging two sorted lists, we take the smaller of the
9726 * next items. In case of a tie, we take first the one that is in its
9727 * set. If we first took the one not in its set, it would decrement
9728 * the count, possibly to 0 which would cause it to be output as ending
9729 * the range, and the next time through we would take the same number,
9730 * and output it again as beginning the next range. By doing it the
9731 * opposite way, there is no possibility that the count will be
9732 * momentarily decremented to 0, and thus the two adjoining ranges will
9733 * be seamlessly merged. (In a tie and both are in the set or both not
9734 * in the set, it doesn't matter which we take first.) */
9735 if ( array_a[i_a] < array_b[i_b]
9736 || ( array_a[i_a] == array_b[i_b]
9737 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9739 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9740 cp = array_a[i_a++];
9743 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9744 cp = array_b[i_b++];
9747 /* Here, have chosen which of the two inputs to look at. Only output
9748 * if the running count changes to/from 0, which marks the
9749 * beginning/end of a range that's in the set */
9752 array_u[i_u++] = cp;
9759 array_u[i_u++] = cp;
9765 /* The loop above increments the index into exactly one of the input lists
9766 * each iteration, and ends when either index gets to its list end. That
9767 * means the other index is lower than its end, and so something is
9768 * remaining in that one. We decrement 'count', as explained below, if
9769 * that list is in its set. (i_a and i_b each currently index the element
9770 * beyond the one we care about.) */
9771 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9772 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9777 /* Above we decremented 'count' if the list that had unexamined elements in
9778 * it was in its set. This has made it so that 'count' being non-zero
9779 * means there isn't anything left to output; and 'count' equal to 0 means
9780 * that what is left to output is precisely that which is left in the
9781 * non-exhausted input list.
9783 * To see why, note first that the exhausted input obviously has nothing
9784 * left to add to the union. If it was in its set at its end, that means
9785 * the set extends from here to the platform's infinity, and hence so does
9786 * the union and the non-exhausted set is irrelevant. The exhausted set
9787 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9788 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9789 * 'count' remains at 1. This is consistent with the decremented 'count'
9790 * != 0 meaning there's nothing left to add to the union.
9792 * But if the exhausted input wasn't in its set, it contributed 0 to
9793 * 'count', and the rest of the union will be whatever the other input is.
9794 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9795 * otherwise it gets decremented to 0. This is consistent with 'count'
9796 * == 0 meaning the remainder of the union is whatever is left in the
9797 * non-exhausted list. */
9802 IV copy_count = len_a - i_a;
9803 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9804 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9806 else { /* The non-exhausted input is b */
9807 copy_count = len_b - i_b;
9808 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9810 len_u = i_u + copy_count;
9813 /* Set the result to the final length, which can change the pointer to
9814 * array_u, so re-find it. (Note that it is unlikely that this will
9815 * change, as we are shrinking the space, not enlarging it) */
9816 if (len_u != _invlist_len(u)) {
9817 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9819 array_u = invlist_array(u);
9822 if (*output == NULL) { /* Simply return the new inversion list */
9826 /* Otherwise, overwrite the inversion list that was in '*output'. We
9827 * could instead free '*output', and then set it to 'u', but experience
9828 * has shown [perl #127392] that if the input is a mortal, we can get a
9829 * huge build-up of these during regex compilation before they get
9831 invlist_replace_list_destroys_src(*output, u);
9839 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9840 const bool complement_b, SV** i)
9842 /* Take the intersection of two inversion lists and point '*i' to it. On
9843 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9844 * even 'a' or 'b'). If to an inversion list, the contents of the original
9845 * list will be replaced by the intersection. The first list, 'a', may be
9846 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9847 * TRUE, the result will be the intersection of 'a' and the complement (or
9848 * inversion) of 'b' instead of 'b' directly.
9850 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9851 * Richard Gillam, published by Addison-Wesley, and explained at some
9852 * length there. The preface says to incorporate its examples into your
9853 * code at your own risk. In fact, it had bugs
9855 * The algorithm is like a merge sort, and is essentially the same as the
9859 const UV* array_a; /* a's array */
9861 UV len_a; /* length of a's array */
9864 SV* r; /* the resulting intersection */
9868 UV i_a = 0; /* current index into a's array */
9872 /* running count of how many of the two inputs are postitioned at ranges
9873 * that are in their sets. As explained in the algorithm source book,
9874 * items are stopped accumulating and are output when the count changes
9875 * to/from 2. The count is incremented when we start a range that's in an
9876 * input's set, and decremented when we start a range that's not in a set.
9877 * Only when it is 2 are we in the intersection. */
9880 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9882 assert(*i == NULL || is_invlist(*i));
9884 /* Special case if either one is empty */
9885 len_a = (a == NULL) ? 0 : _invlist_len(a);
9886 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9887 if (len_a != 0 && complement_b) {
9889 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9890 * must be empty. Here, also we are using 'b's complement, which
9891 * hence must be every possible code point. Thus the intersection
9894 if (*i == a) { /* No-op */
9899 *i = invlist_clone(a, NULL);
9903 r = invlist_clone(a, NULL);
9904 invlist_replace_list_destroys_src(*i, r);
9909 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9910 * intersection must be empty */
9912 *i = _new_invlist(0);
9920 /* Here both lists exist and are non-empty */
9921 array_a = invlist_array(a);
9922 array_b = invlist_array(b);
9924 /* If are to take the intersection of 'a' with the complement of b, set it
9925 * up so are looking at b's complement. */
9928 /* To complement, we invert: if the first element is 0, remove it. To
9929 * do this, we just pretend the array starts one later */
9930 if (array_b[0] == 0) {
9936 /* But if the first element is not zero, we pretend the list starts
9937 * at the 0 that is always stored immediately before the array. */
9943 /* Size the intersection for the worst case: that the intersection ends up
9944 * fragmenting everything to be completely disjoint */
9945 r= _new_invlist(len_a + len_b);
9947 /* Will contain U+0000 iff both components do */
9948 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9949 && len_b > 0 && array_b[0] == 0);
9951 /* Go through each list item by item, stopping when have exhausted one of
9953 while (i_a < len_a && i_b < len_b) {
9954 UV cp; /* The element to potentially add to the intersection's
9956 bool cp_in_set; /* Is it in the input list's set or not */
9958 /* We need to take one or the other of the two inputs for the
9959 * intersection. Since we are merging two sorted lists, we take the
9960 * smaller of the next items. In case of a tie, we take first the one
9961 * that is not in its set (a difference from the union algorithm). If
9962 * we first took the one in its set, it would increment the count,
9963 * possibly to 2 which would cause it to be output as starting a range
9964 * in the intersection, and the next time through we would take that
9965 * same number, and output it again as ending the set. By doing the
9966 * opposite of this, there is no possibility that the count will be
9967 * momentarily incremented to 2. (In a tie and both are in the set or
9968 * both not in the set, it doesn't matter which we take first.) */
9969 if ( array_a[i_a] < array_b[i_b]
9970 || ( array_a[i_a] == array_b[i_b]
9971 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9973 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9974 cp = array_a[i_a++];
9977 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9981 /* Here, have chosen which of the two inputs to look at. Only output
9982 * if the running count changes to/from 2, which marks the
9983 * beginning/end of a range that's in the intersection */
9987 array_r[i_r++] = cp;
9992 array_r[i_r++] = cp;
9999 /* The loop above increments the index into exactly one of the input lists
10000 * each iteration, and ends when either index gets to its list end. That
10001 * means the other index is lower than its end, and so something is
10002 * remaining in that one. We increment 'count', as explained below, if the
10003 * exhausted list was in its set. (i_a and i_b each currently index the
10004 * element beyond the one we care about.) */
10005 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10006 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10011 /* Above we incremented 'count' if the exhausted list was in its set. This
10012 * has made it so that 'count' being below 2 means there is nothing left to
10013 * output; otheriwse what's left to add to the intersection is precisely
10014 * that which is left in the non-exhausted input list.
10016 * To see why, note first that the exhausted input obviously has nothing
10017 * left to affect the intersection. If it was in its set at its end, that
10018 * means the set extends from here to the platform's infinity, and hence
10019 * anything in the non-exhausted's list will be in the intersection, and
10020 * anything not in it won't be. Hence, the rest of the intersection is
10021 * precisely what's in the non-exhausted list The exhausted set also
10022 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10023 * it means 'count' is now at least 2. This is consistent with the
10024 * incremented 'count' being >= 2 means to add the non-exhausted list to
10025 * the intersection.
10027 * But if the exhausted input wasn't in its set, it contributed 0 to
10028 * 'count', and the intersection can't include anything further; the
10029 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10030 * incremented. This is consistent with 'count' being < 2 meaning nothing
10031 * further to add to the intersection. */
10032 if (count < 2) { /* Nothing left to put in the intersection. */
10035 else { /* copy the non-exhausted list, unchanged. */
10036 IV copy_count = len_a - i_a;
10037 if (copy_count > 0) { /* a is the one with stuff left */
10038 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10040 else { /* b is the one with stuff left */
10041 copy_count = len_b - i_b;
10042 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10044 len_r = i_r + copy_count;
10047 /* Set the result to the final length, which can change the pointer to
10048 * array_r, so re-find it. (Note that it is unlikely that this will
10049 * change, as we are shrinking the space, not enlarging it) */
10050 if (len_r != _invlist_len(r)) {
10051 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10053 array_r = invlist_array(r);
10056 if (*i == NULL) { /* Simply return the calculated intersection */
10059 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10060 instead free '*i', and then set it to 'r', but experience has
10061 shown [perl #127392] that if the input is a mortal, we can get a
10062 huge build-up of these during regex compilation before they get
10065 invlist_replace_list_destroys_src(*i, r);
10070 SvREFCNT_dec_NN(r);
10077 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10079 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10080 * set. A pointer to the inversion list is returned. This may actually be
10081 * a new list, in which case the passed in one has been destroyed. The
10082 * passed-in inversion list can be NULL, in which case a new one is created
10083 * with just the one range in it. The new list is not necessarily
10084 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10085 * result of this function. The gain would not be large, and in many
10086 * cases, this is called multiple times on a single inversion list, so
10087 * anything freed may almost immediately be needed again.
10089 * This used to mostly call the 'union' routine, but that is much more
10090 * heavyweight than really needed for a single range addition */
10092 UV* array; /* The array implementing the inversion list */
10093 UV len; /* How many elements in 'array' */
10094 SSize_t i_s; /* index into the invlist array where 'start'
10096 SSize_t i_e = 0; /* And the index where 'end' should go */
10097 UV cur_highest; /* The highest code point in the inversion list
10098 upon entry to this function */
10100 /* This range becomes the whole inversion list if none already existed */
10101 if (invlist == NULL) {
10102 invlist = _new_invlist(2);
10103 _append_range_to_invlist(invlist, start, end);
10107 /* Likewise, if the inversion list is currently empty */
10108 len = _invlist_len(invlist);
10110 _append_range_to_invlist(invlist, start, end);
10114 /* Starting here, we have to know the internals of the list */
10115 array = invlist_array(invlist);
10117 /* If the new range ends higher than the current highest ... */
10118 cur_highest = invlist_highest(invlist);
10119 if (end > cur_highest) {
10121 /* If the whole range is higher, we can just append it */
10122 if (start > cur_highest) {
10123 _append_range_to_invlist(invlist, start, end);
10127 /* Otherwise, add the portion that is higher ... */
10128 _append_range_to_invlist(invlist, cur_highest + 1, end);
10130 /* ... and continue on below to handle the rest. As a result of the
10131 * above append, we know that the index of the end of the range is the
10132 * final even numbered one of the array. Recall that the final element
10133 * always starts a range that extends to infinity. If that range is in
10134 * the set (meaning the set goes from here to infinity), it will be an
10135 * even index, but if it isn't in the set, it's odd, and the final
10136 * range in the set is one less, which is even. */
10137 if (end == UV_MAX) {
10145 /* We have dealt with appending, now see about prepending. If the new
10146 * range starts lower than the current lowest ... */
10147 if (start < array[0]) {
10149 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10150 * Let the union code handle it, rather than having to know the
10151 * trickiness in two code places. */
10152 if (UNLIKELY(start == 0)) {
10155 range_invlist = _new_invlist(2);
10156 _append_range_to_invlist(range_invlist, start, end);
10158 _invlist_union(invlist, range_invlist, &invlist);
10160 SvREFCNT_dec_NN(range_invlist);
10165 /* If the whole new range comes before the first entry, and doesn't
10166 * extend it, we have to insert it as an additional range */
10167 if (end < array[0] - 1) {
10169 goto splice_in_new_range;
10172 /* Here the new range adjoins the existing first range, extending it
10176 /* And continue on below to handle the rest. We know that the index of
10177 * the beginning of the range is the first one of the array */
10180 else { /* Not prepending any part of the new range to the existing list.
10181 * Find where in the list it should go. This finds i_s, such that:
10182 * invlist[i_s] <= start < array[i_s+1]
10184 i_s = _invlist_search(invlist, start);
10187 /* At this point, any extending before the beginning of the inversion list
10188 * and/or after the end has been done. This has made it so that, in the
10189 * code below, each endpoint of the new range is either in a range that is
10190 * in the set, or is in a gap between two ranges that are. This means we
10191 * don't have to worry about exceeding the array bounds.
10193 * Find where in the list the new range ends (but we can skip this if we
10194 * have already determined what it is, or if it will be the same as i_s,
10195 * which we already have computed) */
10197 i_e = (start == end)
10199 : _invlist_search(invlist, end);
10202 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10203 * is a range that goes to infinity there is no element at invlist[i_e+1],
10204 * so only the first relation holds. */
10206 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10208 /* Here, the ranges on either side of the beginning of the new range
10209 * are in the set, and this range starts in the gap between them.
10211 * The new range extends the range above it downwards if the new range
10212 * ends at or above that range's start */
10213 const bool extends_the_range_above = ( end == UV_MAX
10214 || end + 1 >= array[i_s+1]);
10216 /* The new range extends the range below it upwards if it begins just
10217 * after where that range ends */
10218 if (start == array[i_s]) {
10220 /* If the new range fills the entire gap between the other ranges,
10221 * they will get merged together. Other ranges may also get
10222 * merged, depending on how many of them the new range spans. In
10223 * the general case, we do the merge later, just once, after we
10224 * figure out how many to merge. But in the case where the new
10225 * range exactly spans just this one gap (possibly extending into
10226 * the one above), we do the merge here, and an early exit. This
10227 * is done here to avoid having to special case later. */
10228 if (i_e - i_s <= 1) {
10230 /* If i_e - i_s == 1, it means that the new range terminates
10231 * within the range above, and hence 'extends_the_range_above'
10232 * must be true. (If the range above it extends to infinity,
10233 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10234 * will be 0, so no harm done.) */
10235 if (extends_the_range_above) {
10236 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10237 invlist_set_len(invlist,
10239 *(get_invlist_offset_addr(invlist)));
10243 /* Here, i_e must == i_s. We keep them in sync, as they apply
10244 * to the same range, and below we are about to decrement i_s
10249 /* Here, the new range is adjacent to the one below. (It may also
10250 * span beyond the range above, but that will get resolved later.)
10251 * Extend the range below to include this one. */
10252 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10254 start = array[i_s];
10256 else if (extends_the_range_above) {
10258 /* Here the new range only extends the range above it, but not the
10259 * one below. It merges with the one above. Again, we keep i_e
10260 * and i_s in sync if they point to the same range */
10265 array[i_s] = start;
10269 /* Here, we've dealt with the new range start extending any adjoining
10272 * If the new range extends to infinity, it is now the final one,
10273 * regardless of what was there before */
10274 if (UNLIKELY(end == UV_MAX)) {
10275 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10279 /* If i_e started as == i_s, it has also been dealt with,
10280 * and been updated to the new i_s, which will fail the following if */
10281 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10283 /* Here, the ranges on either side of the end of the new range are in
10284 * the set, and this range ends in the gap between them.
10286 * If this range is adjacent to (hence extends) the range above it, it
10287 * becomes part of that range; likewise if it extends the range below,
10288 * it becomes part of that range */
10289 if (end + 1 == array[i_e+1]) {
10291 array[i_e] = start;
10293 else if (start <= array[i_e]) {
10294 array[i_e] = end + 1;
10301 /* If the range fits entirely in an existing range (as possibly already
10302 * extended above), it doesn't add anything new */
10303 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10307 /* Here, no part of the range is in the list. Must add it. It will
10308 * occupy 2 more slots */
10309 splice_in_new_range:
10311 invlist_extend(invlist, len + 2);
10312 array = invlist_array(invlist);
10313 /* Move the rest of the array down two slots. Don't include any
10315 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10317 /* Do the actual splice */
10318 array[i_e+1] = start;
10319 array[i_e+2] = end + 1;
10320 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10324 /* Here the new range crossed the boundaries of a pre-existing range. The
10325 * code above has adjusted things so that both ends are in ranges that are
10326 * in the set. This means everything in between must also be in the set.
10327 * Just squash things together */
10328 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10329 invlist_set_len(invlist,
10331 *(get_invlist_offset_addr(invlist)));
10337 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10338 UV** other_elements_ptr)
10340 /* Create and return an inversion list whose contents are to be populated
10341 * by the caller. The caller gives the number of elements (in 'size') and
10342 * the very first element ('element0'). This function will set
10343 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10344 * are to be placed.
10346 * Obviously there is some trust involved that the caller will properly
10347 * fill in the other elements of the array.
10349 * (The first element needs to be passed in, as the underlying code does
10350 * things differently depending on whether it is zero or non-zero) */
10352 SV* invlist = _new_invlist(size);
10355 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10357 invlist = add_cp_to_invlist(invlist, element0);
10358 offset = *get_invlist_offset_addr(invlist);
10360 invlist_set_len(invlist, size, offset);
10361 *other_elements_ptr = invlist_array(invlist) + 1;
10367 #ifndef PERL_IN_XSUB_RE
10369 Perl__invlist_invert(pTHX_ SV* const invlist)
10371 /* Complement the input inversion list. This adds a 0 if the list didn't
10372 * have a zero; removes it otherwise. As described above, the data
10373 * structure is set up so that this is very efficient */
10375 PERL_ARGS_ASSERT__INVLIST_INVERT;
10377 assert(! invlist_is_iterating(invlist));
10379 /* The inverse of matching nothing is matching everything */
10380 if (_invlist_len(invlist) == 0) {
10381 _append_range_to_invlist(invlist, 0, UV_MAX);
10385 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10389 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10391 /* Return a new inversion list that is a copy of the input one, which is
10392 * unchanged. The new list will not be mortal even if the old one was. */
10394 const STRLEN nominal_length = _invlist_len(invlist);
10395 const STRLEN physical_length = SvCUR(invlist);
10396 const bool offset = *(get_invlist_offset_addr(invlist));
10398 PERL_ARGS_ASSERT_INVLIST_CLONE;
10400 if (new_invlist == NULL) {
10401 new_invlist = _new_invlist(nominal_length);
10404 sv_upgrade(new_invlist, SVt_INVLIST);
10405 initialize_invlist_guts(new_invlist, nominal_length);
10408 *(get_invlist_offset_addr(new_invlist)) = offset;
10409 invlist_set_len(new_invlist, nominal_length, offset);
10410 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10412 return new_invlist;
10417 PERL_STATIC_INLINE UV
10418 S_invlist_lowest(SV* const invlist)
10420 /* Returns the lowest code point that matches an inversion list. This API
10421 * has an ambiguity, as it returns 0 under either the lowest is actually
10422 * 0, or if the list is empty. If this distinction matters to you, check
10423 * for emptiness before calling this function */
10425 UV len = _invlist_len(invlist);
10428 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10434 array = invlist_array(invlist);
10440 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10442 /* Get the contents of an inversion list into a string SV so that they can
10443 * be printed out. If 'traditional_style' is TRUE, it uses the format
10444 * traditionally done for debug tracing; otherwise it uses a format
10445 * suitable for just copying to the output, with blanks between ranges and
10446 * a dash between range components */
10450 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10451 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10453 if (traditional_style) {
10454 output = newSVpvs("\n");
10457 output = newSVpvs("");
10460 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10462 assert(! invlist_is_iterating(invlist));
10464 invlist_iterinit(invlist);
10465 while (invlist_iternext(invlist, &start, &end)) {
10466 if (end == UV_MAX) {
10467 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10468 start, intra_range_delimiter,
10469 inter_range_delimiter);
10471 else if (end != start) {
10472 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10474 intra_range_delimiter,
10475 end, inter_range_delimiter);
10478 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10479 start, inter_range_delimiter);
10483 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10484 SvCUR_set(output, SvCUR(output) - 1);
10490 #ifndef PERL_IN_XSUB_RE
10492 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10493 const char * const indent, SV* const invlist)
10495 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10496 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10497 * the string 'indent'. The output looks like this:
10498 [0] 0x000A .. 0x000D
10500 [4] 0x2028 .. 0x2029
10501 [6] 0x3104 .. INFTY
10502 * This means that the first range of code points matched by the list are
10503 * 0xA through 0xD; the second range contains only the single code point
10504 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10505 * are used to define each range (except if the final range extends to
10506 * infinity, only a single element is needed). The array index of the
10507 * first element for the corresponding range is given in brackets. */
10512 PERL_ARGS_ASSERT__INVLIST_DUMP;
10514 if (invlist_is_iterating(invlist)) {
10515 Perl_dump_indent(aTHX_ level, file,
10516 "%sCan't dump inversion list because is in middle of iterating\n",
10521 invlist_iterinit(invlist);
10522 while (invlist_iternext(invlist, &start, &end)) {
10523 if (end == UV_MAX) {
10524 Perl_dump_indent(aTHX_ level, file,
10525 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10526 indent, (UV)count, start);
10528 else if (end != start) {
10529 Perl_dump_indent(aTHX_ level, file,
10530 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10531 indent, (UV)count, start, end);
10534 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10535 indent, (UV)count, start);
10543 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10545 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10547 /* Return a boolean as to if the two passed in inversion lists are
10548 * identical. The final argument, if TRUE, says to take the complement of
10549 * the second inversion list before doing the comparison */
10551 const UV len_a = _invlist_len(a);
10552 UV len_b = _invlist_len(b);
10554 const UV* array_a = NULL;
10555 const UV* array_b = NULL;
10557 PERL_ARGS_ASSERT__INVLISTEQ;
10559 /* This code avoids accessing the arrays unless it knows the length is
10564 return ! complement_b;
10568 array_a = invlist_array(a);
10572 array_b = invlist_array(b);
10575 /* If are to compare 'a' with the complement of b, set it
10576 * up so are looking at b's complement. */
10577 if (complement_b) {
10579 /* The complement of nothing is everything, so <a> would have to have
10580 * just one element, starting at zero (ending at infinity) */
10582 return (len_a == 1 && array_a[0] == 0);
10584 if (array_b[0] == 0) {
10586 /* Otherwise, to complement, we invert. Here, the first element is
10587 * 0, just remove it. To do this, we just pretend the array starts
10595 /* But if the first element is not zero, we pretend the list starts
10596 * at the 0 that is always stored immediately before the array. */
10602 return len_a == len_b
10603 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10609 * As best we can, determine the characters that can match the start of
10610 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10611 * can be false positive matches
10613 * Returns the invlist as a new SV*; it is the caller's responsibility to
10614 * call SvREFCNT_dec() when done with it.
10617 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10620 const U8 * s = (U8*)STRING(node);
10621 SSize_t bytelen = STR_LEN(node);
10623 /* Start out big enough for 2 separate code points */
10624 SV* invlist = _new_invlist(4);
10626 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10631 /* We punt and assume can match anything if the node begins
10632 * with a multi-character fold. Things are complicated. For
10633 * example, /ffi/i could match any of:
10634 * "\N{LATIN SMALL LIGATURE FFI}"
10635 * "\N{LATIN SMALL LIGATURE FF}I"
10636 * "F\N{LATIN SMALL LIGATURE FI}"
10637 * plus several other things; and making sure we have all the
10638 * possibilities is hard. */
10639 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10640 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10643 /* Any Latin1 range character can potentially match any
10644 * other depending on the locale, and in Turkic locales, U+130 and
10646 if (OP(node) == EXACTFL) {
10647 _invlist_union(invlist, PL_Latin1, &invlist);
10648 invlist = add_cp_to_invlist(invlist,
10649 LATIN_SMALL_LETTER_DOTLESS_I);
10650 invlist = add_cp_to_invlist(invlist,
10651 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10654 /* But otherwise, it matches at least itself. We can
10655 * quickly tell if it has a distinct fold, and if so,
10656 * it matches that as well */
10657 invlist = add_cp_to_invlist(invlist, uc);
10658 if (IS_IN_SOME_FOLD_L1(uc))
10659 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10662 /* Some characters match above-Latin1 ones under /i. This
10663 * is true of EXACTFL ones when the locale is UTF-8 */
10664 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10665 && (! isASCII(uc) || (OP(node) != EXACTFAA
10666 && OP(node) != EXACTFAA_NO_TRIE)))
10668 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10672 else { /* Pattern is UTF-8 */
10673 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10674 const U8* e = s + bytelen;
10677 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10679 /* The only code points that aren't folded in a UTF EXACTFish
10680 * node are are the problematic ones in EXACTFL nodes */
10681 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10682 /* We need to check for the possibility that this EXACTFL
10683 * node begins with a multi-char fold. Therefore we fold
10684 * the first few characters of it so that we can make that
10690 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10692 *(d++) = (U8) toFOLD(*s);
10693 if (fc < 0) { /* Save the first fold */
10700 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10701 if (fc < 0) { /* Save the first fold */
10709 /* And set up so the code below that looks in this folded
10710 * buffer instead of the node's string */
10715 /* When we reach here 's' points to the fold of the first
10716 * character(s) of the node; and 'e' points to far enough along
10717 * the folded string to be just past any possible multi-char
10720 * Unlike the non-UTF-8 case, the macro for determining if a
10721 * string is a multi-char fold requires all the characters to
10722 * already be folded. This is because of all the complications
10723 * if not. Note that they are folded anyway, except in EXACTFL
10724 * nodes. Like the non-UTF case above, we punt if the node
10725 * begins with a multi-char fold */
10727 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10728 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10730 else { /* Single char fold */
10733 const U32 * remaining_folds;
10734 Size_t folds_count;
10736 /* It matches itself */
10737 invlist = add_cp_to_invlist(invlist, fc);
10739 /* ... plus all the things that fold to it, which are found in
10740 * PL_utf8_foldclosures */
10741 folds_count = _inverse_folds(fc, &first_fold,
10743 for (k = 0; k < folds_count; k++) {
10744 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10746 /* /aa doesn't allow folds between ASCII and non- */
10747 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10748 && isASCII(c) != isASCII(fc))
10753 invlist = add_cp_to_invlist(invlist, c);
10756 if (OP(node) == EXACTFL) {
10758 /* If either [iI] are present in an EXACTFL node the above code
10759 * should have added its normal case pair, but under a Turkish
10760 * locale they could match instead the case pairs from it. Add
10761 * those as potential matches as well */
10762 if (isALPHA_FOLD_EQ(fc, 'I')) {
10763 invlist = add_cp_to_invlist(invlist,
10764 LATIN_SMALL_LETTER_DOTLESS_I);
10765 invlist = add_cp_to_invlist(invlist,
10766 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10768 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10769 invlist = add_cp_to_invlist(invlist, 'I');
10771 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10772 invlist = add_cp_to_invlist(invlist, 'i');
10781 #undef HEADER_LENGTH
10782 #undef TO_INTERNAL_SIZE
10783 #undef FROM_INTERNAL_SIZE
10784 #undef INVLIST_VERSION_ID
10786 /* End of inversion list object */
10789 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10791 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10792 * constructs, and updates RExC_flags with them. On input, RExC_parse
10793 * should point to the first flag; it is updated on output to point to the
10794 * final ')' or ':'. There needs to be at least one flag, or this will
10797 /* for (?g), (?gc), and (?o) warnings; warning
10798 about (?c) will warn about (?g) -- japhy */
10800 #define WASTED_O 0x01
10801 #define WASTED_G 0x02
10802 #define WASTED_C 0x04
10803 #define WASTED_GC (WASTED_G|WASTED_C)
10804 I32 wastedflags = 0x00;
10805 U32 posflags = 0, negflags = 0;
10806 U32 *flagsp = &posflags;
10807 char has_charset_modifier = '\0';
10809 bool has_use_defaults = FALSE;
10810 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10811 int x_mod_count = 0;
10813 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10815 /* '^' as an initial flag sets certain defaults */
10816 if (UCHARAT(RExC_parse) == '^') {
10818 has_use_defaults = TRUE;
10819 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10820 cs = (RExC_uni_semantics)
10821 ? REGEX_UNICODE_CHARSET
10822 : REGEX_DEPENDS_CHARSET;
10823 set_regex_charset(&RExC_flags, cs);
10826 cs = get_regex_charset(RExC_flags);
10827 if ( cs == REGEX_DEPENDS_CHARSET
10828 && RExC_uni_semantics)
10830 cs = REGEX_UNICODE_CHARSET;
10834 while (RExC_parse < RExC_end) {
10835 /* && memCHRs("iogcmsx", *RExC_parse) */
10836 /* (?g), (?gc) and (?o) are useless here
10837 and must be globally applied -- japhy */
10838 if ((RExC_pm_flags & PMf_WILDCARD)) {
10839 if (flagsp == & negflags) {
10840 if (*RExC_parse == 'm') {
10842 /* diag_listed_as: Use of %s is not allowed in Unicode
10843 property wildcard subpatterns in regex; marked by <--
10845 vFAIL("Use of modifier '-m' is not allowed in Unicode"
10846 " property wildcard subpatterns");
10850 if (*RExC_parse == 's') {
10851 goto modifier_illegal_in_wildcard;
10856 switch (*RExC_parse) {
10858 /* Code for the imsxn flags */
10859 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10861 case LOCALE_PAT_MOD:
10862 if (has_charset_modifier) {
10863 goto excess_modifier;
10865 else if (flagsp == &negflags) {
10868 cs = REGEX_LOCALE_CHARSET;
10869 has_charset_modifier = LOCALE_PAT_MOD;
10871 case UNICODE_PAT_MOD:
10872 if (has_charset_modifier) {
10873 goto excess_modifier;
10875 else if (flagsp == &negflags) {
10878 cs = REGEX_UNICODE_CHARSET;
10879 has_charset_modifier = UNICODE_PAT_MOD;
10881 case ASCII_RESTRICT_PAT_MOD:
10882 if (flagsp == &negflags) {
10885 if (has_charset_modifier) {
10886 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10887 goto excess_modifier;
10889 /* Doubled modifier implies more restricted */
10890 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10893 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10895 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10897 case DEPENDS_PAT_MOD:
10898 if (has_use_defaults) {
10899 goto fail_modifiers;
10901 else if (flagsp == &negflags) {
10904 else if (has_charset_modifier) {
10905 goto excess_modifier;
10908 /* The dual charset means unicode semantics if the
10909 * pattern (or target, not known until runtime) are
10910 * utf8, or something in the pattern indicates unicode
10912 cs = (RExC_uni_semantics)
10913 ? REGEX_UNICODE_CHARSET
10914 : REGEX_DEPENDS_CHARSET;
10915 has_charset_modifier = DEPENDS_PAT_MOD;
10919 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10920 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10922 else if (has_charset_modifier == *(RExC_parse - 1)) {
10923 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10924 *(RExC_parse - 1));
10927 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10929 NOT_REACHED; /*NOTREACHED*/
10932 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10933 *(RExC_parse - 1));
10934 NOT_REACHED; /*NOTREACHED*/
10935 case GLOBAL_PAT_MOD: /* 'g' */
10936 if (RExC_pm_flags & PMf_WILDCARD) {
10937 goto modifier_illegal_in_wildcard;
10940 case ONCE_PAT_MOD: /* 'o' */
10941 if (ckWARN(WARN_REGEXP)) {
10942 const I32 wflagbit = *RExC_parse == 'o'
10945 if (! (wastedflags & wflagbit) ) {
10946 wastedflags |= wflagbit;
10947 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10950 "Useless (%s%c) - %suse /%c modifier",
10951 flagsp == &negflags ? "?-" : "?",
10953 flagsp == &negflags ? "don't " : "",
10960 case CONTINUE_PAT_MOD: /* 'c' */
10961 if (RExC_pm_flags & PMf_WILDCARD) {
10962 goto modifier_illegal_in_wildcard;
10964 if (ckWARN(WARN_REGEXP)) {
10965 if (! (wastedflags & WASTED_C) ) {
10966 wastedflags |= WASTED_GC;
10967 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10970 "Useless (%sc) - %suse /gc modifier",
10971 flagsp == &negflags ? "?-" : "?",
10972 flagsp == &negflags ? "don't " : ""
10977 case KEEPCOPY_PAT_MOD: /* 'p' */
10978 if (RExC_pm_flags & PMf_WILDCARD) {
10979 goto modifier_illegal_in_wildcard;
10981 if (flagsp == &negflags) {
10982 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10984 *flagsp |= RXf_PMf_KEEPCOPY;
10988 /* A flag is a default iff it is following a minus, so
10989 * if there is a minus, it means will be trying to
10990 * re-specify a default which is an error */
10991 if (has_use_defaults || flagsp == &negflags) {
10992 goto fail_modifiers;
10994 flagsp = &negflags;
10995 wastedflags = 0; /* reset so (?g-c) warns twice */
11001 if ( (RExC_pm_flags & PMf_WILDCARD)
11002 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11005 /* diag_listed_as: Use of %s is not allowed in Unicode
11006 property wildcard subpatterns in regex; marked by <--
11008 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11009 " property wildcard subpatterns",
11010 has_charset_modifier);
11013 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11014 negflags |= RXf_PMf_EXTENDED_MORE;
11016 RExC_flags |= posflags;
11018 if (negflags & RXf_PMf_EXTENDED) {
11019 negflags |= RXf_PMf_EXTENDED_MORE;
11021 RExC_flags &= ~negflags;
11022 set_regex_charset(&RExC_flags, cs);
11027 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11028 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11029 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11030 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11031 NOT_REACHED; /*NOTREACHED*/
11034 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11037 vFAIL("Sequence (?... not terminated");
11039 modifier_illegal_in_wildcard:
11041 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11042 subpatterns in regex; marked by <-- HERE in m/%s/ */
11043 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11044 " subpatterns", *(RExC_parse - 1));
11048 - reg - regular expression, i.e. main body or parenthesized thing
11050 * Caller must absorb opening parenthesis.
11052 * Combining parenthesis handling with the base level of regular expression
11053 * is a trifle forced, but the need to tie the tails of the branches to what
11054 * follows makes it hard to avoid.
11056 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11058 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11060 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11063 STATIC regnode_offset
11064 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11066 char * parse_start,
11070 regnode_offset ret;
11071 char* name_start = RExC_parse;
11073 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11074 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11076 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11078 if (RExC_parse == name_start || *RExC_parse != ch) {
11079 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11080 vFAIL2("Sequence %.3s... not terminated", parse_start);
11084 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11085 RExC_rxi->data->data[num]=(void*)sv_dat;
11086 SvREFCNT_inc_simple_void_NN(sv_dat);
11089 ret = reganode(pRExC_state,
11092 : (ASCII_FOLD_RESTRICTED)
11094 : (AT_LEAST_UNI_SEMANTICS)
11100 *flagp |= HASWIDTH;
11102 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11103 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11105 nextchar(pRExC_state);
11109 /* On success, returns the offset at which any next node should be placed into
11110 * the regex engine program being compiled.
11112 * Returns 0 otherwise, with *flagp set to indicate why:
11113 * TRYAGAIN at the end of (?) that only sets flags.
11114 * RESTART_PARSE if the parse needs to be restarted, or'd with
11115 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11116 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11118 STATIC regnode_offset
11119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11120 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11121 * 2 is like 1, but indicates that nextchar() has been called to advance
11122 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11123 * this flag alerts us to the need to check for that */
11125 regnode_offset ret = 0; /* Will be the head of the group. */
11127 regnode_offset lastbr;
11128 regnode_offset ender = 0;
11131 U32 oregflags = RExC_flags;
11132 bool have_branch = 0;
11134 I32 freeze_paren = 0;
11135 I32 after_freeze = 0;
11136 I32 num; /* numeric backreferences */
11137 SV * max_open; /* Max number of unclosed parens */
11139 char * parse_start = RExC_parse; /* MJD */
11140 char * const oregcomp_parse = RExC_parse;
11142 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11144 PERL_ARGS_ASSERT_REG;
11145 DEBUG_PARSE("reg ");
11147 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11149 if (!SvIOK(max_open)) {
11150 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11152 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11154 vFAIL("Too many nested open parens");
11157 *flagp = 0; /* Tentatively. */
11159 if (RExC_in_lookbehind) {
11160 RExC_in_lookbehind++;
11162 if (RExC_in_lookahead) {
11163 RExC_in_lookahead++;
11166 /* Having this true makes it feasible to have a lot fewer tests for the
11167 * parse pointer being in scope. For example, we can write
11168 * while(isFOO(*RExC_parse)) RExC_parse++;
11170 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11172 assert(*RExC_end == '\0');
11174 /* Make an OPEN node, if parenthesized. */
11177 /* Under /x, space and comments can be gobbled up between the '(' and
11178 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11179 * intervening space, as the sequence is a token, and a token should be
11181 bool has_intervening_patws = (paren == 2)
11182 && *(RExC_parse - 1) != '(';
11184 if (RExC_parse >= RExC_end) {
11185 vFAIL("Unmatched (");
11188 if (paren == 'r') { /* Atomic script run */
11192 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11193 char *start_verb = RExC_parse + 1;
11195 char *start_arg = NULL;
11196 unsigned char op = 0;
11197 int arg_required = 0;
11198 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11199 bool has_upper = FALSE;
11201 if (has_intervening_patws) {
11202 RExC_parse++; /* past the '*' */
11204 /* For strict backwards compatibility, don't change the message
11205 * now that we also have lowercase operands */
11206 if (isUPPER(*RExC_parse)) {
11207 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11210 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11213 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11214 if ( *RExC_parse == ':' ) {
11215 start_arg = RExC_parse + 1;
11219 if (isUPPER(*RExC_parse)) {
11225 RExC_parse += UTF8SKIP(RExC_parse);
11228 verb_len = RExC_parse - start_verb;
11230 if (RExC_parse >= RExC_end) {
11231 goto unterminated_verb_pattern;
11234 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11235 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11236 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11238 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11239 unterminated_verb_pattern:
11241 vFAIL("Unterminated verb pattern argument");
11244 vFAIL("Unterminated '(*...' argument");
11248 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11250 vFAIL("Unterminated verb pattern");
11253 vFAIL("Unterminated '(*...' construct");
11258 /* Here, we know that RExC_parse < RExC_end */
11260 switch ( *start_verb ) {
11261 case 'A': /* (*ACCEPT) */
11262 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11264 internal_argval = RExC_nestroot;
11267 case 'C': /* (*COMMIT) */
11268 if ( memEQs(start_verb, verb_len,"COMMIT") )
11271 case 'F': /* (*FAIL) */
11272 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11276 case ':': /* (*:NAME) */
11277 case 'M': /* (*MARK:NAME) */
11278 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11283 case 'P': /* (*PRUNE) */
11284 if ( memEQs(start_verb, verb_len,"PRUNE") )
11287 case 'S': /* (*SKIP) */
11288 if ( memEQs(start_verb, verb_len,"SKIP") )
11291 case 'T': /* (*THEN) */
11292 /* [19:06] <TimToady> :: is then */
11293 if ( memEQs(start_verb, verb_len,"THEN") ) {
11295 RExC_seen |= REG_CUTGROUP_SEEN;
11299 if ( memEQs(start_verb, verb_len, "asr")
11300 || memEQs(start_verb, verb_len, "atomic_script_run"))
11302 paren = 'r'; /* Mnemonic: recursed run */
11305 else if (memEQs(start_verb, verb_len, "atomic")) {
11306 paren = 't'; /* AtOMIC */
11307 goto alpha_assertions;
11311 if ( memEQs(start_verb, verb_len, "plb")
11312 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11315 goto lookbehind_alpha_assertions;
11317 else if ( memEQs(start_verb, verb_len, "pla")
11318 || memEQs(start_verb, verb_len, "positive_lookahead"))
11321 goto alpha_assertions;
11325 if ( memEQs(start_verb, verb_len, "nlb")
11326 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11329 goto lookbehind_alpha_assertions;
11331 else if ( memEQs(start_verb, verb_len, "nla")
11332 || memEQs(start_verb, verb_len, "negative_lookahead"))
11335 goto alpha_assertions;
11339 if ( memEQs(start_verb, verb_len, "sr")
11340 || memEQs(start_verb, verb_len, "script_run"))
11342 regnode_offset atomic;
11348 /* This indicates Unicode rules. */
11349 REQUIRE_UNI_RULES(flagp, 0);
11355 RExC_parse = start_arg;
11357 if (RExC_in_script_run) {
11359 /* Nested script runs are treated as no-ops, because
11360 * if the nested one fails, the outer one must as
11361 * well. It could fail sooner, and avoid (??{} with
11362 * side effects, but that is explicitly documented as
11363 * undefined behavior. */
11367 if (paren == 's') {
11372 /* But, the atomic part of a nested atomic script run
11373 * isn't a no-op, but can be treated just like a '(?>'
11379 if (paren == 's') {
11380 /* Here, we're starting a new regular script run */
11381 ret = reg_node(pRExC_state, SROPEN);
11382 RExC_in_script_run = 1;
11387 /* Here, we are starting an atomic script run. This is
11388 * handled by recursing to deal with the atomic portion
11389 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11391 ret = reg_node(pRExC_state, SROPEN);
11393 RExC_in_script_run = 1;
11395 atomic = reg(pRExC_state, 'r', &flags, depth);
11396 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11397 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11401 if (! REGTAIL(pRExC_state, ret, atomic)) {
11402 REQUIRE_BRANCHJ(flagp, 0);
11405 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11408 REQUIRE_BRANCHJ(flagp, 0);
11411 RExC_in_script_run = 0;
11417 lookbehind_alpha_assertions:
11418 RExC_seen |= REG_LOOKBEHIND_SEEN;
11419 RExC_in_lookbehind++;
11424 RExC_seen_zerolen++;
11430 /* An empty negative lookahead assertion simply is failure */
11431 if (paren == 'A' && RExC_parse == start_arg) {
11432 ret=reganode(pRExC_state, OPFAIL, 0);
11433 nextchar(pRExC_state);
11437 RExC_parse = start_arg;
11442 "'(*%" UTF8f "' requires a terminating ':'",
11443 UTF8fARG(UTF, verb_len, start_verb));
11444 NOT_REACHED; /*NOTREACHED*/
11446 } /* End of switch */
11449 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11451 if (has_upper || verb_len == 0) {
11453 "Unknown verb pattern '%" UTF8f "'",
11454 UTF8fARG(UTF, verb_len, start_verb));
11458 "Unknown '(*...)' construct '%" UTF8f "'",
11459 UTF8fARG(UTF, verb_len, start_verb));
11462 if ( RExC_parse == start_arg ) {
11465 if ( arg_required && !start_arg ) {
11466 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11467 (int) verb_len, start_verb);
11469 if (internal_argval == -1) {
11470 ret = reganode(pRExC_state, op, 0);
11472 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11474 RExC_seen |= REG_VERBARG_SEEN;
11476 SV *sv = newSVpvn( start_arg,
11477 RExC_parse - start_arg);
11478 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11479 STR_WITH_LEN("S"));
11480 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11481 FLAGS(REGNODE_p(ret)) = 1;
11483 FLAGS(REGNODE_p(ret)) = 0;
11485 if ( internal_argval != -1 )
11486 ARG2L_SET(REGNODE_p(ret), internal_argval);
11487 nextchar(pRExC_state);
11490 else if (*RExC_parse == '?') { /* (?...) */
11491 bool is_logical = 0;
11492 const char * const seqstart = RExC_parse;
11493 const char * endptr;
11494 const char non_existent_group_msg[]
11495 = "Reference to nonexistent group";
11496 const char impossible_group[] = "Invalid reference to group";
11498 if (has_intervening_patws) {
11500 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11503 RExC_parse++; /* past the '?' */
11504 paren = *RExC_parse; /* might be a trailing NUL, if not
11506 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11507 if (RExC_parse > RExC_end) {
11510 ret = 0; /* For look-ahead/behind. */
11513 case 'P': /* (?P...) variants for those used to PCRE/Python */
11514 paren = *RExC_parse;
11515 if ( paren == '<') { /* (?P<...>) named capture */
11517 if (RExC_parse >= RExC_end) {
11518 vFAIL("Sequence (?P<... not terminated");
11520 goto named_capture;
11522 else if (paren == '>') { /* (?P>name) named recursion */
11524 if (RExC_parse >= RExC_end) {
11525 vFAIL("Sequence (?P>... not terminated");
11527 goto named_recursion;
11529 else if (paren == '=') { /* (?P=...) named backref */
11531 return handle_named_backref(pRExC_state, flagp,
11534 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11535 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11536 vFAIL3("Sequence (%.*s...) not recognized",
11537 (int) (RExC_parse - seqstart), seqstart);
11538 NOT_REACHED; /*NOTREACHED*/
11539 case '<': /* (?<...) */
11540 /* If you want to support (?<*...), first reconcile with GH #17363 */
11541 if (*RExC_parse == '!')
11543 else if (*RExC_parse != '=')
11550 case '\'': /* (?'...') */
11551 name_start = RExC_parse;
11552 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11553 if ( RExC_parse == name_start
11554 || RExC_parse >= RExC_end
11555 || *RExC_parse != paren)
11557 vFAIL2("Sequence (?%c... not terminated",
11558 paren=='>' ? '<' : (char) paren);
11563 if (!svname) /* shouldn't happen */
11565 "panic: reg_scan_name returned NULL");
11566 if (!RExC_paren_names) {
11567 RExC_paren_names= newHV();
11568 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11570 RExC_paren_name_list= newAV();
11571 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11574 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11576 sv_dat = HeVAL(he_str);
11578 /* croak baby croak */
11580 "panic: paren_name hash element allocation failed");
11581 } else if ( SvPOK(sv_dat) ) {
11582 /* (?|...) can mean we have dupes so scan to check
11583 its already been stored. Maybe a flag indicating
11584 we are inside such a construct would be useful,
11585 but the arrays are likely to be quite small, so
11586 for now we punt -- dmq */
11587 IV count = SvIV(sv_dat);
11588 I32 *pv = (I32*)SvPVX(sv_dat);
11590 for ( i = 0 ; i < count ; i++ ) {
11591 if ( pv[i] == RExC_npar ) {
11597 pv = (I32*)SvGROW(sv_dat,
11598 SvCUR(sv_dat) + sizeof(I32)+1);
11599 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11600 pv[count] = RExC_npar;
11601 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11604 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11605 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11608 SvIV_set(sv_dat, 1);
11611 /* Yes this does cause a memory leak in debugging Perls
11613 if (!av_store(RExC_paren_name_list,
11614 RExC_npar, SvREFCNT_inc_NN(svname)))
11615 SvREFCNT_dec_NN(svname);
11618 /*sv_dump(sv_dat);*/
11620 nextchar(pRExC_state);
11622 goto capturing_parens;
11625 RExC_seen |= REG_LOOKBEHIND_SEEN;
11626 RExC_in_lookbehind++;
11628 if (RExC_parse >= RExC_end) {
11629 vFAIL("Sequence (?... not terminated");
11631 RExC_seen_zerolen++;
11633 case '=': /* (?=...) */
11634 RExC_seen_zerolen++;
11635 RExC_in_lookahead++;
11637 case '!': /* (?!...) */
11638 RExC_seen_zerolen++;
11639 /* check if we're really just a "FAIL" assertion */
11640 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11641 FALSE /* Don't force to /x */ );
11642 if (*RExC_parse == ')') {
11643 ret=reganode(pRExC_state, OPFAIL, 0);
11644 nextchar(pRExC_state);
11648 case '|': /* (?|...) */
11649 /* branch reset, behave like a (?:...) except that
11650 buffers in alternations share the same numbers */
11652 after_freeze = freeze_paren = RExC_npar;
11654 /* XXX This construct currently requires an extra pass.
11655 * Investigation would be required to see if that could be
11657 REQUIRE_PARENS_PASS;
11659 case ':': /* (?:...) */
11660 case '>': /* (?>...) */
11662 case '$': /* (?$...) */
11663 case '@': /* (?@...) */
11664 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11666 case '0' : /* (?0) */
11667 case 'R' : /* (?R) */
11668 if (RExC_parse == RExC_end || *RExC_parse != ')')
11669 FAIL("Sequence (?R) not terminated");
11671 RExC_seen |= REG_RECURSE_SEEN;
11673 /* XXX These constructs currently require an extra pass.
11674 * It probably could be changed */
11675 REQUIRE_PARENS_PASS;
11677 *flagp |= POSTPONED;
11678 goto gen_recurse_regop;
11680 /* named and numeric backreferences */
11681 case '&': /* (?&NAME) */
11682 parse_start = RExC_parse - 1;
11685 SV *sv_dat = reg_scan_name(pRExC_state,
11686 REG_RSN_RETURN_DATA);
11687 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11689 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11690 vFAIL("Sequence (?&... not terminated");
11691 goto gen_recurse_regop;
11694 if (! inRANGE(RExC_parse[0], '1', '9')) {
11696 vFAIL("Illegal pattern");
11698 goto parse_recursion;
11700 case '-': /* (?-1) */
11701 if (! inRANGE(RExC_parse[0], '1', '9')) {
11702 RExC_parse--; /* rewind to let it be handled later */
11706 case '1': case '2': case '3': case '4': /* (?1) */
11707 case '5': case '6': case '7': case '8': case '9':
11708 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11711 bool is_neg = FALSE;
11713 parse_start = RExC_parse - 1; /* MJD */
11714 if (*RExC_parse == '-') {
11719 if (grok_atoUV(RExC_parse, &unum, &endptr)
11723 RExC_parse = (char*)endptr;
11725 else { /* Overflow, or something like that. Position
11726 beyond all digits for the message */
11727 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
11730 vFAIL(impossible_group);
11733 /* -num is always representable on 1 and 2's complement
11738 if (*RExC_parse!=')')
11739 vFAIL("Expecting close bracket");
11742 if (paren == '-' || paren == '+') {
11744 /* Don't overflow */
11745 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11747 vFAIL(impossible_group);
11751 Diagram of capture buffer numbering.
11752 Top line is the normal capture buffer numbers
11753 Bottom line is the negative indexing as from
11757 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11758 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11759 - 5 4 3 2 1 X Y x x
11761 Resolve to absolute group. Recall that RExC_npar is +1 of
11762 the actual parenthesis group number. For lookahead, we
11763 have to compensate for that. Using the above example, when
11764 we get to Y in the parse, num is 2 and RExC_npar is 6. We
11765 want 7 for +2, and 4 for -2.
11767 if ( paren == '+' ) {
11773 if (paren == '-' && num < 1) {
11775 vFAIL(non_existent_group_msg);
11779 if (num >= RExC_npar) {
11781 /* It might be a forward reference; we can't fail until we
11782 * know, by completing the parse to get all the groups, and
11783 * then reparsing */
11784 if (ALL_PARENS_COUNTED) {
11785 if (num >= RExC_total_parens) {
11787 vFAIL(non_existent_group_msg);
11791 REQUIRE_PARENS_PASS;
11795 /* We keep track how many GOSUB items we have produced.
11796 To start off the ARG2L() of the GOSUB holds its "id",
11797 which is used later in conjunction with RExC_recurse
11798 to calculate the offset we need to jump for the GOSUB,
11799 which it will store in the final representation.
11800 We have to defer the actual calculation until much later
11801 as the regop may move.
11803 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11804 RExC_recurse_count++;
11805 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11806 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11807 22, "| |", (int)(depth * 2 + 1), "",
11808 (UV)ARG(REGNODE_p(ret)),
11809 (IV)ARG2L(REGNODE_p(ret))));
11810 RExC_seen |= REG_RECURSE_SEEN;
11812 Set_Node_Length(REGNODE_p(ret),
11813 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11814 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11816 *flagp |= POSTPONED;
11817 assert(*RExC_parse == ')');
11818 nextchar(pRExC_state);
11823 case '?': /* (??...) */
11825 if (*RExC_parse != '{') {
11826 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11827 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11829 "Sequence (%" UTF8f "...) not recognized",
11830 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11831 NOT_REACHED; /*NOTREACHED*/
11833 *flagp |= POSTPONED;
11837 case '{': /* (?{...}) */
11840 struct reg_code_block *cb;
11843 RExC_seen_zerolen++;
11845 if ( !pRExC_state->code_blocks
11846 || pRExC_state->code_index
11847 >= pRExC_state->code_blocks->count
11848 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11849 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11852 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11853 FAIL("panic: Sequence (?{...}): no code block found\n");
11854 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11856 /* this is a pre-compiled code block (?{...}) */
11857 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11858 RExC_parse = RExC_start + cb->end;
11860 if (cb->src_regex) {
11861 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11862 RExC_rxi->data->data[n] =
11863 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11864 RExC_rxi->data->data[n+1] = (void*)o;
11867 n = add_data(pRExC_state,
11868 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11869 RExC_rxi->data->data[n] = (void*)o;
11871 pRExC_state->code_index++;
11872 nextchar(pRExC_state);
11875 regnode_offset eval;
11876 ret = reg_node(pRExC_state, LOGICAL);
11878 eval = reg2Lanode(pRExC_state, EVAL,
11881 /* for later propagation into (??{})
11883 RExC_flags & RXf_PMf_COMPILETIME
11885 FLAGS(REGNODE_p(ret)) = 2;
11886 if (! REGTAIL(pRExC_state, ret, eval)) {
11887 REQUIRE_BRANCHJ(flagp, 0);
11889 /* deal with the length of this later - MJD */
11892 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11893 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11894 Set_Node_Offset(REGNODE_p(ret), parse_start);
11897 case '(': /* (?(?{...})...) and (?(?=...)...) */
11900 const int DEFINE_len = sizeof("DEFINE") - 1;
11901 if ( RExC_parse < RExC_end - 1
11902 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11903 && ( RExC_parse[1] == '='
11904 || RExC_parse[1] == '!'
11905 || RExC_parse[1] == '<'
11906 || RExC_parse[1] == '{'))
11907 || ( RExC_parse[0] == '*' /* (?(*...)) */
11908 && ( memBEGINs(RExC_parse + 1,
11909 (Size_t) (RExC_end - (RExC_parse + 1)),
11911 || memBEGINs(RExC_parse + 1,
11912 (Size_t) (RExC_end - (RExC_parse + 1)),
11914 || memBEGINs(RExC_parse + 1,
11915 (Size_t) (RExC_end - (RExC_parse + 1)),
11917 || memBEGINs(RExC_parse + 1,
11918 (Size_t) (RExC_end - (RExC_parse + 1)),
11920 || memBEGINs(RExC_parse + 1,
11921 (Size_t) (RExC_end - (RExC_parse + 1)),
11922 "positive_lookahead:")
11923 || memBEGINs(RExC_parse + 1,
11924 (Size_t) (RExC_end - (RExC_parse + 1)),
11925 "positive_lookbehind:")
11926 || memBEGINs(RExC_parse + 1,
11927 (Size_t) (RExC_end - (RExC_parse + 1)),
11928 "negative_lookahead:")
11929 || memBEGINs(RExC_parse + 1,
11930 (Size_t) (RExC_end - (RExC_parse + 1)),
11931 "negative_lookbehind:"))))
11932 ) { /* Lookahead or eval. */
11934 regnode_offset tail;
11936 ret = reg_node(pRExC_state, LOGICAL);
11937 FLAGS(REGNODE_p(ret)) = 1;
11939 tail = reg(pRExC_state, 1, &flag, depth+1);
11940 RETURN_FAIL_ON_RESTART(flag, flagp);
11941 if (! REGTAIL(pRExC_state, ret, tail)) {
11942 REQUIRE_BRANCHJ(flagp, 0);
11946 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11947 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11949 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11950 char *name_start= RExC_parse++;
11952 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11953 if ( RExC_parse == name_start
11954 || RExC_parse >= RExC_end
11955 || *RExC_parse != ch)
11957 vFAIL2("Sequence (?(%c... not terminated",
11958 (ch == '>' ? '<' : ch));
11962 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11963 RExC_rxi->data->data[num]=(void*)sv_dat;
11964 SvREFCNT_inc_simple_void_NN(sv_dat);
11966 ret = reganode(pRExC_state, GROUPPN, num);
11967 goto insert_if_check_paren;
11969 else if (memBEGINs(RExC_parse,
11970 (STRLEN) (RExC_end - RExC_parse),
11973 ret = reganode(pRExC_state, DEFINEP, 0);
11974 RExC_parse += DEFINE_len;
11976 goto insert_if_check_paren;
11978 else if (RExC_parse[0] == 'R') {
11980 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11981 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11982 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11985 if (RExC_parse[0] == '0') {
11989 else if (inRANGE(RExC_parse[0], '1', '9')) {
11992 if (grok_atoUV(RExC_parse, &uv, &endptr)
11995 parno = (I32)uv + 1;
11996 RExC_parse = (char*)endptr;
11998 /* else "Switch condition not recognized" below */
11999 } else if (RExC_parse[0] == '&') {
12002 sv_dat = reg_scan_name(pRExC_state,
12003 REG_RSN_RETURN_DATA);
12005 parno = 1 + *((I32 *)SvPVX(sv_dat));
12007 ret = reganode(pRExC_state, INSUBP, parno);
12008 goto insert_if_check_paren;
12010 else if (inRANGE(RExC_parse[0], '1', '9')) {
12015 if (grok_atoUV(RExC_parse, &uv, &endptr)
12019 RExC_parse = (char*)endptr;
12022 vFAIL("panic: grok_atoUV returned FALSE");
12024 ret = reganode(pRExC_state, GROUPP, parno);
12026 insert_if_check_paren:
12027 if (UCHARAT(RExC_parse) != ')') {
12029 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12031 vFAIL("Switch condition not recognized");
12033 nextchar(pRExC_state);
12035 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12038 REQUIRE_BRANCHJ(flagp, 0);
12040 br = regbranch(pRExC_state, &flags, 1, depth+1);
12042 RETURN_FAIL_ON_RESTART(flags,flagp);
12043 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12046 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12049 REQUIRE_BRANCHJ(flagp, 0);
12051 c = UCHARAT(RExC_parse);
12052 nextchar(pRExC_state);
12053 if (flags&HASWIDTH)
12054 *flagp |= HASWIDTH;
12057 vFAIL("(?(DEFINE)....) does not allow branches");
12059 /* Fake one for optimizer. */
12060 lastbr = reganode(pRExC_state, IFTHEN, 0);
12062 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12063 RETURN_FAIL_ON_RESTART(flags, flagp);
12064 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12067 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12068 REQUIRE_BRANCHJ(flagp, 0);
12070 if (flags&HASWIDTH)
12071 *flagp |= HASWIDTH;
12072 c = UCHARAT(RExC_parse);
12073 nextchar(pRExC_state);
12078 if (RExC_parse >= RExC_end)
12079 vFAIL("Switch (?(condition)... not terminated");
12081 vFAIL("Switch (?(condition)... contains too many branches");
12083 ender = reg_node(pRExC_state, TAIL);
12084 if (! REGTAIL(pRExC_state, br, ender)) {
12085 REQUIRE_BRANCHJ(flagp, 0);
12088 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12089 REQUIRE_BRANCHJ(flagp, 0);
12091 if (! REGTAIL(pRExC_state,
12094 NEXTOPER(REGNODE_p(lastbr)))),
12097 REQUIRE_BRANCHJ(flagp, 0);
12101 if (! REGTAIL(pRExC_state, ret, ender)) {
12102 REQUIRE_BRANCHJ(flagp, 0);
12104 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12105 RExC_size++; /* XXX WHY do we need this?!!
12106 For large programs it seems to be required
12107 but I can't figure out why. -- dmq*/
12112 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12114 vFAIL("Unknown switch condition (?(...))");
12116 case '[': /* (?[ ... ]) */
12117 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12119 case 0: /* A NUL */
12120 RExC_parse--; /* for vFAIL to print correctly */
12121 vFAIL("Sequence (? incomplete");
12125 if (RExC_strict) { /* [perl #132851] */
12126 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12129 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12131 default: /* e.g., (?i) */
12132 RExC_parse = (char *) seqstart + 1;
12134 parse_lparen_question_flags(pRExC_state);
12135 if (UCHARAT(RExC_parse) != ':') {
12136 if (RExC_parse < RExC_end)
12137 nextchar(pRExC_state);
12142 nextchar(pRExC_state);
12147 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12151 if (! ALL_PARENS_COUNTED) {
12152 /* If we are in our first pass through (and maybe only pass),
12153 * we need to allocate memory for the capturing parentheses
12157 if (!RExC_parens_buf_size) {
12158 /* first guess at number of parens we might encounter */
12159 RExC_parens_buf_size = 10;
12161 /* setup RExC_open_parens, which holds the address of each
12162 * OPEN tag, and to make things simpler for the 0 index the
12163 * start of the program - this is used later for offsets */
12164 Newxz(RExC_open_parens, RExC_parens_buf_size,
12166 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12168 /* setup RExC_close_parens, which holds the address of each
12169 * CLOSE tag, and to make things simpler for the 0 index
12170 * the end of the program - this is used later for offsets
12172 Newxz(RExC_close_parens, RExC_parens_buf_size,
12174 /* we dont know where end op starts yet, so we dont need to
12175 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12178 else if (RExC_npar > RExC_parens_buf_size) {
12179 I32 old_size = RExC_parens_buf_size;
12181 RExC_parens_buf_size *= 2;
12183 Renew(RExC_open_parens, RExC_parens_buf_size,
12185 Zero(RExC_open_parens + old_size,
12186 RExC_parens_buf_size - old_size, regnode_offset);
12188 Renew(RExC_close_parens, RExC_parens_buf_size,
12190 Zero(RExC_close_parens + old_size,
12191 RExC_parens_buf_size - old_size, regnode_offset);
12195 ret = reganode(pRExC_state, OPEN, parno);
12196 if (!RExC_nestroot)
12197 RExC_nestroot = parno;
12198 if (RExC_open_parens && !RExC_open_parens[parno])
12200 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12201 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12202 22, "| |", (int)(depth * 2 + 1), "",
12204 RExC_open_parens[parno]= ret;
12207 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12208 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12211 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12220 /* Pick up the branches, linking them together. */
12221 parse_start = RExC_parse; /* MJD */
12222 br = regbranch(pRExC_state, &flags, 1, depth+1);
12224 /* branch_len = (paren != 0); */
12227 RETURN_FAIL_ON_RESTART(flags, flagp);
12228 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12230 if (*RExC_parse == '|') {
12231 if (RExC_use_BRANCHJ) {
12232 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12235 reginsert(pRExC_state, BRANCH, br, depth+1);
12236 Set_Node_Length(REGNODE_p(br), paren != 0);
12237 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12241 else if (paren == ':') {
12242 *flagp |= flags&SIMPLE;
12244 if (is_open) { /* Starts with OPEN. */
12245 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12246 REQUIRE_BRANCHJ(flagp, 0);
12249 else if (paren != '?') /* Not Conditional */
12251 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12253 while (*RExC_parse == '|') {
12254 if (RExC_use_BRANCHJ) {
12257 ender = reganode(pRExC_state, LONGJMP, 0);
12259 /* Append to the previous. */
12260 shut_gcc_up = REGTAIL(pRExC_state,
12261 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12263 PERL_UNUSED_VAR(shut_gcc_up);
12265 nextchar(pRExC_state);
12266 if (freeze_paren) {
12267 if (RExC_npar > after_freeze)
12268 after_freeze = RExC_npar;
12269 RExC_npar = freeze_paren;
12271 br = regbranch(pRExC_state, &flags, 0, depth+1);
12274 RETURN_FAIL_ON_RESTART(flags, flagp);
12275 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12277 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12278 REQUIRE_BRANCHJ(flagp, 0);
12281 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12284 if (have_branch || paren != ':') {
12287 /* Make a closing node, and hook it on the end. */
12290 ender = reg_node(pRExC_state, TAIL);
12293 ender = reganode(pRExC_state, CLOSE, parno);
12294 if ( RExC_close_parens ) {
12295 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12296 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12297 22, "| |", (int)(depth * 2 + 1), "",
12298 (IV)parno, ender));
12299 RExC_close_parens[parno]= ender;
12300 if (RExC_nestroot == parno)
12303 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12304 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12307 ender = reg_node(pRExC_state, SRCLOSE);
12308 RExC_in_script_run = 0;
12318 *flagp &= ~HASWIDTH;
12320 case 't': /* aTomic */
12322 ender = reg_node(pRExC_state, SUCCEED);
12325 ender = reg_node(pRExC_state, END);
12326 assert(!RExC_end_op); /* there can only be one! */
12327 RExC_end_op = REGNODE_p(ender);
12328 if (RExC_close_parens) {
12329 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12330 "%*s%*s Setting close paren #0 (END) to %zu\n",
12331 22, "| |", (int)(depth * 2 + 1), "",
12334 RExC_close_parens[0]= ender;
12339 DEBUG_PARSE_MSG("lsbr");
12340 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12341 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12342 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12343 SvPV_nolen_const(RExC_mysv1),
12345 SvPV_nolen_const(RExC_mysv2),
12347 (IV)(ender - lastbr)
12350 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12351 REQUIRE_BRANCHJ(flagp, 0);
12355 char is_nothing= 1;
12357 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12359 /* Hook the tails of the branches to the closing node. */
12360 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12361 const U8 op = PL_regkind[OP(br)];
12362 if (op == BRANCH) {
12363 if (! REGTAIL_STUDY(pRExC_state,
12364 REGNODE_OFFSET(NEXTOPER(br)),
12367 REQUIRE_BRANCHJ(flagp, 0);
12369 if ( OP(NEXTOPER(br)) != NOTHING
12370 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12373 else if (op == BRANCHJ) {
12374 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12375 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12377 PERL_UNUSED_VAR(shut_gcc_up);
12378 /* for now we always disable this optimisation * /
12379 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12380 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12386 regnode * ret_as_regnode = REGNODE_p(ret);
12387 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12388 ? regnext(ret_as_regnode)
12391 DEBUG_PARSE_MSG("NADA");
12392 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12393 NULL, pRExC_state);
12394 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12395 NULL, pRExC_state);
12396 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12397 SvPV_nolen_const(RExC_mysv1),
12398 (IV)REG_NODE_NUM(ret_as_regnode),
12399 SvPV_nolen_const(RExC_mysv2),
12405 if (OP(REGNODE_p(ender)) == TAIL) {
12407 RExC_emit= REGNODE_OFFSET(br) + 1;
12410 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12411 OP(opt)= OPTIMIZED;
12412 NEXT_OFF(br)= REGNODE_p(ender) - br;
12420 /* Even/odd or x=don't care: 010101x10x */
12421 static const char parens[] = "=!aA<,>Bbt";
12422 /* flag below is set to 0 up through 'A'; 1 for larger */
12424 if (paren && (p = strchr(parens, paren))) {
12425 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12426 int flag = (p - parens) > 3;
12428 if (paren == '>' || paren == 't') {
12429 node = SUSPEND, flag = 0;
12432 reginsert(pRExC_state, node, ret, depth+1);
12433 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12434 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12435 FLAGS(REGNODE_p(ret)) = flag;
12436 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12438 REQUIRE_BRANCHJ(flagp, 0);
12443 /* Check for proper termination. */
12445 /* restore original flags, but keep (?p) and, if we've encountered
12446 * something in the parse that changes /d rules into /u, keep the /u */
12447 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12448 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12449 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12451 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12452 RExC_parse = oregcomp_parse;
12453 vFAIL("Unmatched (");
12455 nextchar(pRExC_state);
12457 else if (!paren && RExC_parse < RExC_end) {
12458 if (*RExC_parse == ')') {
12460 vFAIL("Unmatched )");
12463 FAIL("Junk on end of regexp"); /* "Can't happen". */
12464 NOT_REACHED; /* NOTREACHED */
12467 if (RExC_in_lookbehind) {
12468 RExC_in_lookbehind--;
12470 if (RExC_in_lookahead) {
12471 RExC_in_lookahead--;
12473 if (after_freeze > RExC_npar)
12474 RExC_npar = after_freeze;
12479 - regbranch - one alternative of an | operator
12481 * Implements the concatenation operator.
12483 * On success, returns the offset at which any next node should be placed into
12484 * the regex engine program being compiled.
12486 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12487 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12490 STATIC regnode_offset
12491 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12493 regnode_offset ret;
12494 regnode_offset chain = 0;
12495 regnode_offset latest;
12496 I32 flags = 0, c = 0;
12497 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12499 PERL_ARGS_ASSERT_REGBRANCH;
12501 DEBUG_PARSE("brnc");
12506 if (RExC_use_BRANCHJ)
12507 ret = reganode(pRExC_state, BRANCHJ, 0);
12509 ret = reg_node(pRExC_state, BRANCH);
12510 Set_Node_Length(REGNODE_p(ret), 1);
12514 *flagp = WORST; /* Tentatively. */
12516 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12517 FALSE /* Don't force to /x */ );
12518 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12519 flags &= ~TRYAGAIN;
12520 latest = regpiece(pRExC_state, &flags, depth+1);
12522 if (flags & TRYAGAIN)
12524 RETURN_FAIL_ON_RESTART(flags, flagp);
12525 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12529 *flagp |= flags&(HASWIDTH|POSTPONED);
12530 if (chain == 0) /* First piece. */
12531 *flagp |= flags&SPSTART;
12533 /* FIXME adding one for every branch after the first is probably
12534 * excessive now we have TRIE support. (hv) */
12536 if (! REGTAIL(pRExC_state, chain, latest)) {
12537 /* XXX We could just redo this branch, but figuring out what
12538 * bookkeeping needs to be reset is a pain, and it's likely
12539 * that other branches that goto END will also be too large */
12540 REQUIRE_BRANCHJ(flagp, 0);
12546 if (chain == 0) { /* Loop ran zero times. */
12547 chain = reg_node(pRExC_state, NOTHING);
12552 *flagp |= flags&SIMPLE;
12559 - regpiece - something followed by possible quantifier * + ? {n,m}
12561 * Note that the branching code sequences used for ? and the general cases
12562 * of * and + are somewhat optimized: they use the same NOTHING node as
12563 * both the endmarker for their branch list and the body of the last branch.
12564 * It might seem that this node could be dispensed with entirely, but the
12565 * endmarker role is not redundant.
12567 * On success, returns the offset at which any next node should be placed into
12568 * the regex engine program being compiled.
12570 * Returns 0 otherwise, with *flagp set to indicate why:
12571 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12572 * RESTART_PARSE if the parse needs to be restarted, or'd with
12573 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12575 STATIC regnode_offset
12576 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12578 regnode_offset ret;
12582 const char * const origparse = RExC_parse;
12584 I32 max = REG_INFTY;
12585 #ifdef RE_TRACK_PATTERN_OFFSETS
12588 const char *maxpos = NULL;
12591 /* Save the original in case we change the emitted regop to a FAIL. */
12592 const regnode_offset orig_emit = RExC_emit;
12594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12596 PERL_ARGS_ASSERT_REGPIECE;
12598 DEBUG_PARSE("piec");
12600 ret = regatom(pRExC_state, &flags, depth+1);
12602 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12603 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12608 if (op == '{' && regcurly(RExC_parse)) {
12610 #ifdef RE_TRACK_PATTERN_OFFSETS
12611 parse_start = RExC_parse; /* MJD */
12613 next = RExC_parse + 1;
12614 while (isDIGIT(*next) || *next == ',') {
12615 if (*next == ',') {
12623 if (*next == '}') { /* got one */
12624 const char* endptr;
12628 if (isDIGIT(*RExC_parse)) {
12630 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12631 vFAIL("Invalid quantifier in {,}");
12632 if (uv >= REG_INFTY)
12633 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12638 if (*maxpos == ',')
12641 maxpos = RExC_parse;
12642 if (isDIGIT(*maxpos)) {
12644 if (!grok_atoUV(maxpos, &uv, &endptr))
12645 vFAIL("Invalid quantifier in {,}");
12646 if (uv >= REG_INFTY)
12647 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12650 max = REG_INFTY; /* meaning "infinity" */
12653 nextchar(pRExC_state);
12654 if (max < min) { /* If can't match, warn and optimize to fail
12656 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12657 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12658 NEXT_OFF(REGNODE_p(orig_emit)) =
12659 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12662 else if (min == max && *RExC_parse == '?')
12664 ckWARN2reg(RExC_parse + 1,
12665 "Useless use of greediness modifier '%c'",
12670 if ((flags&SIMPLE)) {
12671 if (min == 0 && max == REG_INFTY) {
12673 /* Going from 0..inf is currently forbidden in wildcard
12674 * subpatterns. The only reason is to make it harder to
12675 * write patterns that take a long long time to halt, and
12676 * because the use of this construct isn't necessary in
12677 * matching Unicode property values */
12678 if (RExC_pm_flags & PMf_WILDCARD) {
12680 /* diag_listed_as: Use of %s is not allowed in Unicode
12681 property wildcard subpatterns in regex; marked by
12682 <-- HERE in m/%s/ */
12683 vFAIL("Use of quantifier '*' is not allowed in"
12684 " Unicode property wildcard subpatterns");
12685 /* Note, don't need to worry about {0,}, as a '}' isn't
12686 * legal at all in wildcards, so wouldn't get this far
12689 reginsert(pRExC_state, STAR, ret, depth+1);
12691 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12694 if (min == 1 && max == REG_INFTY) {
12695 reginsert(pRExC_state, PLUS, ret, depth+1);
12697 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12700 MARK_NAUGHTY_EXP(2, 2);
12701 reginsert(pRExC_state, CURLY, ret, depth+1);
12702 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12703 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12706 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12708 FLAGS(REGNODE_p(w)) = 0;
12709 if (! REGTAIL(pRExC_state, ret, w)) {
12710 REQUIRE_BRANCHJ(flagp, 0);
12712 if (RExC_use_BRANCHJ) {
12713 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12714 reginsert(pRExC_state, NOTHING, ret, depth+1);
12715 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12717 reginsert(pRExC_state, CURLYX, ret, depth+1);
12719 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12720 Set_Node_Length(REGNODE_p(ret),
12721 op == '{' ? (RExC_parse - parse_start) : 1);
12723 if (RExC_use_BRANCHJ)
12724 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12726 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12729 REQUIRE_BRANCHJ(flagp, 0);
12731 RExC_whilem_seen++;
12732 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12734 FLAGS(REGNODE_p(ret)) = 0;
12739 *flagp |= HASWIDTH;
12740 ARG1_SET(REGNODE_p(ret), (U16)min);
12741 ARG2_SET(REGNODE_p(ret), (U16)max);
12742 if (max == REG_INFTY)
12743 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12749 if (!ISMULT1(op)) {
12754 #if 0 /* Now runtime fix should be reliable. */
12756 /* if this is reinstated, don't forget to put this back into perldiag:
12758 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12760 (F) The part of the regexp subject to either the * or + quantifier
12761 could match an empty string. The {#} shows in the regular
12762 expression about where the problem was discovered.
12766 if (!(flags&HASWIDTH) && op != '?')
12767 vFAIL("Regexp *+ operand could be empty");
12770 #ifdef RE_TRACK_PATTERN_OFFSETS
12771 parse_start = RExC_parse;
12773 nextchar(pRExC_state);
12775 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12781 else if (op == '+') {
12785 else if (op == '?') {
12790 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12791 if (origparse[0] == '\\' && origparse[1] == 'K') {
12793 "%" UTF8f " is forbidden - matches null string many times",
12794 UTF8fARG(UTF, (RExC_parse >= origparse
12795 ? RExC_parse - origparse
12800 ckWARN2reg(RExC_parse,
12801 "%" UTF8f " matches null string many times",
12802 UTF8fARG(UTF, (RExC_parse >= origparse
12803 ? RExC_parse - origparse
12809 if (*RExC_parse == '?') {
12810 nextchar(pRExC_state);
12811 reginsert(pRExC_state, MINMOD, ret, depth+1);
12812 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12813 REQUIRE_BRANCHJ(flagp, 0);
12816 else if (*RExC_parse == '+') {
12817 regnode_offset ender;
12818 nextchar(pRExC_state);
12819 ender = reg_node(pRExC_state, SUCCEED);
12820 if (! REGTAIL(pRExC_state, ret, ender)) {
12821 REQUIRE_BRANCHJ(flagp, 0);
12823 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12824 ender = reg_node(pRExC_state, TAIL);
12825 if (! REGTAIL(pRExC_state, ret, ender)) {
12826 REQUIRE_BRANCHJ(flagp, 0);
12830 if (ISMULT2(RExC_parse)) {
12832 vFAIL("Nested quantifiers");
12839 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12840 regnode_offset * node_p,
12848 /* This routine teases apart the various meanings of \N and returns
12849 * accordingly. The input parameters constrain which meaning(s) is/are valid
12850 * in the current context.
12852 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12854 * If <code_point_p> is not NULL, the context is expecting the result to be a
12855 * single code point. If this \N instance turns out to a single code point,
12856 * the function returns TRUE and sets *code_point_p to that code point.
12858 * If <node_p> is not NULL, the context is expecting the result to be one of
12859 * the things representable by a regnode. If this \N instance turns out to be
12860 * one such, the function generates the regnode, returns TRUE and sets *node_p
12861 * to point to the offset of that regnode into the regex engine program being
12864 * If this instance of \N isn't legal in any context, this function will
12865 * generate a fatal error and not return.
12867 * On input, RExC_parse should point to the first char following the \N at the
12868 * time of the call. On successful return, RExC_parse will have been updated
12869 * to point to just after the sequence identified by this routine. Also
12870 * *flagp has been updated as needed.
12872 * When there is some problem with the current context and this \N instance,
12873 * the function returns FALSE, without advancing RExC_parse, nor setting
12874 * *node_p, nor *code_point_p, nor *flagp.
12876 * If <cp_count> is not NULL, the caller wants to know the length (in code
12877 * points) that this \N sequence matches. This is set, and the input is
12878 * parsed for errors, even if the function returns FALSE, as detailed below.
12880 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12882 * Probably the most common case is for the \N to specify a single code point.
12883 * *cp_count will be set to 1, and *code_point_p will be set to that code
12886 * Another possibility is for the input to be an empty \N{}. This is no
12887 * longer accepted, and will generate a fatal error.
12889 * Another possibility is for a custom charnames handler to be in effect which
12890 * translates the input name to an empty string. *cp_count will be set to 0.
12891 * *node_p will be set to a generated NOTHING node.
12893 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12894 * set to 0. *node_p will be set to a generated REG_ANY node.
12896 * The fifth possibility is that \N resolves to a sequence of more than one
12897 * code points. *cp_count will be set to the number of code points in the
12898 * sequence. *node_p will be set to a generated node returned by this
12899 * function calling S_reg().
12901 * The final possibility is that it is premature to be calling this function;
12902 * the parse needs to be restarted. This can happen when this changes from
12903 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12904 * latter occurs only when the fifth possibility would otherwise be in
12905 * effect, and is because one of those code points requires the pattern to be
12906 * recompiled as UTF-8. The function returns FALSE, and sets the
12907 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12908 * happens, the caller needs to desist from continuing parsing, and return
12909 * this information to its caller. This is not set for when there is only one
12910 * code point, as this can be called as part of an ANYOF node, and they can
12911 * store above-Latin1 code points without the pattern having to be in UTF-8.
12913 * For non-single-quoted regexes, the tokenizer has resolved character and
12914 * sequence names inside \N{...} into their Unicode values, normalizing the
12915 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12916 * hex-represented code points in the sequence. This is done there because
12917 * the names can vary based on what charnames pragma is in scope at the time,
12918 * so we need a way to take a snapshot of what they resolve to at the time of
12919 * the original parse. [perl #56444].
12921 * That parsing is skipped for single-quoted regexes, so here we may get
12922 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12923 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12924 * the native character set for non-ASCII platforms. The other possibilities
12925 * are already native, so no translation is done. */
12927 char * endbrace; /* points to '}' following the name */
12928 char* p = RExC_parse; /* Temporary */
12930 SV * substitute_parse = NULL;
12935 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12937 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12939 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12940 assert(! (node_p && cp_count)); /* At most 1 should be set */
12942 if (cp_count) { /* Initialize return for the most common case */
12946 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12947 * modifier. The other meanings do not, so use a temporary until we find
12948 * out which we are being called with */
12949 skip_to_be_ignored_text(pRExC_state, &p,
12950 FALSE /* Don't force to /x */ );
12952 /* Disambiguate between \N meaning a named character versus \N meaning
12953 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12954 * quantifier, or if there is no '{' at all */
12955 if (*p != '{' || regcurly(p)) {
12965 *node_p = reg_node(pRExC_state, REG_ANY);
12966 *flagp |= HASWIDTH|SIMPLE;
12968 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12972 /* The test above made sure that the next real character is a '{', but
12973 * under the /x modifier, it could be separated by space (or a comment and
12974 * \n) and this is not allowed (for consistency with \x{...} and the
12975 * tokenizer handling of \N{NAME}). */
12976 if (*RExC_parse != '{') {
12977 vFAIL("Missing braces on \\N{}");
12980 RExC_parse++; /* Skip past the '{' */
12982 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12983 if (! endbrace) { /* no trailing brace */
12984 vFAIL2("Missing right brace on \\%c{}", 'N');
12987 /* Here, we have decided it should be a named character or sequence. These
12988 * imply Unicode semantics */
12989 REQUIRE_UNI_RULES(flagp, FALSE);
12991 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12992 * nothing at all (not allowed under strict) */
12993 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12994 RExC_parse = endbrace;
12996 RExC_parse++; /* Position after the "}" */
12997 vFAIL("Zero length \\N{}");
13003 nextchar(pRExC_state);
13008 *node_p = reg_node(pRExC_state, NOTHING);
13012 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13014 /* Here, the name isn't of the form U+.... This can happen if the
13015 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13016 * is the time to find out what the name means */
13018 const STRLEN name_len = endbrace - RExC_parse;
13019 SV * value_sv; /* What does this name evaluate to */
13021 const U8 * value; /* string of name's value */
13022 STRLEN value_len; /* and its length */
13024 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13025 * toke.c, and their values. Make sure is initialized */
13026 if (! RExC_unlexed_names) {
13027 RExC_unlexed_names = newHV();
13030 /* If we have already seen this name in this pattern, use that. This
13031 * allows us to only call the charnames handler once per name per
13032 * pattern. A broken or malicious handler could return something
13033 * different each time, which could cause the results to vary depending
13034 * on if something gets added or subtracted from the pattern that
13035 * causes the number of passes to change, for example */
13036 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13039 value_sv = *value_svp;
13041 else { /* Otherwise we have to go out and get the name */
13042 const char * error_msg = NULL;
13043 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13047 RExC_parse = endbrace;
13051 /* If no error message, should have gotten a valid return */
13054 /* Save the name's meaning for later use */
13055 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13058 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13062 /* Here, we have the value the name evaluates to in 'value_sv' */
13063 value = (U8 *) SvPV(value_sv, value_len);
13065 /* See if the result is one code point vs 0 or multiple */
13066 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13070 /* Here, exactly one code point. If that isn't what is wanted,
13072 if (! code_point_p) {
13077 /* Convert from string to numeric code point */
13078 *code_point_p = (SvUTF8(value_sv))
13079 ? valid_utf8_to_uvchr(value, NULL)
13082 /* Have parsed this entire single code point \N{...}. *cp_count
13083 * has already been set to 1, so don't do it again. */
13084 RExC_parse = endbrace;
13085 nextchar(pRExC_state);
13087 } /* End of is a single code point */
13089 /* Count the code points, if caller desires. The API says to do this
13090 * even if we will later return FALSE */
13094 *cp_count = (SvUTF8(value_sv))
13095 ? utf8_length(value, value + value_len)
13099 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13100 * But don't back the pointer up if the caller wants to know how many
13101 * code points there are (they need to handle it themselves in this
13110 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13111 * reg recursively to parse it. That way, it retains its atomicness,
13112 * while not having to worry about any special handling that some code
13113 * points may have. */
13115 substitute_parse = newSVpvs("?:");
13116 sv_catsv(substitute_parse, value_sv);
13117 sv_catpv(substitute_parse, ")");
13119 /* The value should already be native, so no need to convert on EBCDIC
13121 assert(! RExC_recode_x_to_native);
13124 else { /* \N{U+...} */
13125 Size_t count = 0; /* code point count kept internally */
13127 /* We can get to here when the input is \N{U+...} or when toke.c has
13128 * converted a name to the \N{U+...} form. This include changing a
13129 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13131 RExC_parse += 2; /* Skip past the 'U+' */
13133 /* Code points are separated by dots. The '}' terminates the whole
13136 do { /* Loop until the ending brace */
13137 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13138 | PERL_SCAN_SILENT_ILLDIGIT
13139 | PERL_SCAN_NOTIFY_ILLDIGIT
13140 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13141 | PERL_SCAN_DISALLOW_PREFIX;
13142 STRLEN len = endbrace - RExC_parse;
13144 char * start_digit = RExC_parse;
13145 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13150 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13155 if (cp > MAX_LEGAL_CP) {
13156 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13159 if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13164 /* Here, is a single code point; fail if doesn't want that */
13165 if (! code_point_p) {
13170 /* A single code point is easy to handle; just return it */
13171 *code_point_p = UNI_TO_NATIVE(cp);
13172 RExC_parse = endbrace;
13173 nextchar(pRExC_state);
13177 /* Here, the parse stopped bfore the ending brace. This is legal
13178 * only if that character is a dot separating code points, like a
13179 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13180 * So the next character must be a dot (and the one after that
13181 * can't be the endbrace, or we'd have something like \N{U+100.} )
13183 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13184 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13185 ? UTF8SKIP(RExC_parse)
13187 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13192 /* Here, looks like its really a multiple character sequence. Fail
13193 * if that's not what the caller wants. But continue with counting
13194 * and error checking if they still want a count */
13195 if (! node_p && ! cp_count) {
13199 /* What is done here is to convert this to a sub-pattern of the
13200 * form \x{char1}\x{char2}... and then call reg recursively to
13201 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13202 * atomicness, while not having to worry about special handling
13203 * that some code points may have. We don't create a subpattern,
13204 * but go through the motions of code point counting and error
13205 * checking, if the caller doesn't want a node returned. */
13207 if (node_p && ! substitute_parse) {
13208 substitute_parse = newSVpvs("?:");
13214 /* Convert to notation the rest of the code understands */
13215 sv_catpvs(substitute_parse, "\\x{");
13216 sv_catpvn(substitute_parse, start_digit,
13217 RExC_parse - start_digit);
13218 sv_catpvs(substitute_parse, "}");
13221 /* Move to after the dot (or ending brace the final time through.)
13226 } while (RExC_parse < endbrace);
13228 if (! node_p) { /* Doesn't want the node */
13235 sv_catpvs(substitute_parse, ")");
13237 /* The values are Unicode, and therefore have to be converted to native
13238 * on a non-Unicode (meaning non-ASCII) platform. */
13239 SET_recode_x_to_native(1);
13242 /* Here, we have the string the name evaluates to, ready to be parsed,
13243 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13244 * constructs. This can be called from within a substitute parse already.
13245 * The error reporting mechanism doesn't work for 2 levels of this, but the
13246 * code above has validated this new construct, so there should be no
13247 * errors generated by the below. And this isn' an exact copy, so the
13248 * mechanism to seamlessly deal with this won't work, so turn off warnings
13250 save_start = RExC_start;
13251 orig_end = RExC_end;
13253 RExC_parse = RExC_start = SvPVX(substitute_parse);
13254 RExC_end = RExC_parse + SvCUR(substitute_parse);
13255 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13257 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13259 /* Restore the saved values */
13261 RExC_start = save_start;
13262 RExC_parse = endbrace;
13263 RExC_end = orig_end;
13264 SET_recode_x_to_native(0);
13266 SvREFCNT_dec_NN(substitute_parse);
13269 RETURN_FAIL_ON_RESTART(flags, flagp);
13270 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13273 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13275 nextchar(pRExC_state);
13282 S_compute_EXACTish(RExC_state_t *pRExC_state)
13286 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13294 op = get_regex_charset(RExC_flags);
13295 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13296 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13297 been, so there is no hole */
13300 return op + EXACTF;
13304 S_new_regcurly(const char *s, const char *e)
13306 /* This is a temporary function designed to match the most lenient form of
13307 * a {m,n} quantifier we ever envision, with either number omitted, and
13308 * spaces anywhere between/before/after them.
13310 * If this function fails, then the string it matches is very unlikely to
13311 * ever be considered a valid quantifier, so we can allow the '{' that
13312 * begins it to be considered as a literal */
13314 bool has_min = FALSE;
13315 bool has_max = FALSE;
13317 PERL_ARGS_ASSERT_NEW_REGCURLY;
13319 if (s >= e || *s++ != '{')
13322 while (s < e && isSPACE(*s)) {
13325 while (s < e && isDIGIT(*s)) {
13329 while (s < e && isSPACE(*s)) {
13335 while (s < e && isSPACE(*s)) {
13338 while (s < e && isDIGIT(*s)) {
13342 while (s < e && isSPACE(*s)) {
13347 return s < e && *s == '}' && (has_min || has_max);
13350 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13351 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13354 S_backref_value(char *p, char *e)
13356 const char* endptr = e;
13358 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13365 - regatom - the lowest level
13367 Try to identify anything special at the start of the current parse position.
13368 If there is, then handle it as required. This may involve generating a
13369 single regop, such as for an assertion; or it may involve recursing, such as
13370 to handle a () structure.
13372 If the string doesn't start with something special then we gobble up
13373 as much literal text as we can. If we encounter a quantifier, we have to
13374 back off the final literal character, as that quantifier applies to just it
13375 and not to the whole string of literals.
13377 Once we have been able to handle whatever type of thing started the
13378 sequence, we return the offset into the regex engine program being compiled
13379 at which any next regnode should be placed.
13381 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13382 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13383 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13384 Otherwise does not return 0.
13386 Note: we have to be careful with escapes, as they can be both literal
13387 and special, and in the case of \10 and friends, context determines which.
13389 A summary of the code structure is:
13391 switch (first_byte) {
13392 cases for each special:
13393 handle this special;
13396 switch (2nd byte) {
13397 cases for each unambiguous special:
13398 handle this special;
13400 cases for each ambigous special/literal:
13402 if (special) handle here
13404 default: // unambiguously literal:
13407 default: // is a literal char
13410 create EXACTish node for literal;
13411 while (more input and node isn't full) {
13412 switch (input_byte) {
13413 cases for each special;
13414 make sure parse pointer is set so that the next call to
13415 regatom will see this special first
13416 goto loopdone; // EXACTish node terminated by prev. char
13418 append char to EXACTISH node;
13420 get next input byte;
13424 return the generated node;
13426 Specifically there are two separate switches for handling
13427 escape sequences, with the one for handling literal escapes requiring
13428 a dummy entry for all of the special escapes that are actually handled
13433 STATIC regnode_offset
13434 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13437 regnode_offset ret = 0;
13443 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13445 *flagp = WORST; /* Tentatively. */
13447 DEBUG_PARSE("atom");
13449 PERL_ARGS_ASSERT_REGATOM;
13452 parse_start = RExC_parse;
13453 assert(RExC_parse < RExC_end);
13454 switch ((U8)*RExC_parse) {
13456 RExC_seen_zerolen++;
13457 nextchar(pRExC_state);
13458 if (RExC_flags & RXf_PMf_MULTILINE)
13459 ret = reg_node(pRExC_state, MBOL);
13461 ret = reg_node(pRExC_state, SBOL);
13462 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13465 nextchar(pRExC_state);
13467 RExC_seen_zerolen++;
13468 if (RExC_flags & RXf_PMf_MULTILINE)
13469 ret = reg_node(pRExC_state, MEOL);
13471 ret = reg_node(pRExC_state, SEOL);
13472 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13475 nextchar(pRExC_state);
13476 if (RExC_flags & RXf_PMf_SINGLELINE)
13477 ret = reg_node(pRExC_state, SANY);
13479 ret = reg_node(pRExC_state, REG_ANY);
13480 *flagp |= HASWIDTH|SIMPLE;
13482 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13486 char * const oregcomp_parse = ++RExC_parse;
13487 ret = regclass(pRExC_state, flagp, depth+1,
13488 FALSE, /* means parse the whole char class */
13489 TRUE, /* allow multi-char folds */
13490 FALSE, /* don't silence non-portable warnings. */
13491 (bool) RExC_strict,
13492 TRUE, /* Allow an optimized regnode result */
13495 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13496 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13499 if (*RExC_parse != ']') {
13500 RExC_parse = oregcomp_parse;
13501 vFAIL("Unmatched [");
13503 nextchar(pRExC_state);
13504 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13508 nextchar(pRExC_state);
13509 ret = reg(pRExC_state, 2, &flags, depth+1);
13511 if (flags & TRYAGAIN) {
13512 if (RExC_parse >= RExC_end) {
13513 /* Make parent create an empty node if needed. */
13514 *flagp |= TRYAGAIN;
13519 RETURN_FAIL_ON_RESTART(flags, flagp);
13520 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13523 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13527 if (flags & TRYAGAIN) {
13528 *flagp |= TRYAGAIN;
13531 vFAIL("Internal urp");
13532 /* Supposed to be caught earlier. */
13538 vFAIL("Quantifier follows nothing");
13543 This switch handles escape sequences that resolve to some kind
13544 of special regop and not to literal text. Escape sequences that
13545 resolve to literal text are handled below in the switch marked
13548 Every entry in this switch *must* have a corresponding entry
13549 in the literal escape switch. However, the opposite is not
13550 required, as the default for this switch is to jump to the
13551 literal text handling code.
13554 switch ((U8)*RExC_parse) {
13555 /* Special Escapes */
13557 RExC_seen_zerolen++;
13558 /* Under wildcards, this is changed to match \n; should be
13559 * invisible to the user, as they have to compile under /m */
13560 if (RExC_pm_flags & PMf_WILDCARD) {
13561 ret = reg_node(pRExC_state, MBOL);
13564 ret = reg_node(pRExC_state, SBOL);
13565 /* SBOL is shared with /^/ so we set the flags so we can tell
13566 * /\A/ from /^/ in split. */
13567 FLAGS(REGNODE_p(ret)) = 1;
13568 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13570 goto finish_meta_pat;
13572 if (RExC_pm_flags & PMf_WILDCARD) {
13574 /* diag_listed_as: Use of %s is not allowed in Unicode property
13575 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13577 vFAIL("Use of '\\G' is not allowed in Unicode property"
13578 " wildcard subpatterns");
13580 ret = reg_node(pRExC_state, GPOS);
13581 RExC_seen |= REG_GPOS_SEEN;
13583 goto finish_meta_pat;
13585 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13586 RExC_seen_zerolen++;
13587 ret = reg_node(pRExC_state, KEEPS);
13589 /* XXX:dmq : disabling in-place substitution seems to
13590 * be necessary here to avoid cases of memory corruption, as
13591 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13593 RExC_seen |= REG_LOOKBEHIND_SEEN;
13594 goto finish_meta_pat;
13597 ++RExC_parse; /* advance past the 'K' */
13598 vFAIL("\\K not permitted in lookahead/lookbehind");
13601 if (RExC_pm_flags & PMf_WILDCARD) {
13602 /* See comment under \A above */
13603 ret = reg_node(pRExC_state, MEOL);
13606 ret = reg_node(pRExC_state, SEOL);
13607 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13609 RExC_seen_zerolen++; /* Do not optimize RE away */
13610 goto finish_meta_pat;
13612 if (RExC_pm_flags & PMf_WILDCARD) {
13613 /* See comment under \A above */
13614 ret = reg_node(pRExC_state, MEOL);
13617 ret = reg_node(pRExC_state, EOS);
13618 *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
13620 RExC_seen_zerolen++; /* Do not optimize RE away */
13621 goto finish_meta_pat;
13623 vFAIL("\\C no longer supported");
13625 ret = reg_node(pRExC_state, CLUMP);
13626 *flagp |= HASWIDTH;
13627 goto finish_meta_pat;
13635 regex_charset charset = get_regex_charset(RExC_flags);
13637 RExC_seen_zerolen++;
13638 RExC_seen |= REG_LOOKBEHIND_SEEN;
13639 op = BOUND + charset;
13641 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13642 flags = TRADITIONAL_BOUND;
13643 if (op > BOUNDA) { /* /aa is same as /a */
13649 char name = *RExC_parse;
13650 char * endbrace = NULL;
13652 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13655 vFAIL2("Missing right brace on \\%c{}", name);
13657 /* XXX Need to decide whether to take spaces or not. Should be
13658 * consistent with \p{}, but that currently is SPACE, which
13659 * means vertical too, which seems wrong
13660 * while (isBLANK(*RExC_parse)) {
13663 if (endbrace == RExC_parse) {
13664 RExC_parse++; /* After the '}' */
13665 vFAIL2("Empty \\%c{}", name);
13667 length = endbrace - RExC_parse;
13668 /*while (isBLANK(*(RExC_parse + length - 1))) {
13671 switch (*RExC_parse) {
13674 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13676 goto bad_bound_type;
13681 if (length != 2 || *(RExC_parse + 1) != 'b') {
13682 goto bad_bound_type;
13687 if (length != 2 || *(RExC_parse + 1) != 'b') {
13688 goto bad_bound_type;
13693 if (length != 2 || *(RExC_parse + 1) != 'b') {
13694 goto bad_bound_type;
13700 RExC_parse = endbrace;
13702 "'%" UTF8f "' is an unknown bound type",
13703 UTF8fARG(UTF, length, endbrace - length));
13704 NOT_REACHED; /*NOTREACHED*/
13706 RExC_parse = endbrace;
13707 REQUIRE_UNI_RULES(flagp, 0);
13712 else if (op >= BOUNDA) { /* /aa is same as /a */
13716 /* Don't have to worry about UTF-8, in this message because
13717 * to get here the contents of the \b must be ASCII */
13718 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13719 "Using /u for '%.*s' instead of /%s",
13721 endbrace - length + 1,
13722 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13723 ? ASCII_RESTRICT_PAT_MODS
13724 : ASCII_MORE_RESTRICT_PAT_MODS);
13729 RExC_seen_d_op = TRUE;
13731 else if (op == BOUNDL) {
13732 RExC_contains_locale = 1;
13736 op += NBOUND - BOUND;
13739 ret = reg_node(pRExC_state, op);
13740 FLAGS(REGNODE_p(ret)) = flags;
13744 goto finish_meta_pat;
13748 ret = reg_node(pRExC_state, LNBREAK);
13749 *flagp |= HASWIDTH|SIMPLE;
13750 goto finish_meta_pat;
13764 /* These all have the same meaning inside [brackets], and it knows
13765 * how to do the best optimizations for them. So, pretend we found
13766 * these within brackets, and let it do the work */
13769 ret = regclass(pRExC_state, flagp, depth+1,
13770 TRUE, /* means just parse this element */
13771 FALSE, /* don't allow multi-char folds */
13772 FALSE, /* don't silence non-portable warnings. It
13773 would be a bug if these returned
13775 (bool) RExC_strict,
13776 TRUE, /* Allow an optimized regnode result */
13778 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13779 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13780 * multi-char folds are allowed. */
13782 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13785 RExC_parse--; /* regclass() leaves this one too far ahead */
13788 /* The escapes above that don't take a parameter can't be
13789 * followed by a '{'. But 'pX', 'p{foo}' and
13790 * correspondingly 'P' can be */
13791 if ( RExC_parse - parse_start == 1
13792 && UCHARAT(RExC_parse + 1) == '{'
13793 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13796 vFAIL("Unescaped left brace in regex is illegal here");
13798 Set_Node_Offset(REGNODE_p(ret), parse_start);
13799 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13800 nextchar(pRExC_state);
13803 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13804 * \N{...} evaluates to a sequence of more than one code points).
13805 * The function call below returns a regnode, which is our result.
13806 * The parameters cause it to fail if the \N{} evaluates to a
13807 * single code point; we handle those like any other literal. The
13808 * reason that the multicharacter case is handled here and not as
13809 * part of the EXACtish code is because of quantifiers. In
13810 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13811 * this way makes that Just Happen. dmq.
13812 * join_exact() will join this up with adjacent EXACTish nodes
13813 * later on, if appropriate. */
13815 if (grok_bslash_N(pRExC_state,
13816 &ret, /* Want a regnode returned */
13817 NULL, /* Fail if evaluates to a single code
13819 NULL, /* Don't need a count of how many code
13828 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13830 /* Here, evaluates to a single code point. Go get that */
13831 RExC_parse = parse_start;
13834 case 'k': /* Handle \k<NAME> and \k'NAME' */
13838 if ( RExC_parse >= RExC_end - 1
13839 || (( ch = RExC_parse[1]) != '<'
13844 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13845 vFAIL2("Sequence %.2s... not terminated", parse_start);
13848 ret = handle_named_backref(pRExC_state,
13860 case '1': case '2': case '3': case '4':
13861 case '5': case '6': case '7': case '8': case '9':
13866 if (*RExC_parse == 'g') {
13870 if (*RExC_parse == '{') {
13874 if (*RExC_parse == '-') {
13878 if (hasbrace && !isDIGIT(*RExC_parse)) {
13879 if (isrel) RExC_parse--;
13881 goto parse_named_seq;
13884 if (RExC_parse >= RExC_end) {
13885 goto unterminated_g;
13887 num = S_backref_value(RExC_parse, RExC_end);
13889 vFAIL("Reference to invalid group 0");
13890 else if (num == I32_MAX) {
13891 if (isDIGIT(*RExC_parse))
13892 vFAIL("Reference to nonexistent group");
13895 vFAIL("Unterminated \\g... pattern");
13899 num = RExC_npar - num;
13901 vFAIL("Reference to nonexistent or unclosed group");
13905 num = S_backref_value(RExC_parse, RExC_end);
13906 /* bare \NNN might be backref or octal - if it is larger
13907 * than or equal RExC_npar then it is assumed to be an
13908 * octal escape. Note RExC_npar is +1 from the actual
13909 * number of parens. */
13910 /* Note we do NOT check if num == I32_MAX here, as that is
13911 * handled by the RExC_npar check */
13914 /* any numeric escape < 10 is always a backref */
13916 /* any numeric escape < RExC_npar is a backref */
13917 && num >= RExC_npar
13918 /* cannot be an octal escape if it starts with 8 */
13919 && *RExC_parse != '8'
13920 /* cannot be an octal escape if it starts with 9 */
13921 && *RExC_parse != '9'
13923 /* Probably not meant to be a backref, instead likely
13924 * to be an octal character escape, e.g. \35 or \777.
13925 * The above logic should make it obvious why using
13926 * octal escapes in patterns is problematic. - Yves */
13927 RExC_parse = parse_start;
13932 /* At this point RExC_parse points at a numeric escape like
13933 * \12 or \88 or something similar, which we should NOT treat
13934 * as an octal escape. It may or may not be a valid backref
13935 * escape. For instance \88888888 is unlikely to be a valid
13937 while (isDIGIT(*RExC_parse))
13940 if (*RExC_parse != '}')
13941 vFAIL("Unterminated \\g{...} pattern");
13944 if (num >= (I32)RExC_npar) {
13946 /* It might be a forward reference; we can't fail until we
13947 * know, by completing the parse to get all the groups, and
13948 * then reparsing */
13949 if (ALL_PARENS_COUNTED) {
13950 if (num >= RExC_total_parens) {
13951 vFAIL("Reference to nonexistent group");
13955 REQUIRE_PARENS_PASS;
13959 ret = reganode(pRExC_state,
13962 : (ASCII_FOLD_RESTRICTED)
13964 : (AT_LEAST_UNI_SEMANTICS)
13970 if (OP(REGNODE_p(ret)) == REFF) {
13971 RExC_seen_d_op = TRUE;
13973 *flagp |= HASWIDTH;
13975 /* override incorrect value set in reganode MJD */
13976 Set_Node_Offset(REGNODE_p(ret), parse_start);
13977 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13978 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13979 FALSE /* Don't force to /x */ );
13983 if (RExC_parse >= RExC_end)
13984 FAIL("Trailing \\");
13987 /* Do not generate "unrecognized" warnings here, we fall
13988 back into the quick-grab loop below */
13989 RExC_parse = parse_start;
13991 } /* end of switch on a \foo sequence */
13996 /* '#' comments should have been spaced over before this function was
13998 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14000 if (RExC_flags & RXf_PMf_EXTENDED) {
14001 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14002 if (RExC_parse < RExC_end)
14012 /* Here, we have determined that the next thing is probably a
14013 * literal character. RExC_parse points to the first byte of its
14014 * definition. (It still may be an escape sequence that evaluates
14015 * to a single character) */
14020 char *s, *old_s = NULL, *old_old_s = NULL;
14022 U32 max_string_len = 255;
14024 /* We may have to reparse the node, artificially stopping filling
14025 * it early, based on info gleaned in the first parse. This
14026 * variable gives where we stop. Make it above the normal stopping
14027 * place first time through; otherwise it would stop too early */
14028 U32 upper_fill = max_string_len + 1;
14030 /* We start out as an EXACT node, even if under /i, until we find a
14031 * character which is in a fold. The algorithm now segregates into
14032 * separate nodes, characters that fold from those that don't under
14033 * /i. (This hopefully will create nodes that are fixed strings
14034 * even under /i, giving the optimizer something to grab on to.)
14035 * So, if a node has something in it and the next character is in
14036 * the opposite category, that node is closed up, and the function
14037 * returns. Then regatom is called again, and a new node is
14038 * created for the new category. */
14039 U8 node_type = EXACT;
14041 /* Assume the node will be fully used; the excess is given back at
14042 * the end. Under /i, we may need to temporarily add the fold of
14043 * an extra character or two at the end to check for splitting
14044 * multi-char folds, so allocate extra space for that. We can't
14045 * make any other length assumptions, as a byte input sequence
14046 * could shrink down. */
14047 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14051 ? UTF8_MAXBYTES_CASE
14052 /* Max non-UTF-8 expansion is 2 */ : 2)));
14054 bool next_is_quantifier;
14055 char * oldp = NULL;
14057 /* We can convert EXACTF nodes to EXACTFU if they contain only
14058 * characters that match identically regardless of the target
14059 * string's UTF8ness. The reason to do this is that EXACTF is not
14060 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14063 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14064 * contain only above-Latin1 characters (hence must be in UTF8),
14065 * which don't participate in folds with Latin1-range characters,
14066 * as the latter's folds aren't known until runtime. */
14067 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14069 /* Single-character EXACTish nodes are almost always SIMPLE. This
14070 * allows us to override this as encountered */
14071 U8 maybe_SIMPLE = SIMPLE;
14073 /* Does this node contain something that can't match unless the
14074 * target string is (also) in UTF-8 */
14075 bool requires_utf8_target = FALSE;
14077 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14078 bool has_ss = FALSE;
14080 /* So is the MICRO SIGN */
14081 bool has_micro_sign = FALSE;
14083 /* Set when we fill up the current node and there is still more
14084 * text to process */
14087 /* Allocate an EXACT node. The node_type may change below to
14088 * another EXACTish node, but since the size of the node doesn't
14089 * change, it works */
14090 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14092 FILL_NODE(ret, node_type);
14095 s = STRING(REGNODE_p(ret));
14106 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14107 maybe_SIMPLE = SIMPLE;
14108 requires_utf8_target = FALSE;
14110 has_micro_sign = FALSE;
14114 /* This breaks under rare circumstances. If folding, we do not
14115 * want to split a node at a character that is a non-final in a
14116 * multi-char fold, as an input string could just happen to want to
14117 * match across the node boundary. The code at the end of the loop
14118 * looks for this, and backs off until it finds not such a
14119 * character, but it is possible (though extremely, extremely
14120 * unlikely) for all characters in the node to be non-final fold
14121 * ones, in which case we just leave the node fully filled, and
14122 * hope that it doesn't match the string in just the wrong place */
14124 assert( ! UTF /* Is at the beginning of a character */
14125 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14126 || UTF8_IS_START(UCHARAT(RExC_parse)));
14128 overflowed = FALSE;
14130 /* Here, we have a literal character. Find the maximal string of
14131 * them in the input that we can fit into a single EXACTish node.
14132 * We quit at the first non-literal or when the node gets full, or
14133 * under /i the categorization of folding/non-folding character
14135 while (p < RExC_end && len < upper_fill) {
14137 /* In most cases each iteration adds one byte to the output.
14138 * The exceptions override this */
14139 Size_t added_len = 1;
14145 /* White space has already been ignored */
14146 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14147 || ! is_PATWS_safe((p), RExC_end, UTF));
14150 const char* message;
14163 /* Literal Escapes Switch
14165 This switch is meant to handle escape sequences that
14166 resolve to a literal character.
14168 Every escape sequence that represents something
14169 else, like an assertion or a char class, is handled
14170 in the switch marked 'Special Escapes' above in this
14171 routine, but also has an entry here as anything that
14172 isn't explicitly mentioned here will be treated as
14173 an unescaped equivalent literal.
14176 switch ((U8)*++p) {
14178 /* These are all the special escapes. */
14179 case 'A': /* Start assertion */
14180 case 'b': case 'B': /* Word-boundary assertion*/
14181 case 'C': /* Single char !DANGEROUS! */
14182 case 'd': case 'D': /* digit class */
14183 case 'g': case 'G': /* generic-backref, pos assertion */
14184 case 'h': case 'H': /* HORIZWS */
14185 case 'k': case 'K': /* named backref, keep marker */
14186 case 'p': case 'P': /* Unicode property */
14187 case 'R': /* LNBREAK */
14188 case 's': case 'S': /* space class */
14189 case 'v': case 'V': /* VERTWS */
14190 case 'w': case 'W': /* word class */
14191 case 'X': /* eXtended Unicode "combining
14192 character sequence" */
14193 case 'z': case 'Z': /* End of line/string assertion */
14197 /* Anything after here is an escape that resolves to a
14198 literal. (Except digits, which may or may not)
14204 case 'N': /* Handle a single-code point named character. */
14205 RExC_parse = p + 1;
14206 if (! grok_bslash_N(pRExC_state,
14207 NULL, /* Fail if evaluates to
14208 anything other than a
14209 single code point */
14210 &ender, /* The returned single code
14212 NULL, /* Don't need a count of
14213 how many code points */
14218 if (*flagp & NEED_UTF8)
14219 FAIL("panic: grok_bslash_N set NEED_UTF8");
14220 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14222 /* Here, it wasn't a single code point. Go close
14223 * up this EXACTish node. The switch() prior to
14224 * this switch handles the other cases */
14225 RExC_parse = p = oldp;
14229 RExC_parse = parse_start;
14231 /* The \N{} means the pattern, if previously /d,
14232 * becomes /u. That means it can't be an EXACTF node,
14233 * but an EXACTFU */
14234 if (node_type == EXACTF) {
14235 node_type = EXACTFU;
14237 /* If the node already contains something that
14238 * differs between EXACTF and EXACTFU, reparse it
14240 if (! maybe_exactfu) {
14261 ender = ESC_NATIVE;
14269 if (! grok_bslash_o(&p,
14274 (bool) RExC_strict,
14275 FALSE, /* No illegal cp's */
14278 RExC_parse = p; /* going to die anyway; point to
14279 exact spot of failure */
14283 if (message && TO_OUTPUT_WARNINGS(p)) {
14284 warn_non_literal_string(p, packed_warn, message);
14288 if (! grok_bslash_x(&p,
14293 (bool) RExC_strict,
14294 FALSE, /* No illegal cp's */
14297 RExC_parse = p; /* going to die anyway; point
14298 to exact spot of failure */
14302 if (message && TO_OUTPUT_WARNINGS(p)) {
14303 warn_non_literal_string(p, packed_warn, message);
14307 if (ender < 0x100) {
14308 if (RExC_recode_x_to_native) {
14309 ender = LATIN1_TO_NATIVE(ender);
14316 if (! grok_bslash_c(*p, &grok_c_char,
14317 &message, &packed_warn))
14319 /* going to die anyway; point to exact spot of
14321 RExC_parse = p + ((UTF)
14322 ? UTF8_SAFE_SKIP(p, RExC_end)
14327 ender = grok_c_char;
14329 if (message && TO_OUTPUT_WARNINGS(p)) {
14330 warn_non_literal_string(p, packed_warn, message);
14334 case '8': case '9': /* must be a backreference */
14336 /* we have an escape like \8 which cannot be an octal escape
14337 * so we exit the loop, and let the outer loop handle this
14338 * escape which may or may not be a legitimate backref. */
14340 case '1': case '2': case '3':case '4':
14341 case '5': case '6': case '7':
14342 /* When we parse backslash escapes there is ambiguity
14343 * between backreferences and octal escapes. Any escape
14344 * from \1 - \9 is a backreference, any multi-digit
14345 * escape which does not start with 0 and which when
14346 * evaluated as decimal could refer to an already
14347 * parsed capture buffer is a back reference. Anything
14350 * Note this implies that \118 could be interpreted as
14351 * 118 OR as "\11" . "8" depending on whether there
14352 * were 118 capture buffers defined already in the
14355 /* NOTE, RExC_npar is 1 more than the actual number of
14356 * parens we have seen so far, hence the "<" as opposed
14358 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14359 { /* Not to be treated as an octal constant, go
14367 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14368 | PERL_SCAN_NOTIFY_ILLDIGIT;
14370 ender = grok_oct(p, &numlen, &flags, NULL);
14372 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14373 && isDIGIT(*p) /* like \08, \178 */
14374 && ckWARN(WARN_REGEXP))
14376 reg_warn_non_literal_string(
14378 form_alien_digit_msg(8, numlen, p,
14379 RExC_end, UTF, FALSE));
14385 FAIL("Trailing \\");
14388 if (isALPHANUMERIC(*p)) {
14389 /* An alpha followed by '{' is going to fail next
14390 * iteration, so don't output this warning in that
14392 if (! isALPHA(*p) || *(p + 1) != '{') {
14393 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14394 " passed through", p);
14397 goto normal_default;
14398 } /* End of switch on '\' */
14401 /* Trying to gain new uses for '{' without breaking too
14402 * much existing code is hard. The solution currently
14404 * 1) If there is no ambiguity that a '{' should always
14405 * be taken literally, at the start of a construct, we
14407 * 2) If the literal '{' conflicts with our desired use
14408 * of it as a metacharacter, we die. The deprecation
14409 * cycles for this have come and gone.
14410 * 3) If there is ambiguity, we raise a simple warning.
14411 * This could happen, for example, if the user
14412 * intended it to introduce a quantifier, but slightly
14413 * misspelled the quantifier. Without this warning,
14414 * the quantifier would silently be taken as a literal
14415 * string of characters instead of a meta construct */
14416 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14418 || ( p > parse_start + 1
14419 && isALPHA_A(*(p - 1))
14420 && *(p - 2) == '\\')
14421 || new_regcurly(p, RExC_end))
14423 RExC_parse = p + 1;
14424 vFAIL("Unescaped left brace in regex is "
14427 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14428 " passed through");
14430 goto normal_default;
14433 if (p > RExC_parse && RExC_strict) {
14434 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14437 default: /* A literal character */
14439 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14441 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14442 &numlen, UTF8_ALLOW_DEFAULT);
14448 } /* End of switch on the literal */
14450 /* Here, have looked at the literal character, and <ender>
14451 * contains its ordinal; <p> points to the character after it.
14455 REQUIRE_UTF8(flagp);
14456 if ( UNICODE_IS_PERL_EXTENDED(ender)
14457 && TO_OUTPUT_WARNINGS(p))
14459 ckWARN2_non_literal_string(p,
14460 packWARN(WARN_PORTABLE),
14461 PL_extended_cp_format,
14466 /* We need to check if the next non-ignored thing is a
14467 * quantifier. Move <p> to after anything that should be
14468 * ignored, which, as a side effect, positions <p> for the next
14469 * loop iteration */
14470 skip_to_be_ignored_text(pRExC_state, &p,
14471 FALSE /* Don't force to /x */ );
14473 /* If the next thing is a quantifier, it applies to this
14474 * character only, which means that this character has to be in
14475 * its own node and can't just be appended to the string in an
14476 * existing node, so if there are already other characters in
14477 * the node, close the node with just them, and set up to do
14478 * this character again next time through, when it will be the
14479 * only thing in its new node */
14481 next_is_quantifier = LIKELY(p < RExC_end)
14482 && UNLIKELY(ISMULT2(p));
14484 if (next_is_quantifier && LIKELY(len)) {
14489 /* Ready to add 'ender' to the node */
14491 if (! FOLD) { /* The simple case, just append the literal */
14494 /* Don't output if it would overflow */
14495 if (UNLIKELY(len > max_string_len - ((UTF)
14496 ? UVCHR_SKIP(ender)
14503 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14504 *(s++) = (char) ender;
14507 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14508 added_len = (char *) new_s - s;
14509 s = (char *) new_s;
14512 requires_utf8_target = TRUE;
14516 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14518 /* Here are folding under /l, and the code point is
14519 * problematic. If this is the first character in the
14520 * node, change the node type to folding. Otherwise, if
14521 * this is the first problematic character, close up the
14522 * existing node, so can start a new node with this one */
14524 node_type = EXACTFL;
14525 RExC_contains_locale = 1;
14527 else if (node_type == EXACT) {
14532 /* This problematic code point means we can't simplify
14534 maybe_exactfu = FALSE;
14536 /* Here, we are adding a problematic fold character.
14537 * "Problematic" in this context means that its fold isn't
14538 * known until runtime. (The non-problematic code points
14539 * are the above-Latin1 ones that fold to also all
14540 * above-Latin1. Their folds don't vary no matter what the
14541 * locale is.) But here we have characters whose fold
14542 * depends on the locale. We just add in the unfolded
14543 * character, and wait until runtime to fold it */
14544 goto not_fold_common;
14546 else /* regular fold; see if actually is in a fold */
14547 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14549 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14551 /* Here, folding, but the character isn't in a fold.
14553 * Start a new node if previous characters in the node were
14555 if (len && node_type != EXACT) {
14560 /* Here, continuing a node with non-folded characters. Add
14562 goto not_fold_common;
14564 else { /* Here, does participate in some fold */
14566 /* If this is the first character in the node, change its
14567 * type to folding. Otherwise, if this is the first
14568 * folding character in the node, close up the existing
14569 * node, so can start a new node with this one. */
14571 node_type = compute_EXACTish(pRExC_state);
14573 else if (node_type == EXACT) {
14578 if (UTF) { /* Alway use the folded value for UTF-8
14580 if (UVCHR_IS_INVARIANT(ender)) {
14581 if (UNLIKELY(len + 1 > max_string_len)) {
14586 *(s)++ = (U8) toFOLD(ender);
14589 UV folded = _to_uni_fold_flags(
14591 (U8 *) s, /* We have allocated extra space
14592 in 's' so can't run off the
14595 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14596 ? FOLD_FLAGS_NOMIX_ASCII
14598 if (UNLIKELY(len + added_len > max_string_len)) {
14606 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14608 /* U+B5 folds to the MU, so its possible for a
14609 * non-UTF-8 target to match it */
14610 requires_utf8_target = TRUE;
14614 else { /* Here is non-UTF8. */
14616 /* The fold will be one or (rarely) two characters.
14617 * Check that there's room for at least a single one
14618 * before setting any flags, etc. Because otherwise an
14619 * overflowing character could cause a flag to be set
14620 * even though it doesn't end up in this node. (For
14621 * the two character fold, we check again, before
14622 * setting any flags) */
14623 if (UNLIKELY(len + 1 > max_string_len)) {
14628 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14629 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14630 || UNICODE_DOT_DOT_VERSION > 0)
14632 /* On non-ancient Unicodes, check for the only possible
14633 * multi-char fold */
14634 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14636 /* This potential multi-char fold means the node
14637 * can't be simple (because it could match more
14638 * than a single char). And in some cases it will
14639 * match 'ss', so set that flag */
14643 /* It can't change to be an EXACTFU (unless already
14644 * is one). We fold it iff under /u rules. */
14645 if (node_type != EXACTFU) {
14646 maybe_exactfu = FALSE;
14649 if (UNLIKELY(len + 2 > max_string_len)) {
14658 goto done_with_this_char;
14661 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14663 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14665 /* Also, the sequence 'ss' is special when not
14666 * under /u. If the target string is UTF-8, it
14667 * should match SHARP S; otherwise it won't. So,
14668 * here we have to exclude the possibility of this
14669 * node moving to /u.*/
14671 maybe_exactfu = FALSE;
14674 /* Here, the fold will be a single character */
14676 if (UNLIKELY(ender == MICRO_SIGN)) {
14677 has_micro_sign = TRUE;
14679 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14681 /* If the character's fold differs between /d and
14682 * /u, this can't change to be an EXACTFU node */
14683 maybe_exactfu = FALSE;
14686 *(s++) = (DEPENDS_SEMANTICS)
14687 ? (char) toFOLD(ender)
14689 /* Under /u, the fold of any character in
14690 * the 0-255 range happens to be its
14691 * lowercase equivalent, except for LATIN
14692 * SMALL LETTER SHARP S, which was handled
14693 * above, and the MICRO SIGN, whose fold
14694 * requires UTF-8 to represent. */
14695 : (char) toLOWER_L1(ender);
14697 } /* End of adding current character to the node */
14699 done_with_this_char:
14703 if (next_is_quantifier) {
14705 /* Here, the next input is a quantifier, and to get here,
14706 * the current character is the only one in the node. */
14710 } /* End of loop through literal characters */
14712 /* Here we have either exhausted the input or run out of room in
14713 * the node. If the former, we are done. (If we encountered a
14714 * character that can't be in the node, transfer is made directly
14715 * to <loopdone>, and so we wouldn't have fallen off the end of the
14717 if (LIKELY(! overflowed)) {
14721 /* Here we have run out of room. We can grow plain EXACT and
14722 * LEXACT nodes. If the pattern is gigantic enough, though,
14723 * eventually we'll have to artificially chunk the pattern into
14724 * multiple nodes. */
14725 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14726 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14727 Size_t overhead_expansion = 0;
14729 Size_t max_nodes_for_string;
14733 /* Here we couldn't fit the final character in the current
14734 * node, so it will have to be reparsed, no matter what else we
14738 /* If would have overflowed a regular EXACT node, switch
14739 * instead to an LEXACT. The code below is structured so that
14740 * the actual growing code is common to changing from an EXACT
14741 * or just increasing the LEXACT size. This means that we have
14742 * to save the string in the EXACT case before growing, and
14743 * then copy it afterwards to its new location */
14744 if (node_type == EXACT) {
14745 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14746 RExC_emit += overhead_expansion;
14747 Copy(s0, temp, len, char);
14750 /* Ready to grow. If it was a plain EXACT, the string was
14751 * saved, and the first few bytes of it overwritten by adding
14752 * an argument field. We assume, as we do elsewhere in this
14753 * file, that one byte of remaining input will translate into
14754 * one byte of output, and if that's too small, we grow again,
14755 * if too large the excess memory is freed at the end */
14757 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14758 achievable = MIN(max_nodes_for_string,
14759 current_string_nodes + STR_SZ(RExC_end - p));
14760 delta = achievable - current_string_nodes;
14762 /* If there is just no more room, go finish up this chunk of
14768 change_engine_size(pRExC_state, delta + overhead_expansion);
14769 current_string_nodes += delta;
14771 = sizeof(struct regnode) * current_string_nodes;
14772 upper_fill = max_string_len + 1;
14774 /* If the length was small, we know this was originally an
14775 * EXACT node now converted to LEXACT, and the string has to be
14776 * restored. Otherwise the string was untouched. 260 is just
14777 * a number safely above 255 so don't have to worry about
14778 * getting it precise */
14780 node_type = LEXACT;
14781 FILL_NODE(ret, node_type);
14782 s0 = STRING(REGNODE_p(ret));
14783 Copy(temp, s0, len, char);
14787 goto continue_parse;
14790 bool splittable = FALSE;
14791 bool backed_up = FALSE;
14792 char * e; /* should this be U8? */
14793 char * s_start; /* should this be U8? */
14795 /* Here is /i. Running out of room creates a problem if we are
14796 * folding, and the split happens in the middle of a
14797 * multi-character fold, as a match that should have occurred,
14798 * won't, due to the way nodes are matched, and our artificial
14799 * boundary. So back off until we aren't splitting such a
14800 * fold. If there is no such place to back off to, we end up
14801 * taking the entire node as-is. This can happen if the node
14802 * consists entirely of 'f' or entirely of 's' characters (or
14803 * things that fold to them) as 'ff' and 'ss' are
14804 * multi-character folds.
14806 * The Unicode standard says that multi character folds consist
14807 * of either two or three characters. That means we would be
14808 * splitting one if the final character in the node is at the
14809 * beginning of either type, or is the second of a three
14813 * ender is the code point of the character that won't fit
14815 * s points to just beyond the final byte in the node.
14816 * It's where we would place ender if there were
14817 * room, and where in fact we do place ender's fold
14818 * in the code below, as we've over-allocated space
14819 * for s0 (hence s) to allow for this
14820 * e starts at 's' and advances as we append things.
14821 * old_s is the same as 's'. (If ender had fit, 's' would
14822 * have been advanced to beyond it).
14823 * old_old_s points to the beginning byte of the final
14824 * character in the node
14825 * p points to the beginning byte in the input of the
14826 * character beyond 'ender'.
14827 * oldp points to the beginning byte in the input of
14830 * In the case of /il, we haven't folded anything that could be
14831 * affected by the locale. That means only above-Latin1
14832 * characters that fold to other above-latin1 characters get
14833 * folded at compile time. To check where a good place to
14834 * split nodes is, everything in it will have to be folded.
14835 * The boolean 'maybe_exactfu' keeps track in /il if there are
14836 * any unfolded characters in the node. */
14837 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14839 /* If we do need to fold the node, we need a place to store the
14840 * folded copy, and a way to map back to the unfolded original
14842 char * locfold_buf = NULL;
14843 Size_t * loc_correspondence = NULL;
14845 if (! need_to_fold_loc) { /* The normal case. Just
14846 initialize to the actual node */
14849 s = old_old_s; /* Point to the beginning of the final char
14850 that fits in the node */
14854 /* Here, we have filled a /il node, and there are unfolded
14855 * characters in it. If the runtime locale turns out to be
14856 * UTF-8, there are possible multi-character folds, just
14857 * like when not under /l. The node hence can't terminate
14858 * in the middle of such a fold. To determine this, we
14859 * have to create a folded copy of this node. That means
14860 * reparsing the node, folding everything assuming a UTF-8
14861 * locale. (If at runtime it isn't such a locale, the
14862 * actions here wouldn't have been necessary, but we have
14863 * to assume the worst case.) If we find we need to back
14864 * off the folded string, we do so, and then map that
14865 * position back to the original unfolded node, which then
14866 * gets output, truncated at that spot */
14868 char * redo_p = RExC_parse;
14872 /* Allow enough space assuming a single byte input folds to
14873 * a single byte output, plus assume that the two unparsed
14874 * characters (that we may need) fold to the largest number
14875 * of bytes possible, plus extra for one more worst case
14876 * scenario. In the loop below, if we start eating into
14877 * that final spare space, we enlarge this initial space */
14878 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14880 Newxz(locfold_buf, size, char);
14881 Newxz(loc_correspondence, size, Size_t);
14883 /* Redo this node's parse, folding into 'locfold_buf' */
14884 redo_p = RExC_parse;
14885 old_redo_e = redo_e = locfold_buf;
14886 while (redo_p <= oldp) {
14888 old_redo_e = redo_e;
14889 loc_correspondence[redo_e - locfold_buf]
14890 = redo_p - RExC_parse;
14895 (void) _to_utf8_fold_flags((U8 *) redo_p,
14900 redo_e += added_len;
14901 redo_p += UTF8SKIP(redo_p);
14905 /* Note that if this code is run on some ancient
14906 * Unicode versions, SHARP S doesn't fold to 'ss',
14907 * but rather than clutter the code with #ifdef's,
14908 * as is done above, we ignore that possibility.
14909 * This is ok because this code doesn't affect what
14910 * gets matched, but merely where the node gets
14912 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14913 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14923 /* If we're getting so close to the end that a
14924 * worst-case fold in the next character would cause us
14925 * to overflow, increase, assuming one byte output byte
14926 * per one byte input one, plus room for another worst
14928 if ( redo_p <= oldp
14929 && redo_e > locfold_buf + size
14930 - (UTF8_MAXBYTES_CASE + 1))
14932 Size_t new_size = size
14934 + UTF8_MAXBYTES_CASE + 1;
14935 Ptrdiff_t e_offset = redo_e - locfold_buf;
14937 Renew(locfold_buf, new_size, char);
14938 Renew(loc_correspondence, new_size, Size_t);
14941 redo_e = locfold_buf + e_offset;
14945 /* Set so that things are in terms of the folded, temporary
14948 s_start = locfold_buf;
14953 /* Here, we have 's', 's_start' and 'e' set up to point to the
14954 * input that goes into the node, folded.
14956 * If the final character of the node and the fold of ender
14957 * form the first two characters of a three character fold, we
14958 * need to peek ahead at the next (unparsed) character in the
14959 * input to determine if the three actually do form such a
14960 * fold. Just looking at that character is not generally
14961 * sufficient, as it could be, for example, an escape sequence
14962 * that evaluates to something else, and it needs to be folded.
14964 * khw originally thought to just go through the parse loop one
14965 * extra time, but that doesn't work easily as that iteration
14966 * could cause things to think that the parse is over and to
14967 * goto loopdone. The character could be a '$' for example, or
14968 * the character beyond could be a quantifier, and other
14969 * glitches as well.
14971 * The solution used here for peeking ahead is to look at that
14972 * next character. If it isn't ASCII punctuation, then it will
14973 * be something that continues in an EXACTish node if there
14974 * were space. We append the fold of it to s, having reserved
14975 * enough room in s0 for the purpose. If we can't reasonably
14976 * peek ahead, we instead assume the worst case: that it is
14977 * something that would form the completion of a multi-char
14980 * If we can't split between s and ender, we work backwards
14981 * character-by-character down to s0. At each current point
14982 * see if we are at the beginning of a multi-char fold. If so,
14983 * that means we would be splitting the fold across nodes, and
14984 * so we back up one and try again.
14986 * If we're not at the beginning, we still could be at the
14987 * final two characters of a (rare) three character fold. We
14988 * check if the sequence starting at the character before the
14989 * current position (and including the current and next
14990 * characters) is a three character fold. If not, the node can
14991 * be split here. If it is, we have to backup two characters
14994 * Otherwise, the node can be split at the current position.
14996 * The same logic is used for UTF-8 patterns and not */
15000 /* Append the fold of ender */
15001 (void) _to_uni_fold_flags(
15005 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15006 ? FOLD_FLAGS_NOMIX_ASCII
15010 /* 's' and the character folded to by ender may be the
15011 * first two of a three-character fold, in which case the
15012 * node should not be split here. That may mean examining
15013 * the so-far unparsed character starting at 'p'. But if
15014 * ender folded to more than one character, we already have
15015 * three characters to look at. Also, we first check if
15016 * the sequence consisting of s and the next character form
15017 * the first two of some three character fold. If not,
15018 * there's no need to peek ahead. */
15019 if ( added_len <= UTF8SKIP(e - added_len)
15020 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15022 /* Here, the two do form the beginning of a potential
15023 * three character fold. The unexamined character may
15024 * or may not complete it. Peek at it. It might be
15025 * something that ends the node or an escape sequence,
15026 * in which case we don't know without a lot of work
15027 * what it evaluates to, so we have to assume the worst
15028 * case: that it does complete the fold, and so we
15029 * can't split here. All such instances will have
15030 * that character be an ASCII punctuation character,
15031 * like a backslash. So, for that case, backup one and
15032 * drop down to try at that position */
15034 s = (char *) utf8_hop_back((U8 *) s, -1,
15039 /* Here, since it's not punctuation, it must be a
15040 * real character, and we can append its fold to
15041 * 'e' (having deliberately reserved enough space
15042 * for this eventuality) and drop down to check if
15043 * the three actually do form a folded sequence */
15044 (void) _to_utf8_fold_flags(
15045 (U8 *) p, (U8 *) RExC_end,
15048 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15049 ? FOLD_FLAGS_NOMIX_ASCII
15055 /* Here, we either have three characters available in
15056 * sequence starting at 's', or we have two characters and
15057 * know that the following one can't possibly be part of a
15058 * three character fold. We go through the node backwards
15059 * until we find a place where we can split it without
15060 * breaking apart a multi-character fold. At any given
15061 * point we have to worry about if such a fold begins at
15062 * the current 's', and also if a three-character fold
15063 * begins at s-1, (containing s and s+1). Splitting in
15064 * either case would break apart a fold */
15066 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15069 /* If is a multi-char fold, can't split here. Backup
15070 * one char and try again */
15071 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15077 /* If the two characters beginning at 's' are part of a
15078 * three character fold starting at the character
15079 * before s, we can't split either before or after s.
15080 * Backup two chars and try again */
15081 if ( LIKELY(s > s_start)
15082 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15085 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15090 /* Here there's no multi-char fold between s and the
15091 * next character following it. We can split */
15095 } while (s > s_start); /* End of loops backing up through the node */
15097 /* Here we either couldn't find a place to split the node,
15098 * or else we broke out of the loop setting 'splittable' to
15099 * true. In the latter case, the place to split is between
15100 * the first and second characters in the sequence starting
15106 else { /* Pattern not UTF-8 */
15107 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15108 || ASCII_FOLD_RESTRICTED)
15110 assert( toLOWER_L1(ender) < 256 );
15111 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15119 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15126 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15127 || ASCII_FOLD_RESTRICTED)
15129 assert( toLOWER_L1(ender) < 256 );
15130 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15140 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15146 if ( LIKELY(s > s_start)
15147 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15157 } while (s > s_start);
15164 /* Here, we are done backing up. If we didn't backup at all
15165 * (the likely case), just proceed */
15168 /* If we did find a place to split, reparse the entire node
15169 * stopping where we have calculated. */
15172 /* If we created a temporary folded string under /l, we
15173 * have to map that back to the original */
15174 if (need_to_fold_loc) {
15175 upper_fill = loc_correspondence[s - s_start];
15176 Safefree(locfold_buf);
15177 Safefree(loc_correspondence);
15179 if (upper_fill == 0) {
15180 FAIL2("panic: loc_correspondence[%d] is 0",
15181 (int) (s - s_start));
15185 upper_fill = s - s0;
15189 else if (need_to_fold_loc) {
15190 Safefree(locfold_buf);
15191 Safefree(loc_correspondence);
15194 /* Here the node consists entirely of non-final multi-char
15195 * folds. (Likely it is all 'f's or all 's's.) There's no
15196 * decent place to split it, so give up and just take the
15200 } /* End of verifying node ends with an appropriate char */
15202 /* We need to start the next node at the character that didn't fit
15206 loopdone: /* Jumped to when encounters something that shouldn't be
15209 /* Free up any over-allocated space; cast is to silence bogus
15210 * warning in MS VC */
15211 change_engine_size(pRExC_state,
15212 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15214 /* I (khw) don't know if you can get here with zero length, but the
15215 * old code handled this situation by creating a zero-length EXACT
15216 * node. Might as well be NOTHING instead */
15218 OP(REGNODE_p(ret)) = NOTHING;
15222 /* If the node type is EXACT here, check to see if it
15223 * should be EXACTL, or EXACT_REQ8. */
15224 if (node_type == EXACT) {
15226 node_type = EXACTL;
15228 else if (requires_utf8_target) {
15229 node_type = EXACT_REQ8;
15232 else if (node_type == LEXACT) {
15233 if (requires_utf8_target) {
15234 node_type = LEXACT_REQ8;
15238 if ( UNLIKELY(has_micro_sign || has_ss)
15239 && (node_type == EXACTFU || ( node_type == EXACTF
15240 && maybe_exactfu)))
15241 { /* These two conditions are problematic in non-UTF-8
15244 node_type = EXACTFUP;
15246 else if (node_type == EXACTFL) {
15248 /* 'maybe_exactfu' is deliberately set above to
15249 * indicate this node type, where all code points in it
15251 if (maybe_exactfu) {
15252 node_type = EXACTFLU8;
15255 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15257 /* A character that folds to more than one will
15258 * match multiple characters, so can't be SIMPLE.
15259 * We don't have to worry about this with EXACTFLU8
15260 * nodes just above, as they have already been
15261 * folded (since the fold doesn't vary at run
15262 * time). Here, if the final character in the node
15263 * folds to multiple, it can't be simple. (This
15264 * only has an effect if the node has only a single
15265 * character, hence the final one, as elsewhere we
15266 * turn off simple for nodes whose length > 1 */
15270 else if (node_type == EXACTF) { /* Means is /di */
15272 /* This intermediate variable is needed solely because
15273 * the asserts in the macro where used exceed Win32's
15274 * literal string capacity */
15275 char first_char = * STRING(REGNODE_p(ret));
15277 /* If 'maybe_exactfu' is clear, then we need to stay
15278 * /di. If it is set, it means there are no code
15279 * points that match differently depending on UTF8ness
15280 * of the target string, so it can become an EXACTFU
15282 if (! maybe_exactfu) {
15283 RExC_seen_d_op = TRUE;
15285 else if ( isALPHA_FOLD_EQ(first_char, 's')
15286 || isALPHA_FOLD_EQ(ender, 's'))
15288 /* But, if the node begins or ends in an 's' we
15289 * have to defer changing it into an EXACTFU, as
15290 * the node could later get joined with another one
15291 * that ends or begins with 's' creating an 'ss'
15292 * sequence which would then wrongly match the
15293 * sharp s without the target being UTF-8. We
15294 * create a special node that we resolve later when
15295 * we join nodes together */
15297 node_type = EXACTFU_S_EDGE;
15300 node_type = EXACTFU;
15304 if (requires_utf8_target && node_type == EXACTFU) {
15305 node_type = EXACTFU_REQ8;
15309 OP(REGNODE_p(ret)) = node_type;
15310 setSTR_LEN(REGNODE_p(ret), len);
15311 RExC_emit += STR_SZ(len);
15313 /* If the node isn't a single character, it can't be SIMPLE */
15314 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15318 *flagp |= HASWIDTH | maybe_SIMPLE;
15321 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15325 /* len is STRLEN which is unsigned, need to copy to signed */
15328 vFAIL("Internal disaster");
15331 } /* End of label 'defchar:' */
15333 } /* End of giant switch on input character */
15335 /* Position parse to next real character */
15336 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15337 FALSE /* Don't force to /x */ );
15338 if ( *RExC_parse == '{'
15339 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15341 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15343 vFAIL("Unescaped left brace in regex is illegal here");
15345 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15346 " passed through");
15354 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15356 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15357 * sets up the bitmap and any flags, removing those code points from the
15358 * inversion list, setting it to NULL should it become completely empty */
15362 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15363 assert(PL_regkind[OP(node)] == ANYOF);
15365 /* There is no bitmap for this node type */
15366 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15370 ANYOF_BITMAP_ZERO(node);
15371 if (*invlist_ptr) {
15373 /* This gets set if we actually need to modify things */
15374 bool change_invlist = FALSE;
15378 /* Start looking through *invlist_ptr */
15379 invlist_iterinit(*invlist_ptr);
15380 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15384 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15385 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15388 /* Quit if are above what we should change */
15389 if (start >= NUM_ANYOF_CODE_POINTS) {
15393 change_invlist = TRUE;
15395 /* Set all the bits in the range, up to the max that we are doing */
15396 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15398 : NUM_ANYOF_CODE_POINTS - 1;
15399 for (i = start; i <= (int) high; i++) {
15400 if (! ANYOF_BITMAP_TEST(node, i)) {
15401 ANYOF_BITMAP_SET(node, i);
15405 invlist_iterfinish(*invlist_ptr);
15407 /* Done with loop; remove any code points that are in the bitmap from
15408 * *invlist_ptr; similarly for code points above the bitmap if we have
15409 * a flag to match all of them anyways */
15410 if (change_invlist) {
15411 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15413 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15414 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15417 /* If have completely emptied it, remove it completely */
15418 if (_invlist_len(*invlist_ptr) == 0) {
15419 SvREFCNT_dec_NN(*invlist_ptr);
15420 *invlist_ptr = NULL;
15425 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15426 Character classes ([:foo:]) can also be negated ([:^foo:]).
15427 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15428 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15429 but trigger failures because they are currently unimplemented. */
15431 #define POSIXCC_DONE(c) ((c) == ':')
15432 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15433 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15434 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15436 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15437 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15438 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15440 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15442 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15444 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15445 if (posix_warnings) { \
15446 if (! RExC_warn_text ) RExC_warn_text = \
15447 (AV *) sv_2mortal((SV *) newAV()); \
15448 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15452 REPORT_LOCATION_ARGS(p))); \
15455 #define CLEAR_POSIX_WARNINGS() \
15457 if (posix_warnings && RExC_warn_text) \
15458 av_clear(RExC_warn_text); \
15461 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15463 CLEAR_POSIX_WARNINGS(); \
15468 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15470 const char * const s, /* Where the putative posix class begins.
15471 Normally, this is one past the '['. This
15472 parameter exists so it can be somewhere
15473 besides RExC_parse. */
15474 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15476 AV ** posix_warnings, /* Where to place any generated warnings, or
15478 const bool check_only /* Don't die if error */
15481 /* This parses what the caller thinks may be one of the three POSIX
15483 * 1) a character class, like [:blank:]
15484 * 2) a collating symbol, like [. .]
15485 * 3) an equivalence class, like [= =]
15486 * In the latter two cases, it croaks if it finds a syntactically legal
15487 * one, as these are not handled by Perl.
15489 * The main purpose is to look for a POSIX character class. It returns:
15490 * a) the class number
15491 * if it is a completely syntactically and semantically legal class.
15492 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15493 * closing ']' of the class
15494 * b) OOB_NAMEDCLASS
15495 * if it appears that one of the three POSIX constructs was meant, but
15496 * its specification was somehow defective. 'updated_parse_ptr', if
15497 * not NULL, is set to point to the character just after the end
15498 * character of the class. See below for handling of warnings.
15499 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15500 * if it doesn't appear that a POSIX construct was intended.
15501 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15504 * In b) there may be errors or warnings generated. If 'check_only' is
15505 * TRUE, then any errors are discarded. Warnings are returned to the
15506 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15507 * instead it is NULL, warnings are suppressed.
15509 * The reason for this function, and its complexity is that a bracketed
15510 * character class can contain just about anything. But it's easy to
15511 * mistype the very specific posix class syntax but yielding a valid
15512 * regular bracketed class, so it silently gets compiled into something
15513 * quite unintended.
15515 * The solution adopted here maintains backward compatibility except that
15516 * it adds a warning if it looks like a posix class was intended but
15517 * improperly specified. The warning is not raised unless what is input
15518 * very closely resembles one of the 14 legal posix classes. To do this,
15519 * it uses fuzzy parsing. It calculates how many single-character edits it
15520 * would take to transform what was input into a legal posix class. Only
15521 * if that number is quite small does it think that the intention was a
15522 * posix class. Obviously these are heuristics, and there will be cases
15523 * where it errs on one side or another, and they can be tweaked as
15524 * experience informs.
15526 * The syntax for a legal posix class is:
15528 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15530 * What this routine considers syntactically to be an intended posix class
15531 * is this (the comments indicate some restrictions that the pattern
15534 * qr/(?x: \[? # The left bracket, possibly
15536 * \h* # possibly followed by blanks
15537 * (?: \^ \h* )? # possibly a misplaced caret
15538 * [:;]? # The opening class character,
15539 * # possibly omitted. A typo
15540 * # semi-colon can also be used.
15542 * \^? # possibly a correctly placed
15543 * # caret, but not if there was also
15544 * # a misplaced one
15546 * .{3,15} # The class name. If there are
15547 * # deviations from the legal syntax,
15548 * # its edit distance must be close
15549 * # to a real class name in order
15550 * # for it to be considered to be
15551 * # an intended posix class.
15553 * [[:punct:]]? # The closing class character,
15554 * # possibly omitted. If not a colon
15555 * # nor semi colon, the class name
15556 * # must be even closer to a valid
15559 * \]? # The right bracket, possibly
15563 * In the above, \h must be ASCII-only.
15565 * These are heuristics, and can be tweaked as field experience dictates.
15566 * There will be cases when someone didn't intend to specify a posix class
15567 * that this warns as being so. The goal is to minimize these, while
15568 * maximizing the catching of things intended to be a posix class that
15569 * aren't parsed as such.
15573 const char * const e = RExC_end;
15574 unsigned complement = 0; /* If to complement the class */
15575 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15576 bool has_opening_bracket = FALSE;
15577 bool has_opening_colon = FALSE;
15578 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15580 const char * possible_end = NULL; /* used for a 2nd parse pass */
15581 const char* name_start; /* ptr to class name first char */
15583 /* If the number of single-character typos the input name is away from a
15584 * legal name is no more than this number, it is considered to have meant
15585 * the legal name */
15586 int max_distance = 2;
15588 /* to store the name. The size determines the maximum length before we
15589 * decide that no posix class was intended. Should be at least
15590 * sizeof("alphanumeric") */
15592 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15594 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15596 CLEAR_POSIX_WARNINGS();
15599 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15602 if (*(p - 1) != '[') {
15603 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15604 found_problem = TRUE;
15607 has_opening_bracket = TRUE;
15610 /* They could be confused and think you can put spaces between the
15613 found_problem = TRUE;
15617 } while (p < e && isBLANK(*p));
15619 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15622 /* For [. .] and [= =]. These are quite different internally from [: :],
15623 * so they are handled separately. */
15624 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15625 and 1 for at least one char in it
15628 const char open_char = *p;
15629 const char * temp_ptr = p + 1;
15631 /* These two constructs are not handled by perl, and if we find a
15632 * syntactically valid one, we croak. khw, who wrote this code, finds
15633 * this explanation of them very unclear:
15634 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15635 * And searching the rest of the internet wasn't very helpful either.
15636 * It looks like just about any byte can be in these constructs,
15637 * depending on the locale. But unless the pattern is being compiled
15638 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15639 * In that case, it looks like [= =] isn't allowed at all, and that
15640 * [. .] could be any single code point, but for longer strings the
15641 * constituent characters would have to be the ASCII alphabetics plus
15642 * the minus-hyphen. Any sensible locale definition would limit itself
15643 * to these. And any portable one definitely should. Trying to parse
15644 * the general case is a nightmare (see [perl #127604]). So, this code
15645 * looks only for interiors of these constructs that match:
15647 * Using \w relaxes the apparent rules a little, without adding much
15648 * danger of mistaking something else for one of these constructs.
15650 * [. .] in some implementations described on the internet is usable to
15651 * escape a character that otherwise is special in bracketed character
15652 * classes. For example [.].] means a literal right bracket instead of
15653 * the ending of the class
15655 * [= =] can legitimately contain a [. .] construct, but we don't
15656 * handle this case, as that [. .] construct will later get parsed
15657 * itself and croak then. And [= =] is checked for even when not under
15658 * /l, as Perl has long done so.
15660 * The code below relies on there being a trailing NUL, so it doesn't
15661 * have to keep checking if the parse ptr < e.
15663 if (temp_ptr[1] == open_char) {
15666 else while ( temp_ptr < e
15667 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15672 if (*temp_ptr == open_char) {
15674 if (*temp_ptr == ']') {
15676 if (! found_problem && ! check_only) {
15677 RExC_parse = (char *) temp_ptr;
15678 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15679 "extensions", open_char, open_char);
15682 /* Here, the syntax wasn't completely valid, or else the call
15683 * is to check-only */
15684 if (updated_parse_ptr) {
15685 *updated_parse_ptr = (char *) temp_ptr;
15688 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15692 /* If we find something that started out to look like one of these
15693 * constructs, but isn't, we continue below so that it can be checked
15694 * for being a class name with a typo of '.' or '=' instead of a colon.
15698 /* Here, we think there is a possibility that a [: :] class was meant, and
15699 * we have the first real character. It could be they think the '^' comes
15702 found_problem = TRUE;
15703 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15708 found_problem = TRUE;
15712 } while (p < e && isBLANK(*p));
15714 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15718 /* But the first character should be a colon, which they could have easily
15719 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15720 * distinguish from a colon, so treat that as a colon). */
15723 has_opening_colon = TRUE;
15725 else if (*p == ';') {
15726 found_problem = TRUE;
15728 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15729 has_opening_colon = TRUE;
15732 found_problem = TRUE;
15733 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15735 /* Consider an initial punctuation (not one of the recognized ones) to
15736 * be a left terminator */
15737 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15742 /* They may think that you can put spaces between the components */
15744 found_problem = TRUE;
15748 } while (p < e && isBLANK(*p));
15750 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15755 /* We consider something like [^:^alnum:]] to not have been intended to
15756 * be a posix class, but XXX maybe we should */
15758 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15765 /* Again, they may think that you can put spaces between the components */
15767 found_problem = TRUE;
15771 } while (p < e && isBLANK(*p));
15773 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15778 /* XXX This ']' may be a typo, and something else was meant. But
15779 * treating it as such creates enough complications, that that
15780 * possibility isn't currently considered here. So we assume that the
15781 * ']' is what is intended, and if we've already found an initial '[',
15782 * this leaves this construct looking like [:] or [:^], which almost
15783 * certainly weren't intended to be posix classes */
15784 if (has_opening_bracket) {
15785 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15788 /* But this function can be called when we parse the colon for
15789 * something like qr/[alpha:]]/, so we back up to look for the
15794 found_problem = TRUE;
15795 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15797 else if (*p != ':') {
15799 /* XXX We are currently very restrictive here, so this code doesn't
15800 * consider the possibility that, say, /[alpha.]]/ was intended to
15801 * be a posix class. */
15802 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15805 /* Here we have something like 'foo:]'. There was no initial colon,
15806 * and we back up over 'foo. XXX Unlike the going forward case, we
15807 * don't handle typos of non-word chars in the middle */
15808 has_opening_colon = FALSE;
15811 while (p > RExC_start && isWORDCHAR(*p)) {
15816 /* Here, we have positioned ourselves to where we think the first
15817 * character in the potential class is */
15820 /* Now the interior really starts. There are certain key characters that
15821 * can end the interior, or these could just be typos. To catch both
15822 * cases, we may have to do two passes. In the first pass, we keep on
15823 * going unless we come to a sequence that matches
15824 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15825 * This means it takes a sequence to end the pass, so two typos in a row if
15826 * that wasn't what was intended. If the class is perfectly formed, just
15827 * this one pass is needed. We also stop if there are too many characters
15828 * being accumulated, but this number is deliberately set higher than any
15829 * real class. It is set high enough so that someone who thinks that
15830 * 'alphanumeric' is a correct name would get warned that it wasn't.
15831 * While doing the pass, we keep track of where the key characters were in
15832 * it. If we don't find an end to the class, and one of the key characters
15833 * was found, we redo the pass, but stop when we get to that character.
15834 * Thus the key character was considered a typo in the first pass, but a
15835 * terminator in the second. If two key characters are found, we stop at
15836 * the second one in the first pass. Again this can miss two typos, but
15837 * catches a single one
15839 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15840 * point to the first key character. For the second pass, it starts as -1.
15846 bool has_blank = FALSE;
15847 bool has_upper = FALSE;
15848 bool has_terminating_colon = FALSE;
15849 bool has_terminating_bracket = FALSE;
15850 bool has_semi_colon = FALSE;
15851 unsigned int name_len = 0;
15852 int punct_count = 0;
15856 /* Squeeze out blanks when looking up the class name below */
15857 if (isBLANK(*p) ) {
15859 found_problem = TRUE;
15864 /* The name will end with a punctuation */
15866 const char * peek = p + 1;
15868 /* Treat any non-']' punctuation followed by a ']' (possibly
15869 * with intervening blanks) as trying to terminate the class.
15870 * ']]' is very likely to mean a class was intended (but
15871 * missing the colon), but the warning message that gets
15872 * generated shows the error position better if we exit the
15873 * loop at the bottom (eventually), so skip it here. */
15875 if (peek < e && isBLANK(*peek)) {
15877 found_problem = TRUE;
15880 } while (peek < e && isBLANK(*peek));
15883 if (peek < e && *peek == ']') {
15884 has_terminating_bracket = TRUE;
15886 has_terminating_colon = TRUE;
15888 else if (*p == ';') {
15889 has_semi_colon = TRUE;
15890 has_terminating_colon = TRUE;
15893 found_problem = TRUE;
15900 /* Here we have punctuation we thought didn't end the class.
15901 * Keep track of the position of the key characters that are
15902 * more likely to have been class-enders */
15903 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15905 /* Allow just one such possible class-ender not actually
15906 * ending the class. */
15907 if (possible_end) {
15913 /* If we have too many punctuation characters, no use in
15915 if (++punct_count > max_distance) {
15919 /* Treat the punctuation as a typo. */
15920 input_text[name_len++] = *p;
15923 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15924 input_text[name_len++] = toLOWER(*p);
15926 found_problem = TRUE;
15928 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15929 input_text[name_len++] = *p;
15933 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15937 /* The declaration of 'input_text' is how long we allow a potential
15938 * class name to be, before saying they didn't mean a class name at
15940 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15945 /* We get to here when the possible class name hasn't been properly
15946 * terminated before:
15947 * 1) we ran off the end of the pattern; or
15948 * 2) found two characters, each of which might have been intended to
15949 * be the name's terminator
15950 * 3) found so many punctuation characters in the purported name,
15951 * that the edit distance to a valid one is exceeded
15952 * 4) we decided it was more characters than anyone could have
15953 * intended to be one. */
15955 found_problem = TRUE;
15957 /* In the final two cases, we know that looking up what we've
15958 * accumulated won't lead to a match, even a fuzzy one. */
15959 if ( name_len >= C_ARRAY_LENGTH(input_text)
15960 || punct_count > max_distance)
15962 /* If there was an intermediate key character that could have been
15963 * an intended end, redo the parse, but stop there */
15964 if (possible_end && possible_end != (char *) -1) {
15965 possible_end = (char *) -1; /* Special signal value to say
15966 we've done a first pass */
15971 /* Otherwise, it can't have meant to have been a class */
15972 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15975 /* If we ran off the end, and the final character was a punctuation
15976 * one, back up one, to look at that final one just below. Later, we
15977 * will restore the parse pointer if appropriate */
15978 if (name_len && p == e && isPUNCT(*(p-1))) {
15983 if (p < e && isPUNCT(*p)) {
15985 has_terminating_bracket = TRUE;
15987 /* If this is a 2nd ']', and the first one is just below this
15988 * one, consider that to be the real terminator. This gives a
15989 * uniform and better positioning for the warning message */
15991 && possible_end != (char *) -1
15992 && *possible_end == ']'
15993 && name_len && input_text[name_len - 1] == ']')
15998 /* And this is actually equivalent to having done the 2nd
15999 * pass now, so set it to not try again */
16000 possible_end = (char *) -1;
16005 has_terminating_colon = TRUE;
16007 else if (*p == ';') {
16008 has_semi_colon = TRUE;
16009 has_terminating_colon = TRUE;
16017 /* Here, we have a class name to look up. We can short circuit the
16018 * stuff below for short names that can't possibly be meant to be a
16019 * class name. (We can do this on the first pass, as any second pass
16020 * will yield an even shorter name) */
16021 if (name_len < 3) {
16022 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16025 /* Find which class it is. Initially switch on the length of the name.
16027 switch (name_len) {
16029 if (memEQs(name_start, 4, "word")) {
16030 /* this is not POSIX, this is the Perl \w */
16031 class_number = ANYOF_WORDCHAR;
16035 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16036 * graph lower print punct space upper
16037 * Offset 4 gives the best switch position. */
16038 switch (name_start[4]) {
16040 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16041 class_number = ANYOF_ALPHA;
16044 if (memBEGINs(name_start, 5, "spac")) /* space */
16045 class_number = ANYOF_SPACE;
16048 if (memBEGINs(name_start, 5, "grap")) /* graph */
16049 class_number = ANYOF_GRAPH;
16052 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16053 class_number = ANYOF_ASCII;
16056 if (memBEGINs(name_start, 5, "blan")) /* blank */
16057 class_number = ANYOF_BLANK;
16060 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16061 class_number = ANYOF_CNTRL;
16064 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16065 class_number = ANYOF_ALPHANUMERIC;
16068 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16069 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16070 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16071 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16074 if (memBEGINs(name_start, 5, "digi")) /* digit */
16075 class_number = ANYOF_DIGIT;
16076 else if (memBEGINs(name_start, 5, "prin")) /* print */
16077 class_number = ANYOF_PRINT;
16078 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16079 class_number = ANYOF_PUNCT;
16084 if (memEQs(name_start, 6, "xdigit"))
16085 class_number = ANYOF_XDIGIT;
16089 /* If the name exactly matches a posix class name the class number will
16090 * here be set to it, and the input almost certainly was meant to be a
16091 * posix class, so we can skip further checking. If instead the syntax
16092 * is exactly correct, but the name isn't one of the legal ones, we
16093 * will return that as an error below. But if neither of these apply,
16094 * it could be that no posix class was intended at all, or that one
16095 * was, but there was a typo. We tease these apart by doing fuzzy
16096 * matching on the name */
16097 if (class_number == OOB_NAMEDCLASS && found_problem) {
16098 const UV posix_names[][6] = {
16099 { 'a', 'l', 'n', 'u', 'm' },
16100 { 'a', 'l', 'p', 'h', 'a' },
16101 { 'a', 's', 'c', 'i', 'i' },
16102 { 'b', 'l', 'a', 'n', 'k' },
16103 { 'c', 'n', 't', 'r', 'l' },
16104 { 'd', 'i', 'g', 'i', 't' },
16105 { 'g', 'r', 'a', 'p', 'h' },
16106 { 'l', 'o', 'w', 'e', 'r' },
16107 { 'p', 'r', 'i', 'n', 't' },
16108 { 'p', 'u', 'n', 'c', 't' },
16109 { 's', 'p', 'a', 'c', 'e' },
16110 { 'u', 'p', 'p', 'e', 'r' },
16111 { 'w', 'o', 'r', 'd' },
16112 { 'x', 'd', 'i', 'g', 'i', 't' }
16114 /* The names of the above all have added NULs to make them the same
16115 * size, so we need to also have the real lengths */
16116 const UV posix_name_lengths[] = {
16117 sizeof("alnum") - 1,
16118 sizeof("alpha") - 1,
16119 sizeof("ascii") - 1,
16120 sizeof("blank") - 1,
16121 sizeof("cntrl") - 1,
16122 sizeof("digit") - 1,
16123 sizeof("graph") - 1,
16124 sizeof("lower") - 1,
16125 sizeof("print") - 1,
16126 sizeof("punct") - 1,
16127 sizeof("space") - 1,
16128 sizeof("upper") - 1,
16129 sizeof("word") - 1,
16130 sizeof("xdigit")- 1
16133 int temp_max = max_distance; /* Use a temporary, so if we
16134 reparse, we haven't changed the
16137 /* Use a smaller max edit distance if we are missing one of the
16139 if ( has_opening_bracket + has_opening_colon < 2
16140 || has_terminating_bracket + has_terminating_colon < 2)
16145 /* See if the input name is close to a legal one */
16146 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16148 /* Short circuit call if the lengths are too far apart to be
16150 if (abs( (int) (name_len - posix_name_lengths[i]))
16156 if (edit_distance(input_text,
16159 posix_name_lengths[i],
16163 { /* If it is close, it probably was intended to be a class */
16164 goto probably_meant_to_be;
16168 /* Here the input name is not close enough to a valid class name
16169 * for us to consider it to be intended to be a posix class. If
16170 * we haven't already done so, and the parse found a character that
16171 * could have been terminators for the name, but which we absorbed
16172 * as typos during the first pass, repeat the parse, signalling it
16173 * to stop at that character */
16174 if (possible_end && possible_end != (char *) -1) {
16175 possible_end = (char *) -1;
16180 /* Here neither pass found a close-enough class name */
16181 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16184 probably_meant_to_be:
16186 /* Here we think that a posix specification was intended. Update any
16188 if (updated_parse_ptr) {
16189 *updated_parse_ptr = (char *) p;
16192 /* If a posix class name was intended but incorrectly specified, we
16193 * output or return the warnings */
16194 if (found_problem) {
16196 /* We set flags for these issues in the parse loop above instead of
16197 * adding them to the list of warnings, because we can parse it
16198 * twice, and we only want one warning instance */
16200 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16203 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16205 if (has_semi_colon) {
16206 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16208 else if (! has_terminating_colon) {
16209 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16211 if (! has_terminating_bracket) {
16212 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16215 if ( posix_warnings
16217 && av_top_index(RExC_warn_text) > -1)
16219 *posix_warnings = RExC_warn_text;
16222 else if (class_number != OOB_NAMEDCLASS) {
16223 /* If it is a known class, return the class. The class number
16224 * #defines are structured so each complement is +1 to the normal
16226 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16228 else if (! check_only) {
16230 /* Here, it is an unrecognized class. This is an error (unless the
16231 * call is to check only, which we've already handled above) */
16232 const char * const complement_string = (complement)
16235 RExC_parse = (char *) p;
16236 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16238 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16242 return OOB_NAMEDCLASS;
16244 #undef ADD_POSIX_WARNING
16246 STATIC unsigned int
16247 S_regex_set_precedence(const U8 my_operator) {
16249 /* Returns the precedence in the (?[...]) construct of the input operator,
16250 * specified by its character representation. The precedence follows
16251 * general Perl rules, but it extends this so that ')' and ']' have (low)
16252 * precedence even though they aren't really operators */
16254 switch (my_operator) {
16270 NOT_REACHED; /* NOTREACHED */
16271 return 0; /* Silence compiler warning */
16274 STATIC regnode_offset
16275 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16276 I32 *flagp, U32 depth,
16277 char * const oregcomp_parse)
16279 /* Handle the (?[...]) construct to do set operations */
16281 U8 curchar; /* Current character being parsed */
16282 UV start, end; /* End points of code point ranges */
16283 SV* final = NULL; /* The end result inversion list */
16284 SV* result_string; /* 'final' stringified */
16285 AV* stack; /* stack of operators and operands not yet
16287 AV* fence_stack = NULL; /* A stack containing the positions in
16288 'stack' of where the undealt-with left
16289 parens would be if they were actually
16291 /* The 'volatile' is a workaround for an optimiser bug
16292 * in Solaris Studio 12.3. See RT #127455 */
16293 volatile IV fence = 0; /* Position of where most recent undealt-
16294 with left paren in stack is; -1 if none.
16296 STRLEN len; /* Temporary */
16297 regnode_offset node; /* Temporary, and final regnode returned by
16299 const bool save_fold = FOLD; /* Temporary */
16300 char *save_end, *save_parse; /* Temporaries */
16301 const bool in_locale = LOC; /* we turn off /l during processing */
16303 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16305 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16306 PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16308 DEBUG_PARSE("xcls");
16311 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16314 /* The use of this operator implies /u. This is required so that the
16315 * compile time values are valid in all runtime cases */
16316 REQUIRE_UNI_RULES(flagp, 0);
16318 ckWARNexperimental(RExC_parse,
16319 WARN_EXPERIMENTAL__REGEX_SETS,
16320 "The regex_sets feature is experimental");
16322 /* Everything in this construct is a metacharacter. Operands begin with
16323 * either a '\' (for an escape sequence), or a '[' for a bracketed
16324 * character class. Any other character should be an operator, or
16325 * parenthesis for grouping. Both types of operands are handled by calling
16326 * regclass() to parse them. It is called with a parameter to indicate to
16327 * return the computed inversion list. The parsing here is implemented via
16328 * a stack. Each entry on the stack is a single character representing one
16329 * of the operators; or else a pointer to an operand inversion list. */
16331 #define IS_OPERATOR(a) SvIOK(a)
16332 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16334 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16335 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16336 * with pronouncing it called it Reverse Polish instead, but now that YOU
16337 * know how to pronounce it you can use the correct term, thus giving due
16338 * credit to the person who invented it, and impressing your geek friends.
16339 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16340 * it is now more like an English initial W (as in wonk) than an L.)
16342 * This means that, for example, 'a | b & c' is stored on the stack as
16350 * where the numbers in brackets give the stack [array] element number.
16351 * In this implementation, parentheses are not stored on the stack.
16352 * Instead a '(' creates a "fence" so that the part of the stack below the
16353 * fence is invisible except to the corresponding ')' (this allows us to
16354 * replace testing for parens, by using instead subtraction of the fence
16355 * position). As new operands are processed they are pushed onto the stack
16356 * (except as noted in the next paragraph). New operators of higher
16357 * precedence than the current final one are inserted on the stack before
16358 * the lhs operand (so that when the rhs is pushed next, everything will be
16359 * in the correct positions shown above. When an operator of equal or
16360 * lower precedence is encountered in parsing, all the stacked operations
16361 * of equal or higher precedence are evaluated, leaving the result as the
16362 * top entry on the stack. This makes higher precedence operations
16363 * evaluate before lower precedence ones, and causes operations of equal
16364 * precedence to left associate.
16366 * The only unary operator '!' is immediately pushed onto the stack when
16367 * encountered. When an operand is encountered, if the top of the stack is
16368 * a '!", the complement is immediately performed, and the '!' popped. The
16369 * resulting value is treated as a new operand, and the logic in the
16370 * previous paragraph is executed. Thus in the expression
16372 * the stack looks like
16378 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16385 * A ')' is treated as an operator with lower precedence than all the
16386 * aforementioned ones, which causes all operations on the stack above the
16387 * corresponding '(' to be evaluated down to a single resultant operand.
16388 * Then the fence for the '(' is removed, and the operand goes through the
16389 * algorithm above, without the fence.
16391 * A separate stack is kept of the fence positions, so that the position of
16392 * the latest so-far unbalanced '(' is at the top of it.
16394 * The ']' ending the construct is treated as the lowest operator of all,
16395 * so that everything gets evaluated down to a single operand, which is the
16398 sv_2mortal((SV *)(stack = newAV()));
16399 sv_2mortal((SV *)(fence_stack = newAV()));
16401 while (RExC_parse < RExC_end) {
16402 I32 top_index; /* Index of top-most element in 'stack' */
16403 SV** top_ptr; /* Pointer to top 'stack' element */
16404 SV* current = NULL; /* To contain the current inversion list
16406 SV* only_to_avoid_leaks;
16408 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16409 TRUE /* Force /x */ );
16410 if (RExC_parse >= RExC_end) { /* Fail */
16414 curchar = UCHARAT(RExC_parse);
16418 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16419 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16420 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16421 stack, fence, fence_stack));
16424 top_index = av_tindex_skip_len_mg(stack);
16427 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16428 char stacked_operator; /* The topmost operator on the 'stack'. */
16429 SV* lhs; /* Operand to the left of the operator */
16430 SV* rhs; /* Operand to the right of the operator */
16431 SV* fence_ptr; /* Pointer to top element of the fence
16435 if ( RExC_parse < RExC_end - 2
16436 && UCHARAT(RExC_parse + 1) == '?'
16437 && UCHARAT(RExC_parse + 2) == '^')
16439 const regnode_offset orig_emit = RExC_emit;
16440 SV * resultant_invlist;
16442 /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16443 * This happens when we have some thing like
16445 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16447 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16449 * Here we would be handling the interpolated
16450 * '$thai_or_lao'. We handle this by a recursive call to
16451 * reg which returns the inversion list the
16452 * interpolated expression evaluates to. Actually, the
16453 * return is a special regnode containing a pointer to that
16454 * inversion list. If the return isn't that regnode alone,
16455 * we know that this wasn't such an interpolation, which is
16456 * an error: we need to get a single inversion list back
16457 * from the recursion */
16462 node = reg(pRExC_state, 2, flagp, depth+1);
16463 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16465 if ( OP(REGNODE_p(node)) != REGEX_SET
16466 /* If more than a single node returned, the nested
16467 * parens evaluated to more than just a (?[...]),
16468 * which isn't legal */
16470 vFAIL("Expecting interpolated extended charclass");
16472 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16473 current = invlist_clone(resultant_invlist, NULL);
16474 SvREFCNT_dec(resultant_invlist);
16477 RExC_emit = orig_emit;
16478 goto handle_operand;
16481 /* A regular '('. Look behind for illegal syntax */
16482 if (top_index - fence >= 0) {
16483 /* If the top entry on the stack is an operator, it had
16484 * better be a '!', otherwise the entry below the top
16485 * operand should be an operator */
16486 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16487 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16488 || ( IS_OPERAND(*top_ptr)
16489 && ( top_index - fence < 1
16490 || ! (stacked_ptr = av_fetch(stack,
16493 || ! IS_OPERATOR(*stacked_ptr))))
16496 vFAIL("Unexpected '(' with no preceding operator");
16500 /* Stack the position of this undealt-with left paren */
16501 av_push(fence_stack, newSViv(fence));
16502 fence = top_index + 1;
16506 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16507 * multi-char folds are allowed. */
16508 if (!regclass(pRExC_state, flagp, depth+1,
16509 TRUE, /* means parse just the next thing */
16510 FALSE, /* don't allow multi-char folds */
16511 FALSE, /* don't silence non-portable warnings. */
16513 FALSE, /* Require return to be an ANYOF */
16516 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16517 goto regclass_failed;
16522 /* regclass() will return with parsing just the \ sequence,
16523 * leaving the parse pointer at the next thing to parse */
16525 goto handle_operand;
16527 case '[': /* Is a bracketed character class */
16529 /* See if this is a [:posix:] class. */
16530 bool is_posix_class = (OOB_NAMEDCLASS
16531 < handle_possible_posix(pRExC_state,
16535 TRUE /* checking only */));
16536 /* If it is a posix class, leave the parse pointer at the '['
16537 * to fool regclass() into thinking it is part of a
16538 * '[[:posix:]]'. */
16539 if (! is_posix_class) {
16543 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16544 * multi-char folds are allowed. */
16545 if (!regclass(pRExC_state, flagp, depth+1,
16546 is_posix_class, /* parse the whole char
16547 class only if not a
16549 FALSE, /* don't allow multi-char folds */
16550 TRUE, /* silence non-portable warnings. */
16552 FALSE, /* Require return to be an ANYOF */
16555 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16556 goto regclass_failed;
16561 /* function call leaves parse pointing to the ']', except if we
16563 if (is_posix_class) {
16567 goto handle_operand;
16571 if (top_index >= 1) {
16572 goto join_operators;
16575 /* Only a single operand on the stack: are done */
16579 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16580 if (UCHARAT(RExC_parse - 1) == ']') {
16584 vFAIL("Unexpected ')'");
16587 /* If nothing after the fence, is missing an operand */
16588 if (top_index - fence < 0) {
16592 /* If at least two things on the stack, treat this as an
16594 if (top_index - fence >= 1) {
16595 goto join_operators;
16598 /* Here only a single thing on the fenced stack, and there is a
16599 * fence. Get rid of it */
16600 fence_ptr = av_pop(fence_stack);
16602 fence = SvIV(fence_ptr);
16603 SvREFCNT_dec_NN(fence_ptr);
16610 /* Having gotten rid of the fence, we pop the operand at the
16611 * stack top and process it as a newly encountered operand */
16612 current = av_pop(stack);
16613 if (IS_OPERAND(current)) {
16614 goto handle_operand;
16626 /* These binary operators should have a left operand already
16628 if ( top_index - fence < 0
16629 || top_index - fence == 1
16630 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16631 || ! IS_OPERAND(*top_ptr))
16633 goto unexpected_binary;
16636 /* If only the one operand is on the part of the stack visible
16637 * to us, we just place this operator in the proper position */
16638 if (top_index - fence < 2) {
16640 /* Place the operator before the operand */
16642 SV* lhs = av_pop(stack);
16643 av_push(stack, newSVuv(curchar));
16644 av_push(stack, lhs);
16648 /* But if there is something else on the stack, we need to
16649 * process it before this new operator if and only if the
16650 * stacked operation has equal or higher precedence than the
16655 /* The operator on the stack is supposed to be below both its
16657 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16658 || IS_OPERAND(*stacked_ptr))
16660 /* But if not, it's legal and indicates we are completely
16661 * done if and only if we're currently processing a ']',
16662 * which should be the final thing in the expression */
16663 if (curchar == ']') {
16669 vFAIL2("Unexpected binary operator '%c' with no "
16670 "preceding operand", curchar);
16672 stacked_operator = (char) SvUV(*stacked_ptr);
16674 if (regex_set_precedence(curchar)
16675 > regex_set_precedence(stacked_operator))
16677 /* Here, the new operator has higher precedence than the
16678 * stacked one. This means we need to add the new one to
16679 * the stack to await its rhs operand (and maybe more
16680 * stuff). We put it before the lhs operand, leaving
16681 * untouched the stacked operator and everything below it
16683 lhs = av_pop(stack);
16684 assert(IS_OPERAND(lhs));
16686 av_push(stack, newSVuv(curchar));
16687 av_push(stack, lhs);
16691 /* Here, the new operator has equal or lower precedence than
16692 * what's already there. This means the operation already
16693 * there should be performed now, before the new one. */
16695 rhs = av_pop(stack);
16696 if (! IS_OPERAND(rhs)) {
16698 /* This can happen when a ! is not followed by an operand,
16699 * like in /(?[\t &!])/ */
16703 lhs = av_pop(stack);
16705 if (! IS_OPERAND(lhs)) {
16707 /* This can happen when there is an empty (), like in
16708 * /(?[[0]+()+])/ */
16712 switch (stacked_operator) {
16714 _invlist_intersection(lhs, rhs, &rhs);
16719 _invlist_union(lhs, rhs, &rhs);
16723 _invlist_subtract(lhs, rhs, &rhs);
16726 case '^': /* The union minus the intersection */
16731 _invlist_union(lhs, rhs, &u);
16732 _invlist_intersection(lhs, rhs, &i);
16733 _invlist_subtract(u, i, &rhs);
16734 SvREFCNT_dec_NN(i);
16735 SvREFCNT_dec_NN(u);
16741 /* Here, the higher precedence operation has been done, and the
16742 * result is in 'rhs'. We overwrite the stacked operator with
16743 * the result. Then we redo this code to either push the new
16744 * operator onto the stack or perform any higher precedence
16745 * stacked operation */
16746 only_to_avoid_leaks = av_pop(stack);
16747 SvREFCNT_dec(only_to_avoid_leaks);
16748 av_push(stack, rhs);
16751 case '!': /* Highest priority, right associative */
16753 /* If what's already at the top of the stack is another '!",
16754 * they just cancel each other out */
16755 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16756 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16758 only_to_avoid_leaks = av_pop(stack);
16759 SvREFCNT_dec(only_to_avoid_leaks);
16761 else { /* Otherwise, since it's right associative, just push
16763 av_push(stack, newSVuv(curchar));
16768 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16769 if (RExC_parse >= RExC_end) {
16772 vFAIL("Unexpected character");
16776 /* Here 'current' is the operand. If something is already on the
16777 * stack, we have to check if it is a !. But first, the code above
16778 * may have altered the stack in the time since we earlier set
16781 top_index = av_tindex_skip_len_mg(stack);
16782 if (top_index - fence >= 0) {
16783 /* If the top entry on the stack is an operator, it had better
16784 * be a '!', otherwise the entry below the top operand should
16785 * be an operator */
16786 top_ptr = av_fetch(stack, top_index, FALSE);
16788 if (IS_OPERATOR(*top_ptr)) {
16790 /* The only permissible operator at the top of the stack is
16791 * '!', which is applied immediately to this operand. */
16792 curchar = (char) SvUV(*top_ptr);
16793 if (curchar != '!') {
16794 SvREFCNT_dec(current);
16795 vFAIL2("Unexpected binary operator '%c' with no "
16796 "preceding operand", curchar);
16799 _invlist_invert(current);
16801 only_to_avoid_leaks = av_pop(stack);
16802 SvREFCNT_dec(only_to_avoid_leaks);
16804 /* And we redo with the inverted operand. This allows
16805 * handling multiple ! in a row */
16806 goto handle_operand;
16808 /* Single operand is ok only for the non-binary ')'
16810 else if ((top_index - fence == 0 && curchar != ')')
16811 || (top_index - fence > 0
16812 && (! (stacked_ptr = av_fetch(stack,
16815 || IS_OPERAND(*stacked_ptr))))
16817 SvREFCNT_dec(current);
16818 vFAIL("Operand with no preceding operator");
16822 /* Here there was nothing on the stack or the top element was
16823 * another operand. Just add this new one */
16824 av_push(stack, current);
16826 } /* End of switch on next parse token */
16828 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16829 } /* End of loop parsing through the construct */
16831 vFAIL("Syntax error in (?[...])");
16835 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16836 if (RExC_parse < RExC_end) {
16840 vFAIL("Unexpected ']' with no following ')' in (?[...");
16843 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16844 vFAIL("Unmatched (");
16847 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16848 || ((final = av_pop(stack)) == NULL)
16849 || ! IS_OPERAND(final)
16850 || ! is_invlist(final)
16851 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16854 SvREFCNT_dec(final);
16855 vFAIL("Incomplete expression within '(?[ ])'");
16858 /* Here, 'final' is the resultant inversion list from evaluating the
16859 * expression. Return it if so requested */
16860 if (return_invlist) {
16861 *return_invlist = final;
16865 if (RExC_sets_depth) { /* If within a recursive call, return in a special
16868 node = regpnode(pRExC_state, REGEX_SET, final);
16872 /* Otherwise generate a resultant node, based on 'final'. regclass()
16873 * is expecting a string of ranges and individual code points */
16874 invlist_iterinit(final);
16875 result_string = newSVpvs("");
16876 while (invlist_iternext(final, &start, &end)) {
16877 if (start == end) {
16878 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16881 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16882 UVXf "}", start, end);
16886 /* About to generate an ANYOF (or similar) node from the inversion list
16887 * we have calculated */
16888 save_parse = RExC_parse;
16889 RExC_parse = SvPV(result_string, len);
16890 save_end = RExC_end;
16891 RExC_end = RExC_parse + len;
16892 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16894 /* We turn off folding around the call, as the class we have
16895 * constructed already has all folding taken into consideration, and we
16896 * don't want regclass() to add to that */
16897 RExC_flags &= ~RXf_PMf_FOLD;
16898 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16899 * folds are allowed. */
16900 node = regclass(pRExC_state, flagp, depth+1,
16901 FALSE, /* means parse the whole char class */
16902 FALSE, /* don't allow multi-char folds */
16903 TRUE, /* silence non-portable warnings. The above may
16904 very well have generated non-portable code
16905 points, but they're valid on this machine */
16906 FALSE, /* similarly, no need for strict */
16908 /* We can optimize into something besides an ANYOF,
16909 * except under /l, which needs to be ANYOF because of
16910 * runtime checks for locale sanity, etc */
16916 RExC_parse = save_parse + 1;
16917 RExC_end = save_end;
16918 SvREFCNT_dec_NN(final);
16919 SvREFCNT_dec_NN(result_string);
16922 RExC_flags |= RXf_PMf_FOLD;
16926 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16927 goto regclass_failed;
16930 /* Fix up the node type if we are in locale. (We have pretended we are
16931 * under /u for the purposes of regclass(), as this construct will only
16932 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
16933 * (so as to cause any warnings about bad locales to be output in
16934 * regexec.c), and add the flag that indicates to check if not in a
16935 * UTF-8 locale. The reason we above forbid optimization into
16936 * something other than an ANYOF node is simply to minimize the number
16937 * of code changes in regexec.c. Otherwise we would have to create new
16938 * EXACTish node types and deal with them. This decision could be
16939 * revisited should this construct become popular.
16941 * (One might think we could look at the resulting ANYOF node and
16942 * suppress the flag if everything is above 255, as those would be
16943 * UTF-8 only, but this isn't true, as the components that led to that
16944 * result could have been locale-affected, and just happen to cancel
16945 * each other out under UTF-8 locales.) */
16947 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16949 assert(OP(REGNODE_p(node)) == ANYOF);
16951 OP(REGNODE_p(node)) = ANYOFL;
16952 ANYOF_FLAGS(REGNODE_p(node))
16953 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16957 nextchar(pRExC_state);
16958 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16962 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16966 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16969 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16970 AV * stack, const IV fence, AV * fence_stack)
16971 { /* Dumps the stacks in handle_regex_sets() */
16973 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16974 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16977 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16979 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16981 if (stack_top < 0) {
16982 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16985 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16986 for (i = stack_top; i >= 0; i--) {
16987 SV ** element_ptr = av_fetch(stack, i, FALSE);
16988 if (! element_ptr) {
16991 if (IS_OPERATOR(*element_ptr)) {
16992 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16993 (int) i, (int) SvIV(*element_ptr));
16996 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16997 sv_dump(*element_ptr);
17002 if (fence_stack_top < 0) {
17003 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17006 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17007 for (i = fence_stack_top; i >= 0; i--) {
17008 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17009 if (! element_ptr) {
17012 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17013 (int) i, (int) SvIV(*element_ptr));
17024 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17026 /* This adds the Latin1/above-Latin1 folding rules.
17028 * This should be called only for a Latin1-range code points, cp, which is
17029 * known to be involved in a simple fold with other code points above
17030 * Latin1. It would give false results if /aa has been specified.
17031 * Multi-char folds are outside the scope of this, and must be handled
17034 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17036 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17038 /* The rules that are valid for all Unicode versions are hard-coded in */
17043 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17047 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17050 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17051 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17053 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17054 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17055 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17057 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17058 *invlist = add_cp_to_invlist(*invlist,
17059 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17062 default: /* Other code points are checked against the data for the
17063 current Unicode version */
17065 Size_t folds_count;
17067 const U32 * remaining_folds;
17071 folded_cp = toFOLD(cp);
17074 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17076 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17079 if (folded_cp > 255) {
17080 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17083 folds_count = _inverse_folds(folded_cp, &first_fold,
17085 if (folds_count == 0) {
17087 /* Use deprecated warning to increase the chances of this being
17089 ckWARN2reg_d(RExC_parse,
17090 "Perl folding rules are not up-to-date for 0x%02X;"
17091 " please use the perlbug utility to report;", cp);
17096 if (first_fold > 255) {
17097 *invlist = add_cp_to_invlist(*invlist, first_fold);
17099 for (i = 0; i < folds_count - 1; i++) {
17100 if (remaining_folds[i] > 255) {
17101 *invlist = add_cp_to_invlist(*invlist,
17102 remaining_folds[i]);
17112 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17114 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17118 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17120 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17122 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17123 CLEAR_POSIX_WARNINGS();
17127 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17128 if (first_is_fatal) { /* Avoid leaking this */
17129 av_undef(posix_warnings); /* This isn't necessary if the
17130 array is mortal, but is a
17132 (void) sv_2mortal(msg);
17135 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17136 SvREFCNT_dec_NN(msg);
17139 UPDATE_WARNINGS_LOC(RExC_parse);
17142 PERL_STATIC_INLINE Size_t
17143 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17145 const U8 * const start = s1;
17146 const U8 * const send = start + max;
17148 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17150 while (s1 < send && *s1 == *s2) {
17159 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17161 /* This adds the string scalar <multi_string> to the array
17162 * <multi_char_matches>. <multi_string> is known to have exactly
17163 * <cp_count> code points in it. This is used when constructing a
17164 * bracketed character class and we find something that needs to match more
17165 * than a single character.
17167 * <multi_char_matches> is actually an array of arrays. Each top-level
17168 * element is an array that contains all the strings known so far that are
17169 * the same length. And that length (in number of code points) is the same
17170 * as the index of the top-level array. Hence, the [2] element is an
17171 * array, each element thereof is a string containing TWO code points;
17172 * while element [3] is for strings of THREE characters, and so on. Since
17173 * this is for multi-char strings there can never be a [0] nor [1] element.
17175 * When we rewrite the character class below, we will do so such that the
17176 * longest strings are written first, so that it prefers the longest
17177 * matching strings first. This is done even if it turns out that any
17178 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17179 * Christiansen has agreed that this is ok. This makes the test for the
17180 * ligature 'ffi' come before the test for 'ff', for example */
17183 AV** this_array_ptr;
17185 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17187 if (! multi_char_matches) {
17188 multi_char_matches = newAV();
17191 if (av_exists(multi_char_matches, cp_count)) {
17192 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17193 this_array = *this_array_ptr;
17196 this_array = newAV();
17197 av_store(multi_char_matches, cp_count,
17200 av_push(this_array, multi_string);
17202 return multi_char_matches;
17205 /* The names of properties whose definitions are not known at compile time are
17206 * stored in this SV, after a constant heading. So if the length has been
17207 * changed since initialization, then there is a run-time definition. */
17208 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17209 (SvCUR(listsv) != initial_listsv_len)
17211 /* There is a restricted set of white space characters that are legal when
17212 * ignoring white space in a bracketed character class. This generates the
17213 * code to skip them.
17215 * There is a line below that uses the same white space criteria but is outside
17216 * this macro. Both here and there must use the same definition */
17217 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
17220 while (isBLANK_A(UCHARAT(p))) \
17227 STATIC regnode_offset
17228 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17229 const bool stop_at_1, /* Just parse the next thing, don't
17230 look for a full character class */
17231 bool allow_mutiple_chars,
17232 const bool silence_non_portable, /* Don't output warnings
17236 bool optimizable, /* ? Allow a non-ANYOF return
17238 SV** ret_invlist /* Return an inversion list, not a node */
17241 /* parse a bracketed class specification. Most of these will produce an
17242 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17243 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17244 * under /i with multi-character folds: it will be rewritten following the
17245 * paradigm of this example, where the <multi-fold>s are characters which
17246 * fold to multiple character sequences:
17247 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17248 * gets effectively rewritten as:
17249 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17250 * reg() gets called (recursively) on the rewritten version, and this
17251 * function will return what it constructs. (Actually the <multi-fold>s
17252 * aren't physically removed from the [abcdefghi], it's just that they are
17253 * ignored in the recursion by means of a flag:
17254 * <RExC_in_multi_char_class>.)
17256 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17257 * characters, with the corresponding bit set if that character is in the
17258 * list. For characters above this, an inversion list is used. There
17259 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17260 * determinable at compile time
17262 * On success, returns the offset at which any next node should be placed
17263 * into the regex engine program being compiled.
17265 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17266 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17271 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17273 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17274 regnode_offset ret = -1; /* Initialized to an illegal value */
17276 int namedclass = OOB_NAMEDCLASS;
17277 char *rangebegin = NULL;
17278 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17279 aren't available at the time this was called */
17280 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17281 than just initialized. */
17282 SV* properties = NULL; /* Code points that match \p{} \P{} */
17283 SV* posixes = NULL; /* Code points that match classes like [:word:],
17284 extended beyond the Latin1 range. These have to
17285 be kept separate from other code points for much
17286 of this function because their handling is
17287 different under /i, and for most classes under
17289 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17290 separate for a while from the non-complemented
17291 versions because of complications with /d
17293 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17294 treated more simply than the general case,
17295 leading to less compilation and execution
17297 UV element_count = 0; /* Number of distinct elements in the class.
17298 Optimizations may be possible if this is tiny */
17299 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17300 character; used under /i */
17302 char * stop_ptr = RExC_end; /* where to stop parsing */
17304 /* ignore unescaped whitespace? */
17305 const bool skip_white = cBOOL( ret_invlist
17306 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17308 /* inversion list of code points this node matches only when the target
17309 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17311 SV* upper_latin1_only_utf8_matches = NULL;
17313 /* Inversion list of code points this node matches regardless of things
17314 * like locale, folding, utf8ness of the target string */
17315 SV* cp_list = NULL;
17317 /* Like cp_list, but code points on this list need to be checked for things
17318 * that fold to/from them under /i */
17319 SV* cp_foldable_list = NULL;
17321 /* Like cp_list, but code points on this list are valid only when the
17322 * runtime locale is UTF-8 */
17323 SV* only_utf8_locale_list = NULL;
17325 /* In a range, if one of the endpoints is non-character-set portable,
17326 * meaning that it hard-codes a code point that may mean a different
17327 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17328 * mnemonic '\t' which each mean the same character no matter which
17329 * character set the platform is on. */
17330 unsigned int non_portable_endpoint = 0;
17332 /* Is the range unicode? which means on a platform that isn't 1-1 native
17333 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17334 * to be a Unicode value. */
17335 bool unicode_range = FALSE;
17336 bool invert = FALSE; /* Is this class to be complemented */
17338 bool warn_super = ALWAYS_WARN_SUPER;
17340 const char * orig_parse = RExC_parse;
17342 /* This variable is used to mark where the end in the input is of something
17343 * that looks like a POSIX construct but isn't. During the parse, when
17344 * something looks like it could be such a construct is encountered, it is
17345 * checked for being one, but not if we've already checked this area of the
17346 * input. Only after this position is reached do we check again */
17347 char *not_posix_region_end = RExC_parse - 1;
17349 AV* posix_warnings = NULL;
17350 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17351 U8 op = END; /* The returned node-type, initialized to an impossible
17353 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17354 U32 posixl = 0; /* bit field of posix classes matched under /l */
17357 /* Flags as to what things aren't knowable until runtime. (Note that these are
17358 * mutually exclusive.) */
17359 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17360 haven't been defined as of yet */
17361 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17363 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17364 what gets folded */
17365 U32 has_runtime_dependency = 0; /* OR of the above flags */
17367 DECLARE_AND_GET_RE_DEBUG_FLAGS;
17369 PERL_ARGS_ASSERT_REGCLASS;
17371 PERL_UNUSED_ARG(depth);
17374 assert(! (ret_invlist && allow_mutiple_chars));
17376 /* If wants an inversion list returned, we can't optimize to something
17379 optimizable = FALSE;
17382 DEBUG_PARSE("clas");
17384 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17385 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17386 && UNICODE_DOT_DOT_VERSION == 0)
17387 allow_mutiple_chars = FALSE;
17390 /* We include the /i status at the beginning of this so that we can
17391 * know it at runtime */
17392 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17393 initial_listsv_len = SvCUR(listsv);
17394 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17396 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17398 assert(RExC_parse <= RExC_end);
17400 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17403 allow_mutiple_chars = FALSE;
17405 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17408 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17409 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17410 int maybe_class = handle_possible_posix(pRExC_state,
17412 ¬_posix_region_end,
17414 TRUE /* checking only */);
17415 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17416 ckWARN4reg(not_posix_region_end,
17417 "POSIX syntax [%c %c] belongs inside character classes%s",
17418 *RExC_parse, *RExC_parse,
17419 (maybe_class == OOB_NAMEDCLASS)
17420 ? ((POSIXCC_NOTYET(*RExC_parse))
17421 ? " (but this one isn't implemented)"
17422 : " (but this one isn't fully valid)")
17428 /* If the caller wants us to just parse a single element, accomplish this
17429 * by faking the loop ending condition */
17430 if (stop_at_1 && RExC_end > RExC_parse) {
17431 stop_ptr = RExC_parse + 1;
17434 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17435 if (UCHARAT(RExC_parse) == ']')
17436 goto charclassloop;
17440 if ( posix_warnings
17441 && av_tindex_skip_len_mg(posix_warnings) >= 0
17442 && RExC_parse > not_posix_region_end)
17444 /* Warnings about posix class issues are considered tentative until
17445 * we are far enough along in the parse that we can no longer
17446 * change our mind, at which point we output them. This is done
17447 * each time through the loop so that a later class won't zap them
17448 * before they have been dealt with. */
17449 output_posix_warnings(pRExC_state, posix_warnings);
17452 if (RExC_parse >= stop_ptr) {
17456 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17458 if (UCHARAT(RExC_parse) == ']') {
17464 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17465 save_value = value;
17466 save_prevvalue = prevvalue;
17469 rangebegin = RExC_parse;
17471 non_portable_endpoint = 0;
17473 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17474 value = utf8n_to_uvchr((U8*)RExC_parse,
17475 RExC_end - RExC_parse,
17476 &numlen, UTF8_ALLOW_DEFAULT);
17477 RExC_parse += numlen;
17480 value = UCHARAT(RExC_parse++);
17482 if (value == '[') {
17483 char * posix_class_end;
17484 namedclass = handle_possible_posix(pRExC_state,
17487 do_posix_warnings ? &posix_warnings : NULL,
17488 FALSE /* die if error */);
17489 if (namedclass > OOB_NAMEDCLASS) {
17491 /* If there was an earlier attempt to parse this particular
17492 * posix class, and it failed, it was a false alarm, as this
17493 * successful one proves */
17494 if ( posix_warnings
17495 && av_tindex_skip_len_mg(posix_warnings) >= 0
17496 && not_posix_region_end >= RExC_parse
17497 && not_posix_region_end <= posix_class_end)
17499 av_undef(posix_warnings);
17502 RExC_parse = posix_class_end;
17504 else if (namedclass == OOB_NAMEDCLASS) {
17505 not_posix_region_end = posix_class_end;
17508 namedclass = OOB_NAMEDCLASS;
17511 else if ( RExC_parse - 1 > not_posix_region_end
17512 && MAYBE_POSIXCC(value))
17514 (void) handle_possible_posix(
17516 RExC_parse - 1, /* -1 because parse has already been
17518 ¬_posix_region_end,
17519 do_posix_warnings ? &posix_warnings : NULL,
17520 TRUE /* checking only */);
17522 else if ( strict && ! skip_white
17523 && ( _generic_isCC(value, _CC_VERTSPACE)
17524 || is_VERTWS_cp_high(value)))
17526 vFAIL("Literal vertical space in [] is illegal except under /x");
17528 else if (value == '\\') {
17529 /* Is a backslash; get the code point of the char after it */
17531 if (RExC_parse >= RExC_end) {
17532 vFAIL("Unmatched [");
17535 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17536 value = utf8n_to_uvchr((U8*)RExC_parse,
17537 RExC_end - RExC_parse,
17538 &numlen, UTF8_ALLOW_DEFAULT);
17539 RExC_parse += numlen;
17542 value = UCHARAT(RExC_parse++);
17544 /* Some compilers cannot handle switching on 64-bit integer
17545 * values, therefore value cannot be an UV. Yes, this will
17546 * be a problem later if we want switch on Unicode.
17547 * A similar issue a little bit later when switching on
17548 * namedclass. --jhi */
17550 /* If the \ is escaping white space when white space is being
17551 * skipped, it means that that white space is wanted literally, and
17552 * is already in 'value'. Otherwise, need to translate the escape
17553 * into what it signifies. */
17554 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17555 const char * message;
17559 case 'w': namedclass = ANYOF_WORDCHAR; break;
17560 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17561 case 's': namedclass = ANYOF_SPACE; break;
17562 case 'S': namedclass = ANYOF_NSPACE; break;
17563 case 'd': namedclass = ANYOF_DIGIT; break;
17564 case 'D': namedclass = ANYOF_NDIGIT; break;
17565 case 'v': namedclass = ANYOF_VERTWS; break;
17566 case 'V': namedclass = ANYOF_NVERTWS; break;
17567 case 'h': namedclass = ANYOF_HORIZWS; break;
17568 case 'H': namedclass = ANYOF_NHORIZWS; break;
17569 case 'N': /* Handle \N{NAME} in class */
17571 const char * const backslash_N_beg = RExC_parse - 2;
17574 if (! grok_bslash_N(pRExC_state,
17575 NULL, /* No regnode */
17576 &value, /* Yes single value */
17577 &cp_count, /* Multiple code pt count */
17583 if (*flagp & NEED_UTF8)
17584 FAIL("panic: grok_bslash_N set NEED_UTF8");
17586 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17588 if (cp_count < 0) {
17589 vFAIL("\\N in a character class must be a named character: \\N{...}");
17591 else if (cp_count == 0) {
17592 ckWARNreg(RExC_parse,
17593 "Ignoring zero length \\N{} in character class");
17595 else { /* cp_count > 1 */
17596 assert(cp_count > 1);
17597 if (! RExC_in_multi_char_class) {
17598 if ( ! allow_mutiple_chars
17601 || *RExC_parse == '-')
17605 vFAIL("\\N{} here is restricted to one character");
17607 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17608 break; /* <value> contains the first code
17609 point. Drop out of the switch to
17613 SV * multi_char_N = newSVpvn(backslash_N_beg,
17614 RExC_parse - backslash_N_beg);
17616 = add_multi_match(multi_char_matches,
17621 } /* End of cp_count != 1 */
17623 /* This element should not be processed further in this
17626 value = save_value;
17627 prevvalue = save_prevvalue;
17628 continue; /* Back to top of loop to get next char */
17631 /* Here, is a single code point, and <value> contains it */
17632 unicode_range = TRUE; /* \N{} are Unicode */
17640 if (RExC_pm_flags & PMf_WILDCARD) {
17642 /* diag_listed_as: Use of %s is not allowed in Unicode
17643 property wildcard subpatterns in regex; marked by <--
17645 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17646 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17649 /* \p means they want Unicode semantics */
17650 REQUIRE_UNI_RULES(flagp, 0);
17652 if (RExC_parse >= RExC_end)
17653 vFAIL2("Empty \\%c", (U8)value);
17654 if (*RExC_parse == '{') {
17655 const U8 c = (U8)value;
17656 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17659 vFAIL2("Missing right brace on \\%c{}", c);
17664 /* White space is allowed adjacent to the braces and after
17665 * any '^', even when not under /x */
17666 while (isSPACE(*RExC_parse)) {
17670 if (UCHARAT(RExC_parse) == '^') {
17672 /* toggle. (The rhs xor gets the single bit that
17673 * differs between P and p; the other xor inverts just
17675 value ^= 'P' ^ 'p';
17678 while (isSPACE(*RExC_parse)) {
17683 if (e == RExC_parse)
17684 vFAIL2("Empty \\%c{}", c);
17686 n = e - RExC_parse;
17687 while (isSPACE(*(RExC_parse + n - 1)))
17690 } /* The \p isn't immediately followed by a '{' */
17691 else if (! isALPHA(*RExC_parse)) {
17692 RExC_parse += (UTF)
17693 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17695 vFAIL2("Character following \\%c must be '{' or a "
17696 "single-character Unicode property name",
17704 char* name = RExC_parse;
17706 /* Any message returned about expanding the definition */
17707 SV* msg = newSVpvs_flags("", SVs_TEMP);
17709 /* If set TRUE, the property is user-defined as opposed to
17710 * official Unicode */
17711 bool user_defined = FALSE;
17712 AV * strings = NULL;
17714 SV * prop_definition = parse_uniprop_string(
17715 name, n, UTF, FOLD,
17716 FALSE, /* This is compile-time */
17718 /* We can't defer this defn when
17719 * the full result is required in
17721 ! cBOOL(ret_invlist),
17728 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17729 assert(prop_definition == NULL);
17730 RExC_parse = e + 1;
17731 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17732 thing so, or else the display is
17736 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17737 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17738 SvCUR(msg), SvPVX(msg)));
17741 assert(prop_definition || strings);
17745 if (! prop_definition) {
17746 RExC_parse = e + 1;
17747 vFAIL("Unicode string properties are not implemented in (?[...])");
17751 "Using just the single character results"
17752 " returned by \\p{} in (?[...])");
17755 else if (! RExC_in_multi_char_class) {
17756 if (invert ^ (value == 'P')) {
17757 RExC_parse = e + 1;
17758 vFAIL("Inverting a character class which contains"
17759 " a multi-character sequence is illegal");
17762 /* For each multi-character string ... */
17763 while (av_tindex(strings) >= 0) {
17764 /* ... Each entry is itself an array of code
17766 AV * this_string = (AV *) av_shift( strings);
17767 STRLEN cp_count = av_tindex(this_string) + 1;
17768 SV * final = newSV(cp_count * 4);
17771 /* Create another string of sequences of \x{...} */
17772 while (av_tindex(this_string) >= 0) {
17773 SV * character = av_shift(this_string);
17774 UV cp = SvUV(character);
17777 REQUIRE_UTF8(flagp);
17779 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17781 SvREFCNT_dec_NN(character);
17783 SvREFCNT_dec_NN(this_string);
17785 /* And add that to the list of such things */
17787 = add_multi_match(multi_char_matches,
17792 SvREFCNT_dec_NN(strings);
17795 if (! prop_definition) { /* If we got only a string,
17796 this iteration didn't really
17797 find a character */
17800 else if (! is_invlist(prop_definition)) {
17802 /* Here, the definition isn't known, so we have gotten
17803 * returned a string that will be evaluated if and when
17804 * encountered at runtime. We add it to the list of
17805 * such properties, along with whether it should be
17806 * complemented or not */
17807 if (value == 'P') {
17808 sv_catpvs(listsv, "!");
17811 sv_catpvs(listsv, "+");
17813 sv_catsv(listsv, prop_definition);
17815 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17817 /* We don't know yet what this matches, so have to flag
17819 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17822 assert (prop_definition && is_invlist(prop_definition));
17824 /* Here we do have the complete property definition
17826 * Temporary workaround for [perl #133136]. For this
17827 * precise input that is in the .t that is failing,
17828 * load utf8.pm, which is what the test wants, so that
17829 * that .t passes */
17830 if ( memEQs(RExC_start, e + 1 - RExC_start,
17832 && ! hv_common(GvHVn(PL_incgv),
17834 "utf8.pm", sizeof("utf8.pm") - 1,
17835 0, HV_FETCH_ISEXISTS, NULL, 0))
17837 require_pv("utf8.pm");
17840 if (! user_defined &&
17841 /* We warn on matching an above-Unicode code point
17842 * if the match would return true, except don't
17843 * warn for \p{All}, which has exactly one element
17845 (_invlist_contains_cp(prop_definition, 0x110000)
17846 && (! (_invlist_len(prop_definition) == 1
17847 && *invlist_array(prop_definition) == 0))))
17852 /* Invert if asking for the complement */
17853 if (value == 'P') {
17854 _invlist_union_complement_2nd(properties,
17859 _invlist_union(properties, prop_definition, &properties);
17864 RExC_parse = e + 1;
17865 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17869 case 'n': value = '\n'; break;
17870 case 'r': value = '\r'; break;
17871 case 't': value = '\t'; break;
17872 case 'f': value = '\f'; break;
17873 case 'b': value = '\b'; break;
17874 case 'e': value = ESC_NATIVE; break;
17875 case 'a': value = '\a'; break;
17877 RExC_parse--; /* function expects to be pointed at the 'o' */
17878 if (! grok_bslash_o(&RExC_parse,
17884 cBOOL(range), /* MAX_UV allowed for range
17890 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17891 warn_non_literal_string(RExC_parse, packed_warn, message);
17895 non_portable_endpoint++;
17899 RExC_parse--; /* function expects to be pointed at the 'x' */
17900 if (! grok_bslash_x(&RExC_parse,
17906 cBOOL(range), /* MAX_UV allowed for range
17912 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17913 warn_non_literal_string(RExC_parse, packed_warn, message);
17917 non_portable_endpoint++;
17921 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17924 /* going to die anyway; point to exact spot of
17926 RExC_parse += (UTF)
17927 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17932 value = grok_c_char;
17934 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17935 warn_non_literal_string(RExC_parse, packed_warn, message);
17938 non_portable_endpoint++;
17940 case '0': case '1': case '2': case '3': case '4':
17941 case '5': case '6': case '7':
17943 /* Take 1-3 octal digits */
17944 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17945 | PERL_SCAN_NOTIFY_ILLDIGIT;
17946 numlen = (strict) ? 4 : 3;
17947 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17948 RExC_parse += numlen;
17951 RExC_parse += (UTF)
17952 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17954 vFAIL("Need exactly 3 octal digits");
17956 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17957 && RExC_parse < RExC_end
17958 && isDIGIT(*RExC_parse)
17959 && ckWARN(WARN_REGEXP))
17961 reg_warn_non_literal_string(
17963 form_alien_digit_msg(8, numlen, RExC_parse,
17964 RExC_end, UTF, FALSE));
17968 non_portable_endpoint++;
17973 /* Allow \_ to not give an error */
17974 if (isWORDCHAR(value) && value != '_') {
17976 vFAIL2("Unrecognized escape \\%c in character class",
17980 ckWARN2reg(RExC_parse,
17981 "Unrecognized escape \\%c in character class passed through",
17986 } /* End of switch on char following backslash */
17987 } /* end of handling backslash escape sequences */
17989 /* Here, we have the current token in 'value' */
17991 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17994 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17995 * literal, as is the character that began the false range, i.e.
17996 * the 'a' in the examples */
17998 const int w = (RExC_parse >= rangebegin)
17999 ? RExC_parse - rangebegin
18003 "False [] range \"%" UTF8f "\"",
18004 UTF8fARG(UTF, w, rangebegin));
18007 ckWARN2reg(RExC_parse,
18008 "False [] range \"%" UTF8f "\"",
18009 UTF8fARG(UTF, w, rangebegin));
18010 cp_list = add_cp_to_invlist(cp_list, '-');
18011 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18015 range = 0; /* this was not a true range */
18016 element_count += 2; /* So counts for three values */
18019 classnum = namedclass_to_classnum(namedclass);
18021 if (LOC && namedclass < ANYOF_POSIXL_MAX
18022 #ifndef HAS_ISASCII
18023 && classnum != _CC_ASCII
18026 SV* scratch_list = NULL;
18028 /* What the Posix classes (like \w, [:space:]) match isn't
18029 * generally knowable under locale until actual match time. A
18030 * special node is used for these which has extra space for a
18031 * bitmap, with a bit reserved for each named class that is to
18032 * be matched against. (This isn't needed for \p{} and
18033 * pseudo-classes, as they are not affected by locale, and
18034 * hence are dealt with separately.) However, if a named class
18035 * and its complement are both present, then it matches
18036 * everything, and there is no runtime dependency. Odd numbers
18037 * are the complements of the next lower number, so xor works.
18038 * (Note that something like [\w\D] should match everything,
18039 * because \d should be a proper subset of \w. But rather than
18040 * trust that the locale is well behaved, we leave this to
18041 * runtime to sort out) */
18042 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18043 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18044 POSIXL_ZERO(posixl);
18045 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18046 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18047 continue; /* We could ignore the rest of the class, but
18048 best to parse it for any errors */
18050 else { /* Here, isn't the complement of any already parsed
18052 POSIXL_SET(posixl, namedclass);
18053 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18054 anyof_flags |= ANYOF_MATCHES_POSIXL;
18056 /* The above-Latin1 characters are not subject to locale
18057 * rules. Just add them to the unconditionally-matched
18060 /* Get the list of the above-Latin1 code points this
18062 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18063 PL_XPosix_ptrs[classnum],
18065 /* Odd numbers are complements,
18066 * like NDIGIT, NASCII, ... */
18067 namedclass % 2 != 0,
18069 /* Checking if 'cp_list' is NULL first saves an extra
18070 * clone. Its reference count will be decremented at the
18071 * next union, etc, or if this is the only instance, at the
18072 * end of the routine */
18074 cp_list = scratch_list;
18077 _invlist_union(cp_list, scratch_list, &cp_list);
18078 SvREFCNT_dec_NN(scratch_list);
18080 continue; /* Go get next character */
18085 /* Here, is not /l, or is a POSIX class for which /l doesn't
18086 * matter (or is a Unicode property, which is skipped here). */
18087 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18088 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18090 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18091 * nor /l make a difference in what these match,
18092 * therefore we just add what they match to cp_list. */
18093 if (classnum != _CC_VERTSPACE) {
18094 assert( namedclass == ANYOF_HORIZWS
18095 || namedclass == ANYOF_NHORIZWS);
18097 /* It turns out that \h is just a synonym for
18099 classnum = _CC_BLANK;
18102 _invlist_union_maybe_complement_2nd(
18104 PL_XPosix_ptrs[classnum],
18105 namedclass % 2 != 0, /* Complement if odd
18106 (NHORIZWS, NVERTWS)
18111 else if ( AT_LEAST_UNI_SEMANTICS
18112 || classnum == _CC_ASCII
18113 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
18114 || classnum == _CC_XDIGIT)))
18116 /* We usually have to worry about /d affecting what POSIX
18117 * classes match, with special code needed because we won't
18118 * know until runtime what all matches. But there is no
18119 * extra work needed under /u and /a; and [:ascii:] is
18120 * unaffected by /d; and :digit: and :xdigit: don't have
18121 * runtime differences under /d. So we can special case
18122 * these, and avoid some extra work below, and at runtime.
18124 _invlist_union_maybe_complement_2nd(
18126 ((AT_LEAST_ASCII_RESTRICTED)
18127 ? PL_Posix_ptrs[classnum]
18128 : PL_XPosix_ptrs[classnum]),
18129 namedclass % 2 != 0,
18132 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18133 complement and use nposixes */
18134 SV** posixes_ptr = namedclass % 2 == 0
18137 _invlist_union_maybe_complement_2nd(
18139 PL_XPosix_ptrs[classnum],
18140 namedclass % 2 != 0,
18144 } /* end of namedclass \blah */
18146 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
18148 /* If 'range' is set, 'value' is the ending of a range--check its
18149 * validity. (If value isn't a single code point in the case of a
18150 * range, we should have figured that out above in the code that
18151 * catches false ranges). Later, we will handle each individual code
18152 * point in the range. If 'range' isn't set, this could be the
18153 * beginning of a range, so check for that by looking ahead to see if
18154 * the next real character to be processed is the range indicator--the
18159 /* For unicode ranges, we have to test that the Unicode as opposed
18160 * to the native values are not decreasing. (Above 255, there is
18161 * no difference between native and Unicode) */
18162 if (unicode_range && prevvalue < 255 && value < 255) {
18163 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18164 goto backwards_range;
18169 if (prevvalue > value) /* b-a */ {
18174 w = RExC_parse - rangebegin;
18176 "Invalid [] range \"%" UTF8f "\"",
18177 UTF8fARG(UTF, w, rangebegin));
18178 NOT_REACHED; /* NOTREACHED */
18182 prevvalue = value; /* save the beginning of the potential range */
18183 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18184 && *RExC_parse == '-')
18186 char* next_char_ptr = RExC_parse + 1;
18188 /* Get the next real char after the '-' */
18189 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
18191 /* If the '-' is at the end of the class (just before the ']',
18192 * it is a literal minus; otherwise it is a range */
18193 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18194 RExC_parse = next_char_ptr;
18196 /* a bad range like \w-, [:word:]- ? */
18197 if (namedclass > OOB_NAMEDCLASS) {
18198 if (strict || ckWARN(WARN_REGEXP)) {
18199 const int w = RExC_parse >= rangebegin
18200 ? RExC_parse - rangebegin
18203 vFAIL4("False [] range \"%*.*s\"",
18208 "False [] range \"%*.*s\"",
18212 cp_list = add_cp_to_invlist(cp_list, '-');
18215 range = 1; /* yeah, it's a range! */
18216 continue; /* but do it the next time */
18221 if (namedclass > OOB_NAMEDCLASS) {
18225 /* Here, we have a single value this time through the loop, and
18226 * <prevvalue> is the beginning of the range, if any; or <value> if
18229 /* non-Latin1 code point implies unicode semantics. */
18231 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18232 || prevvalue > MAX_LEGAL_CP))
18234 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18236 REQUIRE_UNI_RULES(flagp, 0);
18237 if ( ! silence_non_portable
18238 && UNICODE_IS_PERL_EXTENDED(value)
18239 && TO_OUTPUT_WARNINGS(RExC_parse))
18241 ckWARN2_non_literal_string(RExC_parse,
18242 packWARN(WARN_PORTABLE),
18243 PL_extended_cp_format,
18248 /* Ready to process either the single value, or the completed range.
18249 * For single-valued non-inverted ranges, we consider the possibility
18250 * of multi-char folds. (We made a conscious decision to not do this
18251 * for the other cases because it can often lead to non-intuitive
18252 * results. For example, you have the peculiar case that:
18253 * "s s" =~ /^[^\xDF]+$/i => Y
18254 * "ss" =~ /^[^\xDF]+$/i => N
18256 * See [perl #89750] */
18257 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18258 if ( value == LATIN_SMALL_LETTER_SHARP_S
18259 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18262 /* Here <value> is indeed a multi-char fold. Get what it is */
18264 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18267 UV folded = _to_uni_fold_flags(
18271 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18272 ? FOLD_FLAGS_NOMIX_ASCII
18276 /* Here, <folded> should be the first character of the
18277 * multi-char fold of <value>, with <foldbuf> containing the
18278 * whole thing. But, if this fold is not allowed (because of
18279 * the flags), <fold> will be the same as <value>, and should
18280 * be processed like any other character, so skip the special
18282 if (folded != value) {
18284 /* Skip if we are recursed, currently parsing the class
18285 * again. Otherwise add this character to the list of
18286 * multi-char folds. */
18287 if (! RExC_in_multi_char_class) {
18288 STRLEN cp_count = utf8_length(foldbuf,
18289 foldbuf + foldlen);
18290 SV* multi_fold = sv_2mortal(newSVpvs(""));
18292 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18295 = add_multi_match(multi_char_matches,
18301 /* This element should not be processed further in this
18304 value = save_value;
18305 prevvalue = save_prevvalue;
18311 if (strict && ckWARN(WARN_REGEXP)) {
18314 /* If the range starts above 255, everything is portable and
18315 * likely to be so for any forseeable character set, so don't
18317 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18318 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18320 else if (prevvalue != value) {
18322 /* Under strict, ranges that stop and/or end in an ASCII
18323 * printable should have each end point be a portable value
18324 * for it (preferably like 'A', but we don't warn if it is
18325 * a (portable) Unicode name or code point), and the range
18326 * must be be all digits or all letters of the same case.
18327 * Otherwise, the range is non-portable and unclear as to
18328 * what it contains */
18329 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18330 && ( non_portable_endpoint
18331 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18332 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18333 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18335 vWARN(RExC_parse, "Ranges of ASCII printables should"
18336 " be some subset of \"0-9\","
18337 " \"A-Z\", or \"a-z\"");
18339 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18340 SSize_t index_start;
18341 SSize_t index_final;
18343 /* But the nature of Unicode and languages mean we
18344 * can't do the same checks for above-ASCII ranges,
18345 * except in the case of digit ones. These should
18346 * contain only digits from the same group of 10. The
18347 * ASCII case is handled just above. Hence here, the
18348 * range could be a range of digits. First some
18349 * unlikely special cases. Grandfather in that a range
18350 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18351 * if its starting value is one of the 10 digits prior
18352 * to it. This is because it is an alternate way of
18353 * writing 19D1, and some people may expect it to be in
18354 * that group. But it is bad, because it won't give
18355 * the expected results. In Unicode 5.2 it was
18356 * considered to be in that group (of 11, hence), but
18357 * this was fixed in the next version */
18359 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18360 goto warn_bad_digit_range;
18362 else if (UNLIKELY( prevvalue >= 0x1D7CE
18363 && value <= 0x1D7FF))
18365 /* This is the only other case currently in Unicode
18366 * where the algorithm below fails. The code
18367 * points just above are the end points of a single
18368 * range containing only decimal digits. It is 5
18369 * different series of 0-9. All other ranges of
18370 * digits currently in Unicode are just a single
18371 * series. (And mktables will notify us if a later
18372 * Unicode version breaks this.)
18374 * If the range being checked is at most 9 long,
18375 * and the digit values represented are in
18376 * numerical order, they are from the same series.
18378 if ( value - prevvalue > 9
18379 || ((( value - 0x1D7CE) % 10)
18380 <= (prevvalue - 0x1D7CE) % 10))
18382 goto warn_bad_digit_range;
18387 /* For all other ranges of digits in Unicode, the
18388 * algorithm is just to check if both end points
18389 * are in the same series, which is the same range.
18391 index_start = _invlist_search(
18392 PL_XPosix_ptrs[_CC_DIGIT],
18395 /* Warn if the range starts and ends with a digit,
18396 * and they are not in the same group of 10. */
18397 if ( index_start >= 0
18398 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18400 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18401 value)) != index_start
18402 && index_final >= 0
18403 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18405 warn_bad_digit_range:
18406 vWARN(RExC_parse, "Ranges of digits should be"
18407 " from the same group of"
18414 if ((! range || prevvalue == value) && non_portable_endpoint) {
18415 if (isPRINT_A(value)) {
18418 if (isBACKSLASHED_PUNCT(value)) {
18419 literal[d++] = '\\';
18421 literal[d++] = (char) value;
18422 literal[d++] = '\0';
18425 "\"%.*s\" is more clearly written simply as \"%s\"",
18426 (int) (RExC_parse - rangebegin),
18431 else if (isMNEMONIC_CNTRL(value)) {
18433 "\"%.*s\" is more clearly written simply as \"%s\"",
18434 (int) (RExC_parse - rangebegin),
18436 cntrl_to_mnemonic((U8) value)
18442 /* Deal with this element of the class */
18445 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18448 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18449 * that don't require special handling, we can just add the range like
18450 * we do for ASCII platforms */
18451 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18452 || ! (prevvalue < 256
18454 || (! non_portable_endpoint
18455 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18456 || (isUPPER_A(prevvalue)
18457 && isUPPER_A(value)))))))
18459 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18463 /* Here, requires special handling. This can be because it is a
18464 * range whose code points are considered to be Unicode, and so
18465 * must be individually translated into native, or because its a
18466 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18467 * EBCDIC, but we have defined them to include only the "expected"
18468 * upper or lower case ASCII alphabetics. Subranges above 255 are
18469 * the same in native and Unicode, so can be added as a range */
18470 U8 start = NATIVE_TO_LATIN1(prevvalue);
18472 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18473 for (j = start; j <= end; j++) {
18474 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18477 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18483 range = 0; /* this range (if it was one) is done now */
18484 } /* End of loop through all the text within the brackets */
18486 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18487 output_posix_warnings(pRExC_state, posix_warnings);
18490 /* If anything in the class expands to more than one character, we have to
18491 * deal with them by building up a substitute parse string, and recursively
18492 * calling reg() on it, instead of proceeding */
18493 if (multi_char_matches) {
18494 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18497 char *save_end = RExC_end;
18498 char *save_parse = RExC_parse;
18499 char *save_start = RExC_start;
18500 Size_t constructed_prefix_len = 0; /* This gives the length of the
18501 constructed portion of the
18502 substitute parse. */
18503 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18508 /* Only one level of recursion allowed */
18509 assert(RExC_copy_start_in_constructed == RExC_precomp);
18511 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18512 because too confusing */
18514 sv_catpvs(substitute_parse, "(?:");
18518 /* Look at the longest strings first */
18519 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18524 if (av_exists(multi_char_matches, cp_count)) {
18525 AV** this_array_ptr;
18528 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18530 while ((this_sequence = av_pop(*this_array_ptr)) !=
18533 if (! first_time) {
18534 sv_catpvs(substitute_parse, "|");
18536 first_time = FALSE;
18538 sv_catpv(substitute_parse, SvPVX(this_sequence));
18543 /* If the character class contains anything else besides these
18544 * multi-character strings, have to include it in recursive parsing */
18545 if (element_count) {
18546 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18548 sv_catpvs(substitute_parse, "|");
18549 if (has_l_bracket) { /* Add an [ if the original had one */
18550 sv_catpvs(substitute_parse, "[");
18552 constructed_prefix_len = SvCUR(substitute_parse);
18553 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18555 /* Put in a closing ']' to match any opening one, but not if going
18556 * off the end, as otherwise we are adding something that really
18558 if (has_l_bracket && RExC_parse < RExC_end) {
18559 sv_catpvs(substitute_parse, "]");
18563 sv_catpvs(substitute_parse, ")");
18566 /* This is a way to get the parse to skip forward a whole named
18567 * sequence instead of matching the 2nd character when it fails the
18569 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18573 /* Set up the data structure so that any errors will be properly
18574 * reported. See the comments at the definition of
18575 * REPORT_LOCATION_ARGS for details */
18576 RExC_copy_start_in_input = (char *) orig_parse;
18577 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18578 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18579 RExC_end = RExC_parse + len;
18580 RExC_in_multi_char_class = 1;
18582 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18584 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18586 /* And restore so can parse the rest of the pattern */
18587 RExC_parse = save_parse;
18588 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18589 RExC_end = save_end;
18590 RExC_in_multi_char_class = 0;
18591 SvREFCNT_dec_NN(multi_char_matches);
18595 /* If folding, we calculate all characters that could fold to or from the
18596 * ones already on the list */
18597 if (cp_foldable_list) {
18599 UV start, end; /* End points of code point ranges */
18601 SV* fold_intersection = NULL;
18604 /* Our calculated list will be for Unicode rules. For locale
18605 * matching, we have to keep a separate list that is consulted at
18606 * runtime only when the locale indicates Unicode rules (and we
18607 * don't include potential matches in the ASCII/Latin1 range, as
18608 * any code point could fold to any other, based on the run-time
18609 * locale). For non-locale, we just use the general list */
18611 use_list = &only_utf8_locale_list;
18614 use_list = &cp_list;
18617 /* Only the characters in this class that participate in folds need
18618 * be checked. Get the intersection of this class and all the
18619 * possible characters that are foldable. This can quickly narrow
18620 * down a large class */
18621 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18622 &fold_intersection);
18624 /* Now look at the foldable characters in this class individually */
18625 invlist_iterinit(fold_intersection);
18626 while (invlist_iternext(fold_intersection, &start, &end)) {
18630 /* Look at every character in the range */
18631 for (j = start; j <= end; j++) {
18632 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18635 Size_t folds_count;
18637 const U32 * remaining_folds;
18641 /* Under /l, we don't know what code points below 256
18642 * fold to, except we do know the MICRO SIGN folds to
18643 * an above-255 character if the locale is UTF-8, so we
18644 * add it to the special list (in *use_list) Otherwise
18645 * we know now what things can match, though some folds
18646 * are valid under /d only if the target is UTF-8.
18647 * Those go in a separate list */
18648 if ( IS_IN_SOME_FOLD_L1(j)
18649 && ! (LOC && j != MICRO_SIGN))
18652 /* ASCII is always matched; non-ASCII is matched
18653 * only under Unicode rules (which could happen
18654 * under /l if the locale is a UTF-8 one */
18655 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18656 *use_list = add_cp_to_invlist(*use_list,
18657 PL_fold_latin1[j]);
18659 else if (j != PL_fold_latin1[j]) {
18660 upper_latin1_only_utf8_matches
18661 = add_cp_to_invlist(
18662 upper_latin1_only_utf8_matches,
18663 PL_fold_latin1[j]);
18667 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18668 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18670 add_above_Latin1_folds(pRExC_state,
18677 /* Here is an above Latin1 character. We don't have the
18678 * rules hard-coded for it. First, get its fold. This is
18679 * the simple fold, as the multi-character folds have been
18680 * handled earlier and separated out */
18681 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18682 (ASCII_FOLD_RESTRICTED)
18683 ? FOLD_FLAGS_NOMIX_ASCII
18686 /* Single character fold of above Latin1. Add everything
18687 * in its fold closure to the list that this node should
18689 folds_count = _inverse_folds(folded, &first_fold,
18691 for (k = 0; k <= folds_count; k++) {
18692 UV c = (k == 0) /* First time through use itself */
18694 : (k == 1) /* 2nd time use, the first fold */
18697 /* Then the remaining ones */
18698 : remaining_folds[k-2];
18700 /* /aa doesn't allow folds between ASCII and non- */
18701 if (( ASCII_FOLD_RESTRICTED
18702 && (isASCII(c) != isASCII(j))))
18707 /* Folds under /l which cross the 255/256 boundary are
18708 * added to a separate list. (These are valid only
18709 * when the locale is UTF-8.) */
18710 if (c < 256 && LOC) {
18711 *use_list = add_cp_to_invlist(*use_list, c);
18715 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18717 cp_list = add_cp_to_invlist(cp_list, c);
18720 /* Similarly folds involving non-ascii Latin1
18721 * characters under /d are added to their list */
18722 upper_latin1_only_utf8_matches
18723 = add_cp_to_invlist(
18724 upper_latin1_only_utf8_matches,
18730 SvREFCNT_dec_NN(fold_intersection);
18733 /* Now that we have finished adding all the folds, there is no reason
18734 * to keep the foldable list separate */
18735 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18736 SvREFCNT_dec_NN(cp_foldable_list);
18739 /* And combine the result (if any) with any inversion lists from posix
18740 * classes. The lists are kept separate up to now because we don't want to
18741 * fold the classes */
18742 if (simple_posixes) { /* These are the classes known to be unaffected by
18745 _invlist_union(cp_list, simple_posixes, &cp_list);
18746 SvREFCNT_dec_NN(simple_posixes);
18749 cp_list = simple_posixes;
18752 if (posixes || nposixes) {
18753 if (! DEPENDS_SEMANTICS) {
18755 /* For everything but /d, we can just add the current 'posixes' and
18756 * 'nposixes' to the main list */
18759 _invlist_union(cp_list, posixes, &cp_list);
18760 SvREFCNT_dec_NN(posixes);
18768 _invlist_union(cp_list, nposixes, &cp_list);
18769 SvREFCNT_dec_NN(nposixes);
18772 cp_list = nposixes;
18777 /* Under /d, things like \w match upper Latin1 characters only if
18778 * the target string is in UTF-8. But things like \W match all the
18779 * upper Latin1 characters if the target string is not in UTF-8.
18781 * Handle the case with something like \W separately */
18783 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18785 /* A complemented posix class matches all upper Latin1
18786 * characters if not in UTF-8. And it matches just certain
18787 * ones when in UTF-8. That means those certain ones are
18788 * matched regardless, so can just be added to the
18789 * unconditional list */
18791 _invlist_union(cp_list, nposixes, &cp_list);
18792 SvREFCNT_dec_NN(nposixes);
18796 cp_list = nposixes;
18799 /* Likewise for 'posixes' */
18800 _invlist_union(posixes, cp_list, &cp_list);
18801 SvREFCNT_dec(posixes);
18803 /* Likewise for anything else in the range that matched only
18805 if (upper_latin1_only_utf8_matches) {
18806 _invlist_union(cp_list,
18807 upper_latin1_only_utf8_matches,
18809 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18810 upper_latin1_only_utf8_matches = NULL;
18813 /* If we don't match all the upper Latin1 characters regardless
18814 * of UTF-8ness, we have to set a flag to match the rest when
18816 _invlist_subtract(only_non_utf8_list, cp_list,
18817 &only_non_utf8_list);
18818 if (_invlist_len(only_non_utf8_list) != 0) {
18819 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18821 SvREFCNT_dec_NN(only_non_utf8_list);
18824 /* Here there were no complemented posix classes. That means
18825 * the upper Latin1 characters in 'posixes' match only when the
18826 * target string is in UTF-8. So we have to add them to the
18827 * list of those types of code points, while adding the
18828 * remainder to the unconditional list.
18830 * First calculate what they are */
18831 SV* nonascii_but_latin1_properties = NULL;
18832 _invlist_intersection(posixes, PL_UpperLatin1,
18833 &nonascii_but_latin1_properties);
18835 /* And add them to the final list of such characters. */
18836 _invlist_union(upper_latin1_only_utf8_matches,
18837 nonascii_but_latin1_properties,
18838 &upper_latin1_only_utf8_matches);
18840 /* Remove them from what now becomes the unconditional list */
18841 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18844 /* And add those unconditional ones to the final list */
18846 _invlist_union(cp_list, posixes, &cp_list);
18847 SvREFCNT_dec_NN(posixes);
18854 SvREFCNT_dec(nonascii_but_latin1_properties);
18856 /* Get rid of any characters from the conditional list that we
18857 * now know are matched unconditionally, which may make that
18859 _invlist_subtract(upper_latin1_only_utf8_matches,
18861 &upper_latin1_only_utf8_matches);
18862 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18863 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18864 upper_latin1_only_utf8_matches = NULL;
18870 /* And combine the result (if any) with any inversion list from properties.
18871 * The lists are kept separate up to now so that we can distinguish the two
18872 * in regards to matching above-Unicode. A run-time warning is generated
18873 * if a Unicode property is matched against a non-Unicode code point. But,
18874 * we allow user-defined properties to match anything, without any warning,
18875 * and we also suppress the warning if there is a portion of the character
18876 * class that isn't a Unicode property, and which matches above Unicode, \W
18877 * or [\x{110000}] for example.
18878 * (Note that in this case, unlike the Posix one above, there is no
18879 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18880 * forces Unicode semantics */
18884 /* If it matters to the final outcome, see if a non-property
18885 * component of the class matches above Unicode. If so, the
18886 * warning gets suppressed. This is true even if just a single
18887 * such code point is specified, as, though not strictly correct if
18888 * another such code point is matched against, the fact that they
18889 * are using above-Unicode code points indicates they should know
18890 * the issues involved */
18892 warn_super = ! (invert
18893 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18896 _invlist_union(properties, cp_list, &cp_list);
18897 SvREFCNT_dec_NN(properties);
18900 cp_list = properties;
18905 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18907 /* Because an ANYOF node is the only one that warns, this node
18908 * can't be optimized into something else */
18909 optimizable = FALSE;
18913 /* Here, we have calculated what code points should be in the character
18916 * Now we can see about various optimizations. Fold calculation (which we
18917 * did above) needs to take place before inversion. Otherwise /[^k]/i
18918 * would invert to include K, which under /i would match k, which it
18919 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18920 * folded until runtime */
18922 /* If we didn't do folding, it's because some information isn't available
18923 * until runtime; set the run-time fold flag for these We know to set the
18924 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18925 * at least one 0-255 range code point */
18928 /* Some things on the list might be unconditionally included because of
18929 * other components. Remove them, and clean up the list if it goes to
18931 if (only_utf8_locale_list && cp_list) {
18932 _invlist_subtract(only_utf8_locale_list, cp_list,
18933 &only_utf8_locale_list);
18935 if (_invlist_len(only_utf8_locale_list) == 0) {
18936 SvREFCNT_dec_NN(only_utf8_locale_list);
18937 only_utf8_locale_list = NULL;
18940 if ( only_utf8_locale_list
18941 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18942 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18944 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18947 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18949 else if (cp_list && invlist_lowest(cp_list) < 256) {
18950 /* If nothing is below 256, has no locale dependency; otherwise it
18952 anyof_flags |= ANYOFL_FOLD;
18953 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18956 else if ( DEPENDS_SEMANTICS
18957 && ( upper_latin1_only_utf8_matches
18958 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18960 RExC_seen_d_op = TRUE;
18961 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18964 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18968 && ! has_runtime_dependency)
18970 _invlist_invert(cp_list);
18972 /* Clear the invert flag since have just done it here */
18976 /* All possible optimizations below still have these characteristics.
18977 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18979 *flagp |= HASWIDTH|SIMPLE;
18982 *ret_invlist = cp_list;
18984 return (cp_list) ? RExC_emit : 0;
18987 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18988 RExC_contains_locale = 1;
18991 /* Some character classes are equivalent to other nodes. Such nodes take
18992 * up less room, and some nodes require fewer operations to execute, than
18993 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
18994 * improve efficiency. */
18997 PERL_UINT_FAST8_T i;
18998 UV partial_cp_count = 0;
18999 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19000 UV end[MAX_FOLD_FROMS+1] = { 0 };
19001 bool single_range = FALSE;
19003 if (cp_list) { /* Count the code points in enough ranges that we would
19004 see all the ones possible in any fold in this version
19007 invlist_iterinit(cp_list);
19008 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19009 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19012 partial_cp_count += end[i] - start[i] + 1;
19016 single_range = TRUE;
19018 invlist_iterfinish(cp_list);
19021 /* If we know at compile time that this matches every possible code
19022 * point, any run-time dependencies don't matter */
19023 if (start[0] == 0 && end[0] == UV_MAX) {
19025 ret = reganode(pRExC_state, OPFAIL, 0);
19028 ret = reg_node(pRExC_state, SANY);
19034 /* Similarly, for /l posix classes, if both a class and its
19035 * complement match, any run-time dependencies don't matter */
19037 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19040 if ( POSIXL_TEST(posixl, namedclass) /* class */
19041 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19044 ret = reganode(pRExC_state, OPFAIL, 0);
19047 ret = reg_node(pRExC_state, SANY);
19054 /* For well-behaved locales, some classes are subsets of others,
19055 * so complementing the subset and including the non-complemented
19056 * superset should match everything, like [\D[:alnum:]], and
19057 * [[:^alpha:][:alnum:]], but some implementations of locales are
19058 * buggy, and khw thinks its a bad idea to have optimization change
19059 * behavior, even if it avoids an OS bug in a given case */
19061 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19063 /* If is a single posix /l class, can optimize to just that op.
19064 * Such a node will not match anything in the Latin1 range, as that
19065 * is not determinable until runtime, but will match whatever the
19066 * class does outside that range. (Note that some classes won't
19067 * match anything outside the range, like [:ascii:]) */
19068 if ( isSINGLE_BIT_SET(posixl)
19069 && (partial_cp_count == 0 || start[0] > 255))
19072 SV * class_above_latin1 = NULL;
19073 bool already_inverted;
19074 bool are_equivalent;
19076 /* Compute which bit is set, which is the same thing as, e.g.,
19077 * ANYOF_CNTRL. From
19078 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19080 static const int MultiplyDeBruijnBitPosition2[32] =
19082 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19083 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19086 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19087 * 0x077CB531U) >> 27];
19088 classnum = namedclass_to_classnum(namedclass);
19090 /* The named classes are such that the inverted number is one
19091 * larger than the non-inverted one */
19092 already_inverted = namedclass
19093 - classnum_to_namedclass(classnum);
19095 /* Create an inversion list of the official property, inverted
19096 * if the constructed node list is inverted, and restricted to
19097 * only the above latin1 code points, which are the only ones
19098 * known at compile time */
19099 _invlist_intersection_maybe_complement_2nd(
19101 PL_XPosix_ptrs[classnum],
19103 &class_above_latin1);
19104 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19106 SvREFCNT_dec_NN(class_above_latin1);
19108 if (are_equivalent) {
19110 /* Resolve the run-time inversion flag with this possibly
19111 * inverted class */
19112 invert = invert ^ already_inverted;
19114 ret = reg_node(pRExC_state,
19115 POSIXL + invert * (NPOSIXL - POSIXL));
19116 FLAGS(REGNODE_p(ret)) = classnum;
19122 /* khw can't think of any other possible transformation involving
19124 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19128 if (! has_runtime_dependency) {
19130 /* If the list is empty, nothing matches. This happens, for
19131 * example, when a Unicode property that doesn't match anything is
19132 * the only element in the character class (perluniprops.pod notes
19133 * such properties). */
19134 if (partial_cp_count == 0) {
19136 ret = reg_node(pRExC_state, SANY);
19139 ret = reganode(pRExC_state, OPFAIL, 0);
19145 /* If matches everything but \n */
19146 if ( start[0] == 0 && end[0] == '\n' - 1
19147 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19150 ret = reg_node(pRExC_state, REG_ANY);
19156 /* Next see if can optimize classes that contain just a few code points
19157 * into an EXACTish node. The reason to do this is to let the
19158 * optimizer join this node with adjacent EXACTish ones, and ANYOF
19159 * nodes require conversion to code point from UTF-8.
19161 * An EXACTFish node can be generated even if not under /i, and vice
19162 * versa. But care must be taken. An EXACTFish node has to be such
19163 * that it only matches precisely the code points in the class, but we
19164 * want to generate the least restrictive one that does that, to
19165 * increase the odds of being able to join with an adjacent node. For
19166 * example, if the class contains [kK], we have to make it an EXACTFAA
19167 * node to prevent the KELVIN SIGN from matching. Whether we are under
19168 * /i or not is irrelevant in this case. Less obvious is the pattern
19169 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
19170 * supposed to match the single character U+0149 LATIN SMALL LETTER N
19171 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
19172 * that includes \X{02BC}, there is a multi-char fold that does, and so
19173 * the node generated for it must be an EXACTFish one. On the other
19174 * hand qr/:/i should generate a plain EXACT node since the colon
19175 * participates in no fold whatsoever, and having it EXACT tells the
19176 * optimizer the target string cannot match unless it has a colon in
19182 /* Only try if there are no more code points in the class than
19183 * in the max possible fold */
19184 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19186 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19188 /* We can always make a single code point class into an
19189 * EXACTish node. */
19193 /* Here is /l: Use EXACTL, except if there is a fold not
19194 * known until runtime so shows as only a single code point
19195 * here. For code points above 255, we know which can
19196 * cause problems by having a potential fold to the Latin1
19199 || ( start[0] > 255
19200 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19208 else if (! FOLD) { /* Not /l and not /i */
19209 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19211 else if (start[0] < 256) { /* /i, not /l, and the code point is
19214 /* Under /i, it gets a little tricky. A code point that
19215 * doesn't participate in a fold should be an EXACT node.
19216 * We know this one isn't the result of a simple fold, or
19217 * there'd be more than one code point in the list, but it
19218 * could be part of a multi- character fold. In that case
19219 * we better not create an EXACT node, as we would wrongly
19220 * be telling the optimizer that this code point must be in
19221 * the target string, and that is wrong. This is because
19222 * if the sequence around this code point forms a
19223 * multi-char fold, what needs to be in the string could be
19224 * the code point that folds to the sequence.
19226 * This handles the case of below-255 code points, as we
19227 * have an easy look up for those. The next clause handles
19228 * the above-256 one */
19229 op = IS_IN_SOME_FOLD_L1(start[0])
19233 else { /* /i, larger code point. Since we are under /i, and
19234 have just this code point, we know that it can't
19235 fold to something else, so PL_InMultiCharFold
19237 op = _invlist_contains_cp(PL_InMultiCharFold,
19245 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19246 && _invlist_contains_cp(PL_in_some_fold, start[0]))
19248 /* Here, the only runtime dependency, if any, is from /d, and
19249 * the class matches more than one code point, and the lowest
19250 * code point participates in some fold. It might be that the
19251 * other code points are /i equivalent to this one, and hence
19252 * they would representable by an EXACTFish node. Above, we
19253 * eliminated classes that contain too many code points to be
19254 * EXACTFish, with the test for MAX_FOLD_FROMS
19256 * First, special case the ASCII fold pairs, like 'B' and 'b'.
19257 * We do this because we have EXACTFAA at our disposal for the
19259 if (partial_cp_count == 2 && isASCII(start[0])) {
19261 /* The only ASCII characters that participate in folds are
19263 assert(isALPHA(start[0]));
19264 if ( end[0] == start[0] /* First range is a single
19265 character, so 2nd exists */
19266 && isALPHA_FOLD_EQ(start[0], start[1]))
19269 /* Here, is part of an ASCII fold pair */
19271 if ( ASCII_FOLD_RESTRICTED
19272 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19274 /* If the second clause just above was true, it
19275 * means we can't be under /i, or else the list
19276 * would have included more than this fold pair.
19277 * Therefore we have to exclude the possibility of
19278 * whatever else it is that folds to these, by
19279 * using EXACTFAA */
19282 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19284 /* Here, there's no simple fold that start[0] is part
19285 * of, but there is a multi-character one. If we
19286 * are not under /i, we want to exclude that
19287 * possibility; if under /i, we want to include it
19289 op = (FOLD) ? EXACTFU : EXACTFAA;
19293 /* Here, the only possible fold start[0] particpates in
19294 * is with start[1]. /i or not isn't relevant */
19298 value = toFOLD(start[0]);
19301 else if ( ! upper_latin1_only_utf8_matches
19302 || ( _invlist_len(upper_latin1_only_utf8_matches)
19305 invlist_highest(upper_latin1_only_utf8_matches)]
19308 /* Here, the smallest character is non-ascii or there are
19309 * more than 2 code points matched by this node. Also, we
19310 * either don't have /d UTF-8 dependent matches, or if we
19311 * do, they look like they could be a single character that
19312 * is the fold of the lowest one in the always-match list.
19313 * This test quickly excludes most of the false positives
19314 * when there are /d UTF-8 depdendent matches. These are
19315 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19316 * SMALL LETTER A WITH GRAVE iff the target string is
19317 * UTF-8. (We don't have to worry above about exceeding
19318 * the array bounds of PL_fold_latin1[] because any code
19319 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19321 * EXACTFAA would apply only to pairs (hence exactly 2 code
19322 * points) in the ASCII range, so we can't use it here to
19323 * artificially restrict the fold domain, so we check if
19324 * the class does or does not match some EXACTFish node.
19325 * Further, if we aren't under /i, and and the folded-to
19326 * character is part of a multi-character fold, we can't do
19327 * this optimization, as the sequence around it could be
19328 * that multi-character fold, and we don't here know the
19329 * context, so we have to assume it is that multi-char
19330 * fold, to prevent potential bugs.
19332 * To do the general case, we first find the fold of the
19333 * lowest code point (which may be higher than the lowest
19334 * one), then find everything that folds to it. (The data
19335 * structure we have only maps from the folded code points,
19336 * so we have to do the earlier step.) */
19339 U8 foldbuf[UTF8_MAXBYTES_CASE];
19340 UV folded = _to_uni_fold_flags(start[0],
19341 foldbuf, &foldlen, 0);
19343 const U32 * remaining_folds;
19344 Size_t folds_to_this_cp_count = _inverse_folds(
19348 Size_t folds_count = folds_to_this_cp_count + 1;
19349 SV * fold_list = _new_invlist(folds_count);
19352 /* If there are UTF-8 dependent matches, create a temporary
19353 * list of what this node matches, including them. */
19354 SV * all_cp_list = NULL;
19355 SV ** use_this_list = &cp_list;
19357 if (upper_latin1_only_utf8_matches) {
19358 all_cp_list = _new_invlist(0);
19359 use_this_list = &all_cp_list;
19360 _invlist_union(cp_list,
19361 upper_latin1_only_utf8_matches,
19365 /* Having gotten everything that participates in the fold
19366 * containing the lowest code point, we turn that into an
19367 * inversion list, making sure everything is included. */
19368 fold_list = add_cp_to_invlist(fold_list, start[0]);
19369 fold_list = add_cp_to_invlist(fold_list, folded);
19370 if (folds_to_this_cp_count > 0) {
19371 fold_list = add_cp_to_invlist(fold_list, first_fold);
19372 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19373 fold_list = add_cp_to_invlist(fold_list,
19374 remaining_folds[i]);
19378 /* If the fold list is identical to what's in this ANYOF
19379 * node, the node can be represented by an EXACTFish one
19381 if (_invlistEQ(*use_this_list, fold_list,
19382 0 /* Don't complement */ )
19385 /* But, we have to be careful, as mentioned above.
19386 * Just the right sequence of characters could match
19387 * this if it is part of a multi-character fold. That
19388 * IS what we want if we are under /i. But it ISN'T
19389 * what we want if not under /i, as it could match when
19390 * it shouldn't. So, when we aren't under /i and this
19391 * character participates in a multi-char fold, we
19392 * don't optimize into an EXACTFish node. So, for each
19393 * case below we have to check if we are folding
19394 * and if not, if it is not part of a multi-char fold.
19396 if (start[0] > 255) { /* Highish code point */
19397 if (FOLD || ! _invlist_contains_cp(
19398 PL_InMultiCharFold, folded))
19402 : (ASCII_FOLD_RESTRICTED)
19407 } /* Below, the lowest code point < 256 */
19410 && DEPENDS_SEMANTICS)
19411 { /* An EXACTF node containing a single character
19412 's', can be an EXACTFU if it doesn't get
19413 joined with an adjacent 's' */
19414 op = EXACTFU_S_EDGE;
19418 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19420 if (upper_latin1_only_utf8_matches) {
19423 /* We can't use the fold, as that only matches
19427 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19429 { /* EXACTFUP is a special node for this
19431 op = (ASCII_FOLD_RESTRICTED)
19434 value = MICRO_SIGN;
19436 else if ( ASCII_FOLD_RESTRICTED
19437 && ! isASCII(start[0]))
19438 { /* For ASCII under /iaa, we can use EXACTFU
19450 SvREFCNT_dec_NN(fold_list);
19451 SvREFCNT_dec(all_cp_list);
19458 /* Here, we have calculated what EXACTish node to use. Have to
19459 * convert to UTF-8 if not already there */
19462 SvREFCNT_dec(cp_list);;
19463 REQUIRE_UTF8(flagp);
19466 /* This is a kludge to the special casing issues with this
19467 * ligature under /aa. FB05 should fold to FB06, but the
19468 * call above to _to_uni_fold_flags() didn't find this, as
19469 * it didn't use the /aa restriction in order to not miss
19470 * other folds that would be affected. This is the only
19471 * instance likely to ever be a problem in all of Unicode.
19472 * So special case it. */
19473 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19474 && ASCII_FOLD_RESTRICTED)
19476 value = LATIN_SMALL_LIGATURE_ST;
19480 len = (UTF) ? UVCHR_SKIP(value) : 1;
19482 ret = regnode_guts(pRExC_state, op, len, "exact");
19483 FILL_NODE(ret, op);
19484 RExC_emit += 1 + STR_SZ(len);
19485 setSTR_LEN(REGNODE_p(ret), len);
19487 *STRINGs(REGNODE_p(ret)) = (U8) value;
19490 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19496 if (! has_runtime_dependency) {
19498 /* See if this can be turned into an ANYOFM node. Think about the
19499 * bit patterns in two different bytes. In some positions, the
19500 * bits in each will be 1; and in other positions both will be 0;
19501 * and in some positions the bit will be 1 in one byte, and 0 in
19502 * the other. Let 'n' be the number of positions where the bits
19503 * differ. We create a mask which has exactly 'n' 0 bits, each in
19504 * a position where the two bytes differ. Now take the set of all
19505 * bytes that when ANDed with the mask yield the same result. That
19506 * set has 2**n elements, and is representable by just two 8 bit
19507 * numbers: the result and the mask. Importantly, matching the set
19508 * can be vectorized by creating a word full of the result bytes,
19509 * and a word full of the mask bytes, yielding a significant speed
19510 * up. Here, see if this node matches such a set. As a concrete
19511 * example consider [01], and the byte representing '0' which is
19512 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19513 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19514 * 0x30. Any other bytes ANDed yield something else. So [01],
19515 * which is a common usage, is optimizable into ANYOFM, and can
19516 * benefit from the speed up. We can only do this on UTF-8
19517 * invariant bytes, because they have the same bit patterns under
19519 PERL_UINT_FAST8_T inverted = 0;
19521 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19523 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19525 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19526 * If that works we will instead later generate an NANYOFM, and
19527 * invert back when through */
19528 if (invlist_highest(cp_list) > max_permissible) {
19529 _invlist_invert(cp_list);
19533 if (invlist_highest(cp_list) <= max_permissible) {
19534 UV this_start, this_end;
19535 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19536 U8 bits_differing = 0;
19537 Size_t full_cp_count = 0;
19538 bool first_time = TRUE;
19540 /* Go through the bytes and find the bit positions that differ
19542 invlist_iterinit(cp_list);
19543 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19544 unsigned int i = this_start;
19547 if (! UVCHR_IS_INVARIANT(i)) {
19551 first_time = FALSE;
19552 lowest_cp = this_start;
19554 /* We have set up the code point to compare with.
19555 * Don't compare it with itself */
19559 /* Find the bit positions that differ from the lowest code
19560 * point in the node. Keep track of all such positions by
19562 for (; i <= this_end; i++) {
19563 if (! UVCHR_IS_INVARIANT(i)) {
19567 bits_differing |= i ^ lowest_cp;
19570 full_cp_count += this_end - this_start + 1;
19573 /* At the end of the loop, we count how many bits differ from
19574 * the bits in lowest code point, call the count 'd'. If the
19575 * set we found contains 2**d elements, it is the closure of
19576 * all code points that differ only in those bit positions. To
19577 * convince yourself of that, first note that the number in the
19578 * closure must be a power of 2, which we test for. The only
19579 * way we could have that count and it be some differing set,
19580 * is if we got some code points that don't differ from the
19581 * lowest code point in any position, but do differ from each
19582 * other in some other position. That means one code point has
19583 * a 1 in that position, and another has a 0. But that would
19584 * mean that one of them differs from the lowest code point in
19585 * that position, which possibility we've already excluded. */
19586 if ( (inverted || full_cp_count > 1)
19587 && full_cp_count == 1U << PL_bitcount[bits_differing])
19591 op = ANYOFM + inverted;;
19593 /* We need to make the bits that differ be 0's */
19594 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19596 /* The argument is the lowest code point */
19597 ret = reganode(pRExC_state, op, lowest_cp);
19598 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19602 invlist_iterfinish(cp_list);
19606 _invlist_invert(cp_list);
19613 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19614 * all were invariants, it wasn't inverted, and there is a single
19615 * range. This would be faster than some of the posix nodes we
19616 * create below like /\d/a, but would be twice the size. Without
19617 * having actually measured the gain, khw doesn't think the
19618 * tradeoff is really worth it */
19621 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19622 PERL_UINT_FAST8_T type;
19623 SV * intersection = NULL;
19624 SV* d_invlist = NULL;
19626 /* See if this matches any of the POSIX classes. The POSIXA and
19627 * POSIXD ones are about the same speed as ANYOF ops, but take less
19628 * room; the ones that have above-Latin1 code point matches are
19629 * somewhat faster than ANYOF. */
19631 for (type = POSIXA; type >= POSIXD; type--) {
19634 if (type == POSIXL) { /* But not /l posix classes */
19638 for (posix_class = 0;
19639 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19642 SV** our_code_points = &cp_list;
19643 SV** official_code_points;
19646 if (type == POSIXA) {
19647 official_code_points = &PL_Posix_ptrs[posix_class];
19650 official_code_points = &PL_XPosix_ptrs[posix_class];
19653 /* Skip non-existent classes of this type. e.g. \v only
19654 * has an entry in PL_XPosix_ptrs */
19655 if (! *official_code_points) {
19659 /* Try both the regular class, and its inversion */
19660 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19661 bool this_inverted = invert ^ try_inverted;
19663 if (type != POSIXD) {
19665 /* This class that isn't /d can't match if we have
19666 * /d dependencies */
19667 if (has_runtime_dependency
19668 & HAS_D_RUNTIME_DEPENDENCY)
19673 else /* is /d */ if (! this_inverted) {
19675 /* /d classes don't match anything non-ASCII below
19676 * 256 unconditionally (which cp_list contains) */
19677 _invlist_intersection(cp_list, PL_UpperLatin1,
19679 if (_invlist_len(intersection) != 0) {
19683 SvREFCNT_dec(d_invlist);
19684 d_invlist = invlist_clone(cp_list, NULL);
19686 /* But under UTF-8 it turns into using /u rules.
19687 * Add the things it matches under these conditions
19688 * so that we check below that these are identical
19689 * to what the tested class should match */
19690 if (upper_latin1_only_utf8_matches) {
19693 upper_latin1_only_utf8_matches,
19696 our_code_points = &d_invlist;
19698 else { /* POSIXD, inverted. If this doesn't have this
19699 flag set, it isn't /d. */
19700 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19704 our_code_points = &cp_list;
19707 /* Here, have weeded out some things. We want to see
19708 * if the list of characters this node contains
19709 * ('*our_code_points') precisely matches those of the
19710 * class we are currently checking against
19711 * ('*official_code_points'). */
19712 if (_invlistEQ(*our_code_points,
19713 *official_code_points,
19716 /* Here, they precisely match. Optimize this ANYOF
19717 * node into its equivalent POSIX one of the
19718 * correct type, possibly inverted */
19719 ret = reg_node(pRExC_state, (try_inverted)
19723 FLAGS(REGNODE_p(ret)) = posix_class;
19724 SvREFCNT_dec(d_invlist);
19725 SvREFCNT_dec(intersection);
19731 SvREFCNT_dec(d_invlist);
19732 SvREFCNT_dec(intersection);
19735 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19736 * both in size and speed. Currently, a 20 bit range base (smallest
19737 * code point in the range), and a 12 bit maximum delta are packed into
19738 * a 32 bit word. This allows for using it on all of the Unicode code
19739 * points except for the highest plane, which is only for private use
19740 * code points. khw doubts that a bigger delta is likely in real world
19743 && ! has_runtime_dependency
19744 && anyof_flags == 0
19745 && start[0] < (1 << ANYOFR_BASE_BITS)
19746 && end[0] - start[0]
19747 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19748 * CHARBITS - ANYOFR_BASE_BITS))))
19751 U8 low_utf8[UTF8_MAXBYTES+1];
19752 U8 high_utf8[UTF8_MAXBYTES+1];
19754 ret = reganode(pRExC_state, ANYOFR,
19755 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19757 /* Place the lowest UTF-8 start byte in the flags field, so as to
19758 * allow efficient ruling out at run time of many possible inputs.
19760 (void) uvchr_to_utf8(low_utf8, start[0]);
19761 (void) uvchr_to_utf8(high_utf8, end[0]);
19763 /* If all code points share the same first byte, this can be an
19764 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19765 * quickly rule out many inputs at run-time without having to
19766 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19767 * not doing that transformation would not rule out nearly so many
19769 if (low_utf8[0] == high_utf8[0]) {
19770 OP(REGNODE_p(ret)) = ANYOFRb;
19771 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19774 ANYOF_FLAGS(REGNODE_p(ret))
19775 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19781 /* If didn't find an optimization and there is no need for a bitmap,
19782 * optimize to indicate that */
19783 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19785 && ! upper_latin1_only_utf8_matches
19786 && anyof_flags == 0)
19788 U8 low_utf8[UTF8_MAXBYTES+1];
19789 UV highest_cp = invlist_highest(cp_list);
19791 /* Currently the maximum allowed code point by the system is
19792 * IV_MAX. Higher ones are reserved for future internal use. This
19793 * particular regnode can be used for higher ones, but we can't
19794 * calculate the code point of those. IV_MAX suffices though, as
19795 * it will be a large first byte */
19796 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19799 /* We store the lowest possible first byte of the UTF-8
19800 * representation, using the flags field. This allows for quick
19801 * ruling out of some inputs without having to convert from UTF-8
19802 * to code point. For EBCDIC, we use I8, as not doing that
19803 * transformation would not rule out nearly so many things */
19804 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19808 /* If the first UTF-8 start byte for the highest code point in the
19809 * range is suitably small, we may be able to get an upper bound as
19811 if (highest_cp <= IV_MAX) {
19812 U8 high_utf8[UTF8_MAXBYTES+1];
19813 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19816 /* If the lowest and highest are the same, we can get an exact
19817 * first byte instead of a just minimum or even a sequence of
19818 * exact leading bytes. We signal these with different
19820 if (low_utf8[0] == high_utf8[0]) {
19821 Size_t len = find_first_differing_byte_pos(low_utf8,
19823 MIN(low_len, high_len));
19827 /* No need to convert to I8 for EBCDIC as this is an
19829 anyof_flags = low_utf8[0];
19834 ret = regnode_guts(pRExC_state, op,
19835 regarglen[op] + STR_SZ(len),
19837 FILL_NODE(ret, op);
19838 ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19840 Copy(low_utf8, /* Add the common bytes */
19841 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19843 RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19844 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19845 NULL, only_utf8_locale_list);
19849 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19852 /* Here, the high byte is not the same as the low, but is
19853 * small enough that its reasonable to have a loose upper
19854 * bound, which is packed in with the strict lower bound.
19855 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19856 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19857 * is the same thing as UTF-8 */
19860 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19861 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19864 if (range_diff <= max_range_diff / 8) {
19867 else if (range_diff <= max_range_diff / 4) {
19870 else if (range_diff <= max_range_diff / 2) {
19873 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19878 goto done_finding_op;
19880 } /* End of seeing if can optimize it into a different node */
19882 is_anyof: /* It's going to be an ANYOF node. */
19883 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19893 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19894 FILL_NODE(ret, op); /* We set the argument later */
19895 RExC_emit += 1 + regarglen[op];
19896 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19898 /* Here, <cp_list> contains all the code points we can determine at
19899 * compile time that match under all conditions. Go through it, and
19900 * for things that belong in the bitmap, put them there, and delete from
19901 * <cp_list>. While we are at it, see if everything above 255 is in the
19902 * list, and if so, set a flag to speed up execution */
19904 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19907 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19911 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19914 /* Here, the bitmap has been populated with all the Latin1 code points that
19915 * always match. Can now add to the overall list those that match only
19916 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19918 if (upper_latin1_only_utf8_matches) {
19920 _invlist_union(cp_list,
19921 upper_latin1_only_utf8_matches,
19923 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19926 cp_list = upper_latin1_only_utf8_matches;
19928 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19931 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19932 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19935 only_utf8_locale_list);
19936 SvREFCNT_dec(cp_list);;
19937 SvREFCNT_dec(only_utf8_locale_list);
19942 /* Here, the node is getting optimized into something that's not an ANYOF
19943 * one. Finish up. */
19945 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19946 RExC_parse - orig_parse);;
19947 SvREFCNT_dec(cp_list);;
19948 SvREFCNT_dec(only_utf8_locale_list);
19952 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19955 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19956 regnode* const node,
19958 SV* const runtime_defns,
19959 SV* const only_utf8_locale_list)
19961 /* Sets the arg field of an ANYOF-type node 'node', using information about
19962 * the node passed-in. If there is nothing outside the node's bitmap, the
19963 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19964 * the count returned by add_data(), having allocated and stored an array,
19967 * av[0] stores the inversion list defining this class as far as known at
19968 * this time, or PL_sv_undef if nothing definite is now known.
19969 * av[1] stores the inversion list of code points that match only if the
19970 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19971 * av[2], or no entry otherwise.
19972 * av[2] stores the list of user-defined properties whose subroutine
19973 * definitions aren't known at this time, or no entry if none. */
19977 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19979 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19980 assert(! (ANYOF_FLAGS(node)
19981 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19982 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19985 AV * const av = newAV();
19989 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
19992 if (only_utf8_locale_list) {
19993 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
19994 SvREFCNT_inc_NN(only_utf8_locale_list));
19997 if (runtime_defns) {
19998 av_store(av, DEFERRED_USER_DEFINED_INDEX,
19999 SvREFCNT_inc_NN(runtime_defns));
20002 rv = newRV_noinc(MUTABLE_SV(av));
20003 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20004 RExC_rxi->data->data[n] = (void*)rv;
20011 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20012 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20014 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20018 /* For internal core use only.
20019 * Returns the inversion list for the input 'node' in the regex 'prog'.
20020 * If <doinit> is 'true', will attempt to create the inversion list if not
20022 * If <listsvp> is non-null, will return the printable contents of the
20023 * property definition. This can be used to get debugging information
20024 * even before the inversion list exists, by calling this function with
20025 * 'doinit' set to false, in which case the components that will be used
20026 * to eventually create the inversion list are returned (in a printable
20028 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20029 * store an inversion list of code points that should match only if the
20030 * execution-time locale is a UTF-8 one.
20031 * If <output_invlist> is not NULL, it is where this routine is to store an
20032 * inversion list of the code points that would be instead returned in
20033 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20034 * when this parameter is used, is just the non-code point data that
20035 * will go into creating the inversion list. This currently should be just
20036 * user-defined properties whose definitions were not known at compile
20037 * time. Using this parameter allows for easier manipulation of the
20038 * inversion list's data by the caller. It is illegal to call this
20039 * function with this parameter set, but not <listsvp>
20041 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20042 * that, in spite of this function's name, the inversion list it returns
20043 * may include the bitmap data as well */
20045 SV *si = NULL; /* Input initialization string */
20046 SV* invlist = NULL;
20048 RXi_GET_DECL(prog, progi);
20049 const struct reg_data * const data = prog ? progi->data : NULL;
20051 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20052 PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20054 PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20056 assert(! output_invlist || listsvp);
20058 if (data && data->count) {
20059 const U32 n = ARG(node);
20061 if (data->what[n] == 's') {
20062 SV * const rv = MUTABLE_SV(data->data[n]);
20063 AV * const av = MUTABLE_AV(SvRV(rv));
20064 SV **const ary = AvARRAY(av);
20066 invlist = ary[INVLIST_INDEX];
20068 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20069 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20072 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20073 si = ary[DEFERRED_USER_DEFINED_INDEX];
20076 if (doinit && (si || invlist)) {
20079 SV * msg = newSVpvs_flags("", SVs_TEMP);
20081 SV * prop_definition = handle_user_defined_property(
20082 "", 0, FALSE, /* There is no \p{}, \P{} */
20083 SvPVX_const(si)[1] - '0', /* /i or not has been
20084 stored here for just
20086 TRUE, /* run time */
20087 FALSE, /* This call must find the defn */
20088 si, /* The property definition */
20091 0 /* base level call */
20095 assert(prop_definition == NULL);
20097 Perl_croak(aTHX_ "%" UTF8f,
20098 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20102 _invlist_union(invlist, prop_definition, &invlist);
20103 SvREFCNT_dec_NN(prop_definition);
20106 invlist = prop_definition;
20109 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20110 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20112 ary[INVLIST_INDEX] = invlist;
20113 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20114 ? ONLY_LOCALE_MATCHES_INDEX
20122 /* If requested, return a printable version of what this ANYOF node matches
20125 SV* matches_string = NULL;
20127 /* This function can be called at compile-time, before everything gets
20128 * resolved, in which case we return the currently best available
20129 * information, which is the string that will eventually be used to do
20130 * that resolving, 'si' */
20132 /* Here, we only have 'si' (and possibly some passed-in data in
20133 * 'invlist', which is handled below) If the caller only wants
20134 * 'si', use that. */
20135 if (! output_invlist) {
20136 matches_string = newSVsv(si);
20139 /* But if the caller wants an inversion list of the node, we
20140 * need to parse 'si' and place as much as possible in the
20141 * desired output inversion list, making 'matches_string' only
20142 * contain the currently unresolvable things */
20143 const char *si_string = SvPVX(si);
20144 STRLEN remaining = SvCUR(si);
20148 /* Ignore everything before and including the first new-line */
20149 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20150 assert (si_string != NULL);
20152 remaining = SvPVX(si) + SvCUR(si) - si_string;
20154 while (remaining > 0) {
20156 /* The data consists of just strings defining user-defined
20157 * property names, but in prior incarnations, and perhaps
20158 * somehow from pluggable regex engines, it could still
20159 * hold hex code point definitions, all of which should be
20160 * legal (or it wouldn't have gotten this far). Each
20161 * component of a range would be separated by a tab, and
20162 * each range by a new-line. If these are found, instead
20163 * add them to the inversion list */
20164 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
20165 |PERL_SCAN_SILENT_NON_PORTABLE;
20166 STRLEN len = remaining;
20167 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20169 /* If the hex decode routine found something, it should go
20170 * up to the next \n */
20171 if ( *(si_string + len) == '\n') {
20172 if (count) { /* 2nd code point on line */
20173 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20176 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20179 goto prepare_for_next_iteration;
20182 /* If the hex decode was instead for the lower range limit,
20183 * save it, and go parse the upper range limit */
20184 if (*(si_string + len) == '\t') {
20185 assert(count == 0);
20189 prepare_for_next_iteration:
20190 si_string += len + 1;
20191 remaining -= len + 1;
20195 /* Here, didn't find a legal hex number. Just add the text
20196 * from here up to the next \n, omitting any trailing
20200 len = strcspn(si_string,
20201 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20203 if (matches_string) {
20204 sv_catpvn(matches_string, si_string, len);
20207 matches_string = newSVpvn(si_string, len);
20209 sv_catpvs(matches_string, " ");
20213 && UCHARAT(si_string)
20214 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20219 if (remaining && UCHARAT(si_string) == '\n') {
20223 } /* end of loop through the text */
20225 assert(matches_string);
20226 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
20227 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20229 } /* end of has an 'si' */
20232 /* Add the stuff that's already known */
20235 /* Again, if the caller doesn't want the output inversion list, put
20236 * everything in 'matches-string' */
20237 if (! output_invlist) {
20238 if ( ! matches_string) {
20239 matches_string = newSVpvs("\n");
20241 sv_catsv(matches_string, invlist_contents(invlist,
20242 TRUE /* traditional style */
20245 else if (! *output_invlist) {
20246 *output_invlist = invlist_clone(invlist, NULL);
20249 _invlist_union(*output_invlist, invlist, output_invlist);
20253 *listsvp = matches_string;
20259 /* reg_skipcomment()
20261 Absorbs an /x style # comment from the input stream,
20262 returning a pointer to the first character beyond the comment, or if the
20263 comment terminates the pattern without anything following it, this returns
20264 one past the final character of the pattern (in other words, RExC_end) and
20265 sets the REG_RUN_ON_COMMENT_SEEN flag.
20267 Note it's the callers responsibility to ensure that we are
20268 actually in /x mode
20272 PERL_STATIC_INLINE char*
20273 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20275 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20279 while (p < RExC_end) {
20280 if (*(++p) == '\n') {
20285 /* we ran off the end of the pattern without ending the comment, so we have
20286 * to add an \n when wrapping */
20287 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20292 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20294 const bool force_to_xmod
20297 /* If the text at the current parse position '*p' is a '(?#...)' comment,
20298 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20299 * is /x whitespace, advance '*p' so that on exit it points to the first
20300 * byte past all such white space and comments */
20302 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20304 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20306 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20309 if (RExC_end - (*p) >= 3
20311 && *(*p + 1) == '?'
20312 && *(*p + 2) == '#')
20314 while (*(*p) != ')') {
20315 if ((*p) == RExC_end)
20316 FAIL("Sequence (?#... not terminated");
20324 const char * save_p = *p;
20325 while ((*p) < RExC_end) {
20327 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20330 else if (*(*p) == '#') {
20331 (*p) = reg_skipcomment(pRExC_state, (*p));
20337 if (*p != save_p) {
20350 Advances the parse position by one byte, unless that byte is the beginning
20351 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20352 those two cases, the parse position is advanced beyond all such comments and
20355 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20359 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20361 PERL_ARGS_ASSERT_NEXTCHAR;
20363 if (RExC_parse < RExC_end) {
20365 || UTF8_IS_INVARIANT(*RExC_parse)
20366 || UTF8_IS_START(*RExC_parse));
20368 RExC_parse += (UTF)
20369 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20372 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20373 FALSE /* Don't force /x */ );
20378 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20380 /* 'size' is the delta number of smallest regnode equivalents to add or
20381 * subtract from the current memory allocated to the regex engine being
20384 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20389 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20390 /* +1 for REG_MAGIC */
20393 if ( RExC_rxi == NULL )
20394 FAIL("Regexp out of space");
20395 RXi_SET(RExC_rx, RExC_rxi);
20397 RExC_emit_start = RExC_rxi->program;
20399 Zero(REGNODE_p(RExC_emit), size, regnode);
20402 #ifdef RE_TRACK_PATTERN_OFFSETS
20403 Renew(RExC_offsets, 2*RExC_size+1, U32);
20405 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20407 RExC_offsets[0] = RExC_size;
20411 STATIC regnode_offset
20412 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20414 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20415 * equivalents space. It aligns and increments RExC_size
20417 * It returns the regnode's offset into the regex engine program */
20419 const regnode_offset ret = RExC_emit;
20421 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20423 PERL_ARGS_ASSERT_REGNODE_GUTS;
20425 SIZE_ALIGN(RExC_size);
20426 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20427 NODE_ALIGN_FILL(REGNODE_p(ret));
20428 #ifndef RE_TRACK_PATTERN_OFFSETS
20429 PERL_UNUSED_ARG(name);
20430 PERL_UNUSED_ARG(op);
20432 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20434 if (RExC_offsets) { /* MJD */
20436 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20439 (UV)(RExC_emit) > RExC_offsets[0]
20440 ? "Overwriting end of array!\n" : "OK",
20442 (UV)(RExC_parse - RExC_start),
20443 (UV)RExC_offsets[0]));
20444 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20451 - reg_node - emit a node
20453 STATIC regnode_offset /* Location. */
20454 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20456 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20457 regnode_offset ptr = ret;
20459 PERL_ARGS_ASSERT_REG_NODE;
20461 assert(regarglen[op] == 0);
20463 FILL_ADVANCE_NODE(ptr, op);
20469 - reganode - emit a node with an argument
20471 STATIC regnode_offset /* Location. */
20472 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20474 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20475 regnode_offset ptr = ret;
20477 PERL_ARGS_ASSERT_REGANODE;
20479 /* ANYOF are special cased to allow non-length 1 args */
20480 assert(regarglen[op] == 1);
20482 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20488 - regpnode - emit a temporary node with a SV* argument
20490 STATIC regnode_offset /* Location. */
20491 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20493 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20494 regnode_offset ptr = ret;
20496 PERL_ARGS_ASSERT_REGPNODE;
20498 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20503 STATIC regnode_offset
20504 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20506 /* emit a node with U32 and I32 arguments */
20508 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20509 regnode_offset ptr = ret;
20511 PERL_ARGS_ASSERT_REG2LANODE;
20513 assert(regarglen[op] == 2);
20515 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20521 - reginsert - insert an operator in front of already-emitted operand
20523 * That means that on exit 'operand' is the offset of the newly inserted
20524 * operator, and the original operand has been relocated.
20526 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20527 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20529 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20530 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20532 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20535 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20536 const regnode_offset operand, const U32 depth)
20541 const int offset = regarglen[(U8)op];
20542 const int size = NODE_STEP_REGNODE + offset;
20543 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20545 PERL_ARGS_ASSERT_REGINSERT;
20546 PERL_UNUSED_CONTEXT;
20547 PERL_UNUSED_ARG(depth);
20548 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20549 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20550 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20551 studying. If this is wrong then we need to adjust RExC_recurse
20552 below like we do with RExC_open_parens/RExC_close_parens. */
20553 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20554 src = REGNODE_p(RExC_emit);
20556 dst = REGNODE_p(RExC_emit);
20558 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20559 * and [perl #133871] shows this can lead to problems, so skip this
20560 * realignment of parens until a later pass when they are reliable */
20561 if (! IN_PARENS_PASS && RExC_open_parens) {
20563 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20564 /* remember that RExC_npar is rex->nparens + 1,
20565 * iow it is 1 more than the number of parens seen in
20566 * the pattern so far. */
20567 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20568 /* note, RExC_open_parens[0] is the start of the
20569 * regex, it can't move. RExC_close_parens[0] is the end
20570 * of the regex, it *can* move. */
20571 if ( paren && RExC_open_parens[paren] >= operand ) {
20572 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20573 RExC_open_parens[paren] += size;
20575 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20577 if ( RExC_close_parens[paren] >= operand ) {
20578 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20579 RExC_close_parens[paren] += size;
20581 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20586 RExC_end_op += size;
20588 while (src > REGNODE_p(operand)) {
20589 StructCopy(--src, --dst, regnode);
20590 #ifdef RE_TRACK_PATTERN_OFFSETS
20591 if (RExC_offsets) { /* MJD 20010112 */
20593 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20597 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20598 ? "Overwriting end of array!\n" : "OK",
20599 (UV)REGNODE_OFFSET(src),
20600 (UV)REGNODE_OFFSET(dst),
20601 (UV)RExC_offsets[0]));
20602 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20603 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20608 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20609 #ifdef RE_TRACK_PATTERN_OFFSETS
20610 if (RExC_offsets) { /* MJD */
20612 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20616 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20617 ? "Overwriting end of array!\n" : "OK",
20618 (UV)REGNODE_OFFSET(place),
20619 (UV)(RExC_parse - RExC_start),
20620 (UV)RExC_offsets[0]));
20621 Set_Node_Offset(place, RExC_parse);
20622 Set_Node_Length(place, 1);
20625 src = NEXTOPER(place);
20627 FILL_NODE(operand, op);
20629 /* Zero out any arguments in the new node */
20630 Zero(src, offset, regnode);
20634 - regtail - set the next-pointer at the end of a node chain of p to val. If
20635 that value won't fit in the space available, instead returns FALSE.
20636 (Except asserts if we can't fit in the largest space the regex
20637 engine is designed for.)
20638 - SEE ALSO: regtail_study
20641 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20642 const regnode_offset p,
20643 const regnode_offset val,
20646 regnode_offset scan;
20647 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20649 PERL_ARGS_ASSERT_REGTAIL;
20651 PERL_UNUSED_ARG(depth);
20654 /* Find last node. */
20655 scan = (regnode_offset) p;
20657 regnode * const temp = regnext(REGNODE_p(scan));
20659 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20660 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20661 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
20662 SvPV_nolen_const(RExC_mysv), scan,
20663 (temp == NULL ? "->" : ""),
20664 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20669 scan = REGNODE_OFFSET(temp);
20672 assert(val >= scan);
20673 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20674 assert((UV) (val - scan) <= U32_MAX);
20675 ARG_SET(REGNODE_p(scan), val - scan);
20678 if (val - scan > U16_MAX) {
20679 /* Populate this with something that won't loop and will likely
20680 * lead to a crash if the caller ignores the failure return, and
20681 * execution continues */
20682 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20685 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20693 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20694 - Look for optimizable sequences at the same time.
20695 - currently only looks for EXACT chains.
20697 This is experimental code. The idea is to use this routine to perform
20698 in place optimizations on branches and groups as they are constructed,
20699 with the long term intention of removing optimization from study_chunk so
20700 that it is purely analytical.
20702 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20703 to control which is which.
20705 This used to return a value that was ignored. It was a problem that it is
20706 #ifdef'd to be another function that didn't return a value. khw has changed it
20707 so both currently return a pass/fail return.
20710 /* TODO: All four parms should be const */
20713 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20714 const regnode_offset val, U32 depth)
20716 regnode_offset scan;
20718 #ifdef EXPERIMENTAL_INPLACESCAN
20721 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20723 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20726 /* Find last node. */
20730 regnode * const temp = regnext(REGNODE_p(scan));
20731 #ifdef EXPERIMENTAL_INPLACESCAN
20732 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20733 bool unfolded_multi_char; /* Unexamined in this routine */
20734 if (join_exact(pRExC_state, scan, &min,
20735 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20736 return TRUE; /* Was return EXACT */
20740 switch (OP(REGNODE_p(scan))) {
20747 case EXACTFU_S_EDGE:
20748 case EXACTFAA_NO_TRIE:
20755 if( exact == PSEUDO )
20756 exact= OP(REGNODE_p(scan));
20757 else if ( exact != OP(REGNODE_p(scan)) )
20766 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20767 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20768 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
20769 SvPV_nolen_const(RExC_mysv),
20771 PL_reg_name[exact]);
20775 scan = REGNODE_OFFSET(temp);
20778 DEBUG_PARSE_MSG("");
20779 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20780 Perl_re_printf( aTHX_
20781 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20782 SvPV_nolen_const(RExC_mysv),
20787 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20788 assert((UV) (val - scan) <= U32_MAX);
20789 ARG_SET(REGNODE_p(scan), val - scan);
20792 if (val - scan > U16_MAX) {
20793 /* Populate this with something that won't loop and will likely
20794 * lead to a crash if the caller ignores the failure return, and
20795 * execution continues */
20796 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20799 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20802 return TRUE; /* Was 'return exact' */
20807 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20809 /* Returns an inversion list of all the code points matched by the
20810 * ANYOFM/NANYOFM node 'n' */
20812 SV * cp_list = _new_invlist(-1);
20813 const U8 lowest = (U8) ARG(n);
20816 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20818 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20820 /* Starting with the lowest code point, any code point that ANDed with the
20821 * mask yields the lowest code point is in the set */
20822 for (i = lowest; i <= 0xFF; i++) {
20823 if ((i & FLAGS(n)) == ARG(n)) {
20824 cp_list = add_cp_to_invlist(cp_list, i);
20827 /* We know how many code points (a power of two) that are in the
20828 * set. No use looking once we've got that number */
20829 if (count >= needed) break;
20833 if (OP(n) == NANYOFM) {
20834 _invlist_invert(cp_list);
20840 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20845 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20850 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20852 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20853 if (flags & (1<<bit)) {
20854 if (!set++ && lead)
20855 Perl_re_printf( aTHX_ "%s", lead);
20856 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20861 Perl_re_printf( aTHX_ "\n");
20863 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20868 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20874 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20876 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20877 if (flags & (1<<bit)) {
20878 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20881 if (!set++ && lead)
20882 Perl_re_printf( aTHX_ "%s", lead);
20883 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20886 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20887 if (!set++ && lead) {
20888 Perl_re_printf( aTHX_ "%s", lead);
20891 case REGEX_UNICODE_CHARSET:
20892 Perl_re_printf( aTHX_ "UNICODE");
20894 case REGEX_LOCALE_CHARSET:
20895 Perl_re_printf( aTHX_ "LOCALE");
20897 case REGEX_ASCII_RESTRICTED_CHARSET:
20898 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20900 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20901 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20904 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20910 Perl_re_printf( aTHX_ "\n");
20912 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20918 Perl_regdump(pTHX_ const regexp *r)
20922 SV * const sv = sv_newmortal();
20923 SV *dsv= sv_newmortal();
20924 RXi_GET_DECL(r, ri);
20925 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20927 PERL_ARGS_ASSERT_REGDUMP;
20929 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20931 /* Header fields of interest. */
20932 for (i = 0; i < 2; i++) {
20933 if (r->substrs->data[i].substr) {
20934 RE_PV_QUOTED_DECL(s, 0, dsv,
20935 SvPVX_const(r->substrs->data[i].substr),
20936 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20937 PL_dump_re_max_len);
20938 Perl_re_printf( aTHX_
20939 "%s %s%s at %" IVdf "..%" UVuf " ",
20940 i ? "floating" : "anchored",
20942 RE_SV_TAIL(r->substrs->data[i].substr),
20943 (IV)r->substrs->data[i].min_offset,
20944 (UV)r->substrs->data[i].max_offset);
20946 else if (r->substrs->data[i].utf8_substr) {
20947 RE_PV_QUOTED_DECL(s, 1, dsv,
20948 SvPVX_const(r->substrs->data[i].utf8_substr),
20949 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20951 Perl_re_printf( aTHX_
20952 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20953 i ? "floating" : "anchored",
20955 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20956 (IV)r->substrs->data[i].min_offset,
20957 (UV)r->substrs->data[i].max_offset);
20961 if (r->check_substr || r->check_utf8)
20962 Perl_re_printf( aTHX_
20964 ( r->check_substr == r->substrs->data[1].substr
20965 && r->check_utf8 == r->substrs->data[1].utf8_substr
20966 ? "(checking floating" : "(checking anchored"));
20967 if (r->intflags & PREGf_NOSCAN)
20968 Perl_re_printf( aTHX_ " noscan");
20969 if (r->extflags & RXf_CHECK_ALL)
20970 Perl_re_printf( aTHX_ " isall");
20971 if (r->check_substr || r->check_utf8)
20972 Perl_re_printf( aTHX_ ") ");
20974 if (ri->regstclass) {
20975 regprop(r, sv, ri->regstclass, NULL, NULL);
20976 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20978 if (r->intflags & PREGf_ANCH) {
20979 Perl_re_printf( aTHX_ "anchored");
20980 if (r->intflags & PREGf_ANCH_MBOL)
20981 Perl_re_printf( aTHX_ "(MBOL)");
20982 if (r->intflags & PREGf_ANCH_SBOL)
20983 Perl_re_printf( aTHX_ "(SBOL)");
20984 if (r->intflags & PREGf_ANCH_GPOS)
20985 Perl_re_printf( aTHX_ "(GPOS)");
20986 Perl_re_printf( aTHX_ " ");
20988 if (r->intflags & PREGf_GPOS_SEEN)
20989 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
20990 if (r->intflags & PREGf_SKIP)
20991 Perl_re_printf( aTHX_ "plus ");
20992 if (r->intflags & PREGf_IMPLICIT)
20993 Perl_re_printf( aTHX_ "implicit ");
20994 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
20995 if (r->extflags & RXf_EVAL_SEEN)
20996 Perl_re_printf( aTHX_ "with eval ");
20997 Perl_re_printf( aTHX_ "\n");
20999 regdump_extflags("r->extflags: ", r->extflags);
21000 regdump_intflags("r->intflags: ", r->intflags);
21003 PERL_ARGS_ASSERT_REGDUMP;
21004 PERL_UNUSED_CONTEXT;
21005 PERL_UNUSED_ARG(r);
21006 #endif /* DEBUGGING */
21009 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21012 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
21013 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
21014 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
21015 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
21016 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
21017 || _CC_VERTSPACE != 15
21018 # error Need to adjust order of anyofs[]
21020 static const char * const anyofs[] = {
21057 - regprop - printable representation of opcode, with run time support
21061 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21066 RXi_GET_DECL(prog, progi);
21067 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21069 PERL_ARGS_ASSERT_REGPROP;
21073 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
21074 if (pRExC_state) { /* This gives more info, if we have it */
21075 FAIL3("panic: corrupted regexp opcode %d > %d",
21076 (int)OP(o), (int)REGNODE_MAX);
21079 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21080 (int)OP(o), (int)REGNODE_MAX);
21083 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21085 k = PL_regkind[OP(o)];
21088 sv_catpvs(sv, " ");
21089 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21090 * is a crude hack but it may be the best for now since
21091 * we have no flag "this EXACTish node was UTF-8"
21093 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21094 PL_colors[0], PL_colors[1],
21095 PERL_PV_ESCAPE_UNI_DETECT |
21096 PERL_PV_ESCAPE_NONASCII |
21097 PERL_PV_PRETTY_ELLIPSES |
21098 PERL_PV_PRETTY_LTGT |
21099 PERL_PV_PRETTY_NOCLEAR
21101 } else if (k == TRIE) {
21102 /* print the details of the trie in dumpuntil instead, as
21103 * progi->data isn't available here */
21104 const char op = OP(o);
21105 const U32 n = ARG(o);
21106 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21107 (reg_ac_data *)progi->data->data[n] :
21109 const reg_trie_data * const trie
21110 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21112 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21113 DEBUG_TRIE_COMPILE_r({
21115 sv_catpvs(sv, "(JUMP)");
21116 Perl_sv_catpvf(aTHX_ sv,
21117 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21118 (UV)trie->startstate,
21119 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21120 (UV)trie->wordcount,
21123 (UV)TRIE_CHARCOUNT(trie),
21124 (UV)trie->uniquecharcount
21127 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21128 sv_catpvs(sv, "[");
21129 (void) put_charclass_bitmap_innards(sv,
21130 ((IS_ANYOF_TRIE(op))
21132 : TRIE_BITMAP(trie)),
21139 sv_catpvs(sv, "]");
21141 } else if (k == CURLY) {
21142 U32 lo = ARG1(o), hi = ARG2(o);
21143 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21144 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21145 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21146 if (hi == REG_INFTY)
21147 sv_catpvs(sv, "INFTY");
21149 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21150 sv_catpvs(sv, "}");
21152 else if (k == WHILEM && o->flags) /* Ordinal/of */
21153 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21154 else if (k == REF || k == OPEN || k == CLOSE
21155 || k == GROUPP || OP(o)==ACCEPT)
21157 AV *name_list= NULL;
21158 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21159 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21160 if ( RXp_PAREN_NAMES(prog) ) {
21161 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21162 } else if ( pRExC_state ) {
21163 name_list= RExC_paren_name_list;
21166 if ( k != REF || (OP(o) < REFN)) {
21167 SV **name= av_fetch(name_list, parno, 0 );
21169 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21172 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21173 I32 *nums=(I32*)SvPVX(sv_dat);
21174 SV **name= av_fetch(name_list, nums[0], 0 );
21177 for ( n=0; n<SvIVX(sv_dat); n++ ) {
21178 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21179 (n ? "," : ""), (IV)nums[n]);
21181 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21185 if ( k == REF && reginfo) {
21186 U32 n = ARG(o); /* which paren pair */
21187 I32 ln = prog->offs[n].start;
21188 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21189 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21190 else if (ln == prog->offs[n].end)
21191 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21193 const char *s = reginfo->strbeg + ln;
21194 Perl_sv_catpvf(aTHX_ sv, ": ");
21195 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21196 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21199 } else if (k == GOSUB) {
21200 AV *name_list= NULL;
21201 if ( RXp_PAREN_NAMES(prog) ) {
21202 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21203 } else if ( pRExC_state ) {
21204 name_list= RExC_paren_name_list;
21207 /* Paren and offset */
21208 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21209 (int)((o + (int)ARG2L(o)) - progi->program) );
21211 SV **name= av_fetch(name_list, ARG(o), 0 );
21213 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21216 else if (k == LOGICAL)
21217 /* 2: embedded, otherwise 1 */
21218 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21219 else if (k == ANYOF || k == ANYOFR) {
21223 bool do_sep = FALSE; /* Do we need to separate various components of
21225 /* Set if there is still an unresolved user-defined property */
21226 SV *unresolved = NULL;
21228 /* Things that are ignored except when the runtime locale is UTF-8 */
21229 SV *only_utf8_locale_invlist = NULL;
21231 /* Code points that don't fit in the bitmap */
21232 SV *nonbitmap_invlist = NULL;
21234 /* And things that aren't in the bitmap, but are small enough to be */
21235 SV* bitmap_range_not_in_bitmap = NULL;
21239 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21245 flags = ANYOF_FLAGS(o);
21246 bitmap = ANYOF_BITMAP(o);
21250 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21251 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21252 sv_catpvs(sv, "{utf8-locale-reqd}");
21254 if (flags & ANYOFL_FOLD) {
21255 sv_catpvs(sv, "{i}");
21259 inverted = flags & ANYOF_INVERT;
21261 /* If there is stuff outside the bitmap, get it */
21262 if (arg != ANYOF_ONLY_HAS_BITMAP) {
21263 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21264 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21266 ANYOFRbase(o) + ANYOFRdelta(o));
21269 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21270 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21272 &only_utf8_locale_invlist,
21273 &nonbitmap_invlist);
21275 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21277 &only_utf8_locale_invlist,
21278 &nonbitmap_invlist);
21282 /* The non-bitmap data may contain stuff that could fit in the
21283 * bitmap. This could come from a user-defined property being
21284 * finally resolved when this call was done; or much more likely
21285 * because there are matches that require UTF-8 to be valid, and so
21286 * aren't in the bitmap (or ANYOFR). This is teased apart later */
21287 _invlist_intersection(nonbitmap_invlist,
21289 &bitmap_range_not_in_bitmap);
21290 /* Leave just the things that don't fit into the bitmap */
21291 _invlist_subtract(nonbitmap_invlist,
21293 &nonbitmap_invlist);
21296 /* Obey this flag to add all above-the-bitmap code points */
21297 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21298 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21299 NUM_ANYOF_CODE_POINTS,
21303 /* Ready to start outputting. First, the initial left bracket */
21304 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21306 /* ANYOFH by definition doesn't have anything that will fit inside the
21307 * bitmap; ANYOFR may or may not. */
21308 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21309 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21310 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21312 /* Then all the things that could fit in the bitmap */
21313 do_sep = put_charclass_bitmap_innards(sv,
21315 bitmap_range_not_in_bitmap,
21316 only_utf8_locale_invlist,
21320 /* Can't try inverting for a
21321 * better display if there
21322 * are things that haven't
21325 || inRANGE(OP(o), ANYOFR, ANYOFRb));
21326 SvREFCNT_dec(bitmap_range_not_in_bitmap);
21328 /* If there are user-defined properties which haven't been defined
21329 * yet, output them. If the result is not to be inverted, it is
21330 * clearest to output them in a separate [] from the bitmap range
21331 * stuff. If the result is to be complemented, we have to show
21332 * everything in one [], as the inversion applies to the whole
21333 * thing. Use {braces} to separate them from anything in the
21334 * bitmap and anything above the bitmap. */
21337 if (! do_sep) { /* If didn't output anything in the bitmap
21339 sv_catpvs(sv, "^");
21341 sv_catpvs(sv, "{");
21344 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21347 sv_catsv(sv, unresolved);
21349 sv_catpvs(sv, "}");
21351 do_sep = ! inverted;
21355 /* And, finally, add the above-the-bitmap stuff */
21356 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21359 /* See if truncation size is overridden */
21360 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21361 ? PL_dump_re_max_len
21364 /* This is output in a separate [] */
21366 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21369 /* And, for easy of understanding, it is shown in the
21370 * uncomplemented form if possible. The one exception being if
21371 * there are unresolved items, where the inversion has to be
21372 * delayed until runtime */
21373 if (inverted && ! unresolved) {
21374 _invlist_invert(nonbitmap_invlist);
21375 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21378 contents = invlist_contents(nonbitmap_invlist,
21379 FALSE /* output suitable for catsv */
21382 /* If the output is shorter than the permissible maximum, just do it. */
21383 if (SvCUR(contents) <= dump_len) {
21384 sv_catsv(sv, contents);
21387 const char * contents_string = SvPVX(contents);
21388 STRLEN i = dump_len;
21390 /* Otherwise, start at the permissible max and work back to the
21391 * first break possibility */
21392 while (i > 0 && contents_string[i] != ' ') {
21395 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21396 find a legal break */
21400 sv_catpvn(sv, contents_string, i);
21401 sv_catpvs(sv, "...");
21404 SvREFCNT_dec_NN(contents);
21405 SvREFCNT_dec_NN(nonbitmap_invlist);
21408 /* And finally the matching, closing ']' */
21409 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21411 if (OP(o) == ANYOFHs) {
21412 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21414 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21415 U8 lowest = (OP(o) != ANYOFHr)
21417 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21418 U8 highest = (OP(o) == ANYOFHr)
21419 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21420 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21423 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21424 if (lowest != highest) {
21425 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21427 Perl_sv_catpvf(aTHX_ sv, ")");
21430 SvREFCNT_dec(unresolved);
21432 else if (k == ANYOFM) {
21433 SV * cp_list = get_ANYOFM_contents(o);
21435 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21436 if (OP(o) == NANYOFM) {
21437 _invlist_invert(cp_list);
21440 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21441 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21443 SvREFCNT_dec(cp_list);
21445 else if (k == POSIXD || k == NPOSIXD) {
21446 U8 index = FLAGS(o) * 2;
21447 if (index < C_ARRAY_LENGTH(anyofs)) {
21448 if (*anyofs[index] != '[') {
21449 sv_catpvs(sv, "[");
21451 sv_catpv(sv, anyofs[index]);
21452 if (*anyofs[index] != '[') {
21453 sv_catpvs(sv, "]");
21457 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21460 else if (k == BOUND || k == NBOUND) {
21461 /* Must be synced with order of 'bound_type' in regcomp.h */
21462 const char * const bounds[] = {
21463 "", /* Traditional */
21469 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21470 sv_catpv(sv, bounds[FLAGS(o)]);
21472 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21473 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21475 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21477 Perl_sv_catpvf(aTHX_ sv, "]");
21479 else if (OP(o) == SBOL)
21480 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21482 /* add on the verb argument if there is one */
21483 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21485 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21486 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21488 sv_catpvs(sv, ":NULL");
21491 PERL_UNUSED_CONTEXT;
21492 PERL_UNUSED_ARG(sv);
21493 PERL_UNUSED_ARG(o);
21494 PERL_UNUSED_ARG(prog);
21495 PERL_UNUSED_ARG(reginfo);
21496 PERL_UNUSED_ARG(pRExC_state);
21497 #endif /* DEBUGGING */
21503 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21504 { /* Assume that RE_INTUIT is set */
21505 /* Returns an SV containing a string that must appear in the target for it
21506 * to match, or NULL if nothing is known that must match.
21508 * CAUTION: the SV can be freed during execution of the regex engine */
21510 struct regexp *const prog = ReANY(r);
21511 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21513 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21514 PERL_UNUSED_CONTEXT;
21518 if (prog->maxlen > 0) {
21519 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21520 ? prog->check_utf8 : prog->check_substr);
21522 if (!PL_colorset) reginitcolors();
21523 Perl_re_printf( aTHX_
21524 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21526 RX_UTF8(r) ? "utf8 " : "",
21527 PL_colors[5], PL_colors[0],
21530 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21534 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21535 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21541 handles refcounting and freeing the perl core regexp structure. When
21542 it is necessary to actually free the structure the first thing it
21543 does is call the 'free' method of the regexp_engine associated to
21544 the regexp, allowing the handling of the void *pprivate; member
21545 first. (This routine is not overridable by extensions, which is why
21546 the extensions free is called first.)
21548 See regdupe and regdupe_internal if you change anything here.
21550 #ifndef PERL_IN_XSUB_RE
21552 Perl_pregfree(pTHX_ REGEXP *r)
21558 Perl_pregfree2(pTHX_ REGEXP *rx)
21560 struct regexp *const r = ReANY(rx);
21561 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21563 PERL_ARGS_ASSERT_PREGFREE2;
21568 if (r->mother_re) {
21569 ReREFCNT_dec(r->mother_re);
21571 CALLREGFREE_PVT(rx); /* free the private data */
21572 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21576 for (i = 0; i < 2; i++) {
21577 SvREFCNT_dec(r->substrs->data[i].substr);
21578 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21580 Safefree(r->substrs);
21582 RX_MATCH_COPY_FREE(rx);
21583 #ifdef PERL_ANY_COW
21584 SvREFCNT_dec(r->saved_copy);
21587 SvREFCNT_dec(r->qr_anoncv);
21588 if (r->recurse_locinput)
21589 Safefree(r->recurse_locinput);
21595 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21596 except that dsv will be created if NULL.
21598 This function is used in two main ways. First to implement
21599 $r = qr/....; $s = $$r;
21601 Secondly, it is used as a hacky workaround to the structural issue of
21603 being stored in the regexp structure which is in turn stored in
21604 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21605 could be PL_curpm in multiple contexts, and could require multiple
21606 result sets being associated with the pattern simultaneously, such
21607 as when doing a recursive match with (??{$qr})
21609 The solution is to make a lightweight copy of the regexp structure
21610 when a qr// is returned from the code executed by (??{$qr}) this
21611 lightweight copy doesn't actually own any of its data except for
21612 the starp/end and the actual regexp structure itself.
21618 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21620 struct regexp *drx;
21621 struct regexp *const srx = ReANY(ssv);
21622 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21624 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21627 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21629 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21631 /* our only valid caller, sv_setsv_flags(), should have done
21632 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21633 assert(!SvOOK(dsv));
21634 assert(!SvIsCOW(dsv));
21635 assert(!SvROK(dsv));
21637 if (SvPVX_const(dsv)) {
21639 Safefree(SvPVX(dsv));
21644 SvOK_off((SV *)dsv);
21647 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21648 * the LV's xpvlenu_rx will point to a regexp body, which
21649 * we allocate here */
21650 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21651 assert(!SvPVX(dsv));
21652 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21653 temp->sv_any = NULL;
21654 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21655 SvREFCNT_dec_NN(temp);
21656 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21657 ing below will not set it. */
21658 SvCUR_set(dsv, SvCUR(ssv));
21661 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21662 sv_force_normal(sv) is called. */
21666 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21667 SvPV_set(dsv, RX_WRAPPED(ssv));
21668 /* We share the same string buffer as the original regexp, on which we
21669 hold a reference count, incremented when mother_re is set below.
21670 The string pointer is copied here, being part of the regexp struct.
21672 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21673 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21677 const I32 npar = srx->nparens+1;
21678 Newx(drx->offs, npar, regexp_paren_pair);
21679 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21681 if (srx->substrs) {
21683 Newx(drx->substrs, 1, struct reg_substr_data);
21684 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21686 for (i = 0; i < 2; i++) {
21687 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21688 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21691 /* check_substr and check_utf8, if non-NULL, point to either their
21692 anchored or float namesakes, and don't hold a second reference. */
21694 RX_MATCH_COPIED_off(dsv);
21695 #ifdef PERL_ANY_COW
21696 drx->saved_copy = NULL;
21698 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21699 SvREFCNT_inc_void(drx->qr_anoncv);
21700 if (srx->recurse_locinput)
21701 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21708 /* regfree_internal()
21710 Free the private data in a regexp. This is overloadable by
21711 extensions. Perl takes care of the regexp structure in pregfree(),
21712 this covers the *pprivate pointer which technically perl doesn't
21713 know about, however of course we have to handle the
21714 regexp_internal structure when no extension is in use.
21716 Note this is called before freeing anything in the regexp
21721 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21723 struct regexp *const r = ReANY(rx);
21724 RXi_GET_DECL(r, ri);
21725 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21727 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21737 SV *dsv= sv_newmortal();
21738 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21739 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21740 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21741 PL_colors[4], PL_colors[5], s);
21745 #ifdef RE_TRACK_PATTERN_OFFSETS
21747 Safefree(ri->u.offsets); /* 20010421 MJD */
21749 if (ri->code_blocks)
21750 S_free_codeblocks(aTHX_ ri->code_blocks);
21753 int n = ri->data->count;
21756 /* If you add a ->what type here, update the comment in regcomp.h */
21757 switch (ri->data->what[n]) {
21763 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21766 Safefree(ri->data->data[n]);
21772 { /* Aho Corasick add-on structure for a trie node.
21773 Used in stclass optimization only */
21775 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21776 #ifdef USE_ITHREADS
21780 refcount = --aho->refcount;
21783 PerlMemShared_free(aho->states);
21784 PerlMemShared_free(aho->fail);
21785 /* do this last!!!! */
21786 PerlMemShared_free(ri->data->data[n]);
21787 /* we should only ever get called once, so
21788 * assert as much, and also guard the free
21789 * which /might/ happen twice. At the least
21790 * it will make code anlyzers happy and it
21791 * doesn't cost much. - Yves */
21792 assert(ri->regstclass);
21793 if (ri->regstclass) {
21794 PerlMemShared_free(ri->regstclass);
21795 ri->regstclass = 0;
21802 /* trie structure. */
21804 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21805 #ifdef USE_ITHREADS
21809 refcount = --trie->refcount;
21812 PerlMemShared_free(trie->charmap);
21813 PerlMemShared_free(trie->states);
21814 PerlMemShared_free(trie->trans);
21816 PerlMemShared_free(trie->bitmap);
21818 PerlMemShared_free(trie->jump);
21819 PerlMemShared_free(trie->wordinfo);
21820 /* do this last!!!! */
21821 PerlMemShared_free(ri->data->data[n]);
21826 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21827 ri->data->what[n]);
21830 Safefree(ri->data->what);
21831 Safefree(ri->data);
21837 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21838 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21839 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21842 re_dup_guts - duplicate a regexp.
21844 This routine is expected to clone a given regexp structure. It is only
21845 compiled under USE_ITHREADS.
21847 After all of the core data stored in struct regexp is duplicated
21848 the regexp_engine.dupe method is used to copy any private data
21849 stored in the *pprivate pointer. This allows extensions to handle
21850 any duplication it needs to do.
21852 See pregfree() and regfree_internal() if you change anything here.
21854 #if defined(USE_ITHREADS)
21855 #ifndef PERL_IN_XSUB_RE
21857 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21861 const struct regexp *r = ReANY(sstr);
21862 struct regexp *ret = ReANY(dstr);
21864 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21866 npar = r->nparens+1;
21867 Newx(ret->offs, npar, regexp_paren_pair);
21868 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21870 if (ret->substrs) {
21871 /* Do it this way to avoid reading from *r after the StructCopy().
21872 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21873 cache, it doesn't matter. */
21875 const bool anchored = r->check_substr
21876 ? r->check_substr == r->substrs->data[0].substr
21877 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21878 Newx(ret->substrs, 1, struct reg_substr_data);
21879 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21881 for (i = 0; i < 2; i++) {
21882 ret->substrs->data[i].substr =
21883 sv_dup_inc(ret->substrs->data[i].substr, param);
21884 ret->substrs->data[i].utf8_substr =
21885 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21888 /* check_substr and check_utf8, if non-NULL, point to either their
21889 anchored or float namesakes, and don't hold a second reference. */
21891 if (ret->check_substr) {
21893 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21895 ret->check_substr = ret->substrs->data[0].substr;
21896 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21898 assert(r->check_substr == r->substrs->data[1].substr);
21899 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21901 ret->check_substr = ret->substrs->data[1].substr;
21902 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21904 } else if (ret->check_utf8) {
21906 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21908 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21913 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21914 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21915 if (r->recurse_locinput)
21916 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21919 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21921 if (RX_MATCH_COPIED(dstr))
21922 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21924 ret->subbeg = NULL;
21925 #ifdef PERL_ANY_COW
21926 ret->saved_copy = NULL;
21929 /* Whether mother_re be set or no, we need to copy the string. We
21930 cannot refrain from copying it when the storage points directly to
21931 our mother regexp, because that's
21932 1: a buffer in a different thread
21933 2: something we no longer hold a reference on
21934 so we need to copy it locally. */
21935 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21936 /* set malloced length to a non-zero value so it will be freed
21937 * (otherwise in combination with SVf_FAKE it looks like an alien
21938 * buffer). It doesn't have to be the actual malloced size, since it
21939 * should never be grown */
21940 SvLEN_set(dstr, SvCUR(sstr)+1);
21941 ret->mother_re = NULL;
21943 #endif /* PERL_IN_XSUB_RE */
21948 This is the internal complement to regdupe() which is used to copy
21949 the structure pointed to by the *pprivate pointer in the regexp.
21950 This is the core version of the extension overridable cloning hook.
21951 The regexp structure being duplicated will be copied by perl prior
21952 to this and will be provided as the regexp *r argument, however
21953 with the /old/ structures pprivate pointer value. Thus this routine
21954 may override any copying normally done by perl.
21956 It returns a pointer to the new regexp_internal structure.
21960 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21963 struct regexp *const r = ReANY(rx);
21964 regexp_internal *reti;
21966 RXi_GET_DECL(r, ri);
21968 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21972 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21973 char, regexp_internal);
21974 Copy(ri->program, reti->program, len+1, regnode);
21977 if (ri->code_blocks) {
21979 Newx(reti->code_blocks, 1, struct reg_code_blocks);
21980 Newx(reti->code_blocks->cb, ri->code_blocks->count,
21981 struct reg_code_block);
21982 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21983 ri->code_blocks->count, struct reg_code_block);
21984 for (n = 0; n < ri->code_blocks->count; n++)
21985 reti->code_blocks->cb[n].src_regex = (REGEXP*)
21986 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21987 reti->code_blocks->count = ri->code_blocks->count;
21988 reti->code_blocks->refcnt = 1;
21991 reti->code_blocks = NULL;
21993 reti->regstclass = NULL;
21996 struct reg_data *d;
21997 const int count = ri->data->count;
22000 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22001 char, struct reg_data);
22002 Newx(d->what, count, U8);
22005 for (i = 0; i < count; i++) {
22006 d->what[i] = ri->data->what[i];
22007 switch (d->what[i]) {
22008 /* see also regcomp.h and regfree_internal() */
22009 case 'a': /* actually an AV, but the dup function is identical.
22010 values seem to be "plain sv's" generally. */
22011 case 'r': /* a compiled regex (but still just another SV) */
22012 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22013 this use case should go away, the code could have used
22014 'a' instead - see S_set_ANYOF_arg() for array contents. */
22015 case 'S': /* actually an SV, but the dup function is identical. */
22016 case 'u': /* actually an HV, but the dup function is identical.
22017 values are "plain sv's" */
22018 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22021 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22022 * patterns which could start with several different things. Pre-TRIE
22023 * this was more important than it is now, however this still helps
22024 * in some places, for instance /x?a+/ might produce a SSC equivalent
22025 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22028 /* This is cheating. */
22029 Newx(d->data[i], 1, regnode_ssc);
22030 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22031 reti->regstclass = (regnode*)d->data[i];
22034 /* AHO-CORASICK fail table */
22035 /* Trie stclasses are readonly and can thus be shared
22036 * without duplication. We free the stclass in pregfree
22037 * when the corresponding reg_ac_data struct is freed.
22039 reti->regstclass= ri->regstclass;
22042 /* TRIE transition table */
22044 ((reg_trie_data*)ri->data->data[i])->refcount++;
22047 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22048 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22049 is not from another regexp */
22050 d->data[i] = ri->data->data[i];
22053 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22054 ri->data->what[i]);
22063 reti->name_list_idx = ri->name_list_idx;
22065 #ifdef RE_TRACK_PATTERN_OFFSETS
22066 if (ri->u.offsets) {
22067 Newx(reti->u.offsets, 2*len+1, U32);
22068 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22071 SetProgLen(reti, len);
22074 return (void*)reti;
22077 #endif /* USE_ITHREADS */
22079 #ifndef PERL_IN_XSUB_RE
22082 - regnext - dig the "next" pointer out of a node
22085 Perl_regnext(pTHX_ regnode *p)
22092 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
22093 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22094 (int)OP(p), (int)REGNODE_MAX);
22097 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22107 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22110 STRLEN len = strlen(pat);
22113 const char *message;
22115 PERL_ARGS_ASSERT_RE_CROAK;
22119 Copy(pat, buf, len , char);
22121 buf[len + 1] = '\0';
22122 va_start(args, pat);
22123 msv = vmess(buf, &args);
22125 message = SvPV_const(msv, len);
22128 Copy(message, buf, len , char);
22129 /* len-1 to avoid \n */
22130 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22133 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22135 #ifndef PERL_IN_XSUB_RE
22137 Perl_save_re_context(pTHX)
22142 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22145 const REGEXP * const rx = PM_GETRE(PL_curpm);
22147 nparens = RX_NPARENS(rx);
22150 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22151 * that PL_curpm will be null, but that utf8.pm and the modules it
22152 * loads will only use $1..$3.
22153 * The t/porting/re_context.t test file checks this assumption.
22158 for (i = 1; i <= nparens; i++) {
22159 char digits[TYPE_CHARS(long)];
22160 const STRLEN len = my_snprintf(digits, sizeof(digits),
22162 GV *const *const gvp
22163 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22166 GV * const gv = *gvp;
22167 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22177 S_put_code_point(pTHX_ SV *sv, UV c)
22179 PERL_ARGS_ASSERT_PUT_CODE_POINT;
22182 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22184 else if (isPRINT(c)) {
22185 const char string = (char) c;
22187 /* We use {phrase} as metanotation in the class, so also escape literal
22189 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22190 sv_catpvs(sv, "\\");
22191 sv_catpvn(sv, &string, 1);
22193 else if (isMNEMONIC_CNTRL(c)) {
22194 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22197 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22201 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22204 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22206 /* Appends to 'sv' a displayable version of the range of code points from
22207 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
22208 * that have them, when they occur at the beginning or end of the range.
22209 * It uses hex to output the remaining code points, unless 'allow_literals'
22210 * is true, in which case the printable ASCII ones are output as-is (though
22211 * some of these will be escaped by put_code_point()).
22213 * NOTE: This is designed only for printing ranges of code points that fit
22214 * inside an ANYOF bitmap. Higher code points are simply suppressed
22217 const unsigned int min_range_count = 3;
22219 assert(start <= end);
22221 PERL_ARGS_ASSERT_PUT_RANGE;
22223 while (start <= end) {
22225 const char * format;
22227 if (end - start < min_range_count) {
22229 /* Output chars individually when they occur in short ranges */
22230 for (; start <= end; start++) {
22231 put_code_point(sv, start);
22236 /* If permitted by the input options, and there is a possibility that
22237 * this range contains a printable literal, look to see if there is
22239 if (allow_literals && start <= MAX_PRINT_A) {
22241 /* If the character at the beginning of the range isn't an ASCII
22242 * printable, effectively split the range into two parts:
22243 * 1) the portion before the first such printable,
22245 * and output them separately. */
22246 if (! isPRINT_A(start)) {
22247 UV temp_end = start + 1;
22249 /* There is no point looking beyond the final possible
22250 * printable, in MAX_PRINT_A */
22251 UV max = MIN(end, MAX_PRINT_A);
22253 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22257 /* Here, temp_end points to one beyond the first printable if
22258 * found, or to one beyond 'max' if not. If none found, make
22259 * sure that we use the entire range */
22260 if (temp_end > MAX_PRINT_A) {
22261 temp_end = end + 1;
22264 /* Output the first part of the split range: the part that
22265 * doesn't have printables, with the parameter set to not look
22266 * for literals (otherwise we would infinitely recurse) */
22267 put_range(sv, start, temp_end - 1, FALSE);
22269 /* The 2nd part of the range (if any) starts here. */
22272 /* We do a continue, instead of dropping down, because even if
22273 * the 2nd part is non-empty, it could be so short that we want
22274 * to output it as individual characters, as tested for at the
22275 * top of this loop. */
22279 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
22280 * output a sub-range of just the digits or letters, then process
22281 * the remaining portion as usual. */
22282 if (isALPHANUMERIC_A(start)) {
22283 UV mask = (isDIGIT_A(start))
22288 UV temp_end = start + 1;
22290 /* Find the end of the sub-range that includes just the
22291 * characters in the same class as the first character in it */
22292 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22297 /* For short ranges, don't duplicate the code above to output
22298 * them; just call recursively */
22299 if (temp_end - start < min_range_count) {
22300 put_range(sv, start, temp_end, FALSE);
22302 else { /* Output as a range */
22303 put_code_point(sv, start);
22304 sv_catpvs(sv, "-");
22305 put_code_point(sv, temp_end);
22307 start = temp_end + 1;
22311 /* We output any other printables as individual characters */
22312 if (isPUNCT_A(start) || isSPACE_A(start)) {
22313 while (start <= end && (isPUNCT_A(start)
22314 || isSPACE_A(start)))
22316 put_code_point(sv, start);
22321 } /* End of looking for literals */
22323 /* Here is not to output as a literal. Some control characters have
22324 * mnemonic names. Split off any of those at the beginning and end of
22325 * the range to print mnemonically. It isn't possible for many of
22326 * these to be in a row, so this won't overwhelm with output */
22328 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22330 while (isMNEMONIC_CNTRL(start) && start <= end) {
22331 put_code_point(sv, start);
22335 /* If this didn't take care of the whole range ... */
22336 if (start <= end) {
22338 /* Look backwards from the end to find the final non-mnemonic
22341 while (isMNEMONIC_CNTRL(temp_end)) {
22345 /* And separately output the interior range that doesn't start
22346 * or end with mnemonics */
22347 put_range(sv, start, temp_end, FALSE);
22349 /* Then output the mnemonic trailing controls */
22350 start = temp_end + 1;
22351 while (start <= end) {
22352 put_code_point(sv, start);
22359 /* As a final resort, output the range or subrange as hex. */
22361 if (start >= NUM_ANYOF_CODE_POINTS) {
22364 else { /* Have to split range at the bitmap boundary */
22365 this_end = (end < NUM_ANYOF_CODE_POINTS)
22367 : NUM_ANYOF_CODE_POINTS - 1;
22369 #if NUM_ANYOF_CODE_POINTS > 256
22370 format = (this_end < 256)
22371 ? "\\x%02" UVXf "-\\x%02" UVXf
22372 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22374 format = "\\x%02" UVXf "-\\x%02" UVXf;
22376 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22377 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22378 GCC_DIAG_RESTORE_STMT;
22384 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22386 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22390 bool allow_literals = TRUE;
22392 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22394 /* Generally, it is more readable if printable characters are output as
22395 * literals, but if a range (nearly) spans all of them, it's best to output
22396 * it as a single range. This code will use a single range if all but 2
22397 * ASCII printables are in it */
22398 invlist_iterinit(invlist);
22399 while (invlist_iternext(invlist, &start, &end)) {
22401 /* If the range starts beyond the final printable, it doesn't have any
22403 if (start > MAX_PRINT_A) {
22407 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22408 * all but two, the range must start and end no later than 2 from
22410 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22411 if (end > MAX_PRINT_A) {
22417 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22418 allow_literals = FALSE;
22423 invlist_iterfinish(invlist);
22425 /* Here we have figured things out. Output each range */
22426 invlist_iterinit(invlist);
22427 while (invlist_iternext(invlist, &start, &end)) {
22428 if (start >= NUM_ANYOF_CODE_POINTS) {
22431 put_range(sv, start, end, allow_literals);
22433 invlist_iterfinish(invlist);
22439 S_put_charclass_bitmap_innards_common(pTHX_
22440 SV* invlist, /* The bitmap */
22441 SV* posixes, /* Under /l, things like [:word:], \S */
22442 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22443 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22444 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22445 const bool invert /* Is the result to be inverted? */
22448 /* Create and return an SV containing a displayable version of the bitmap
22449 * and associated information determined by the input parameters. If the
22450 * output would have been only the inversion indicator '^', NULL is instead
22456 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22459 output = newSVpvs("^");
22462 output = newSVpvs("");
22465 /* First, the code points in the bitmap that are unconditionally there */
22466 put_charclass_bitmap_innards_invlist(output, invlist);
22468 /* Traditionally, these have been placed after the main code points */
22470 sv_catsv(output, posixes);
22473 if (only_utf8 && _invlist_len(only_utf8)) {
22474 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22475 put_charclass_bitmap_innards_invlist(output, only_utf8);
22478 if (not_utf8 && _invlist_len(not_utf8)) {
22479 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22480 put_charclass_bitmap_innards_invlist(output, not_utf8);
22483 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22484 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22485 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22487 /* This is the only list in this routine that can legally contain code
22488 * points outside the bitmap range. The call just above to
22489 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22490 * output them here. There's about a half-dozen possible, and none in
22491 * contiguous ranges longer than 2 */
22492 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22494 SV* above_bitmap = NULL;
22496 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22498 invlist_iterinit(above_bitmap);
22499 while (invlist_iternext(above_bitmap, &start, &end)) {
22502 for (i = start; i <= end; i++) {
22503 put_code_point(output, i);
22506 invlist_iterfinish(above_bitmap);
22507 SvREFCNT_dec_NN(above_bitmap);
22511 if (invert && SvCUR(output) == 1) {
22519 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22521 SV *nonbitmap_invlist,
22522 SV *only_utf8_locale_invlist,
22523 const regnode * const node,
22525 const bool force_as_is_display)
22527 /* Appends to 'sv' a displayable version of the innards of the bracketed
22528 * character class defined by the other arguments:
22529 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22530 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22531 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22532 * none. The reasons for this could be that they require some
22533 * condition such as the target string being or not being in UTF-8
22534 * (under /d), or because they came from a user-defined property that
22535 * was not resolved at the time of the regex compilation (under /u)
22536 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22537 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22538 * 'node' is the regex pattern ANYOF node. It is needed only when the
22539 * above two parameters are not null, and is passed so that this
22540 * routine can tease apart the various reasons for them.
22541 * 'flags' is the flags field of 'node'
22542 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22543 * to invert things to see if that leads to a cleaner display. If
22544 * FALSE, this routine is free to use its judgment about doing this.
22546 * It returns TRUE if there was actually something output. (It may be that
22547 * the bitmap, etc is empty.)
22549 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22550 * bitmap, with the succeeding parameters set to NULL, and the final one to
22554 /* In general, it tries to display the 'cleanest' representation of the
22555 * innards, choosing whether to display them inverted or not, regardless of
22556 * whether the class itself is to be inverted. However, there are some
22557 * cases where it can't try inverting, as what actually matches isn't known
22558 * until runtime, and hence the inversion isn't either. */
22561 bool inverting_allowed = ! force_as_is_display;
22564 STRLEN orig_sv_cur = SvCUR(sv);
22566 SV* invlist; /* Inversion list we accumulate of code points that
22567 are unconditionally matched */
22568 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22570 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22572 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22573 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22576 SV* as_is_display; /* The output string when we take the inputs
22578 SV* inverted_display; /* The output string when we invert the inputs */
22580 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22582 /* We are biased in favor of displaying things without them being inverted,
22583 * as that is generally easier to understand */
22584 const int bias = 5;
22586 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22588 /* Start off with whatever code points are passed in. (We clone, so we
22589 * don't change the caller's list) */
22590 if (nonbitmap_invlist) {
22591 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22592 invlist = invlist_clone(nonbitmap_invlist, NULL);
22594 else { /* Worst case size is every other code point is matched */
22595 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22599 if (OP(node) == ANYOFD) {
22601 /* This flag indicates that the code points below 0x100 in the
22602 * nonbitmap list are precisely the ones that match only when the
22603 * target is UTF-8 (they should all be non-ASCII). */
22604 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22606 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22607 _invlist_subtract(invlist, only_utf8, &invlist);
22610 /* And this flag for matching all non-ASCII 0xFF and below */
22611 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22613 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22616 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22618 /* If either of these flags are set, what matches isn't
22619 * determinable except during execution, so don't know enough here
22621 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22622 inverting_allowed = FALSE;
22625 /* What the posix classes match also varies at runtime, so these
22626 * will be output symbolically. */
22627 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22630 posixes = newSVpvs("");
22631 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22632 if (ANYOF_POSIXL_TEST(node, i)) {
22633 sv_catpv(posixes, anyofs[i]);
22640 /* Accumulate the bit map into the unconditional match list */
22642 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22643 if (BITMAP_TEST(bitmap, i)) {
22646 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22649 invlist = _add_range_to_invlist(invlist, start, i-1);
22654 /* Make sure that the conditional match lists don't have anything in them
22655 * that match unconditionally; otherwise the output is quite confusing.
22656 * This could happen if the code that populates these misses some
22659 _invlist_subtract(only_utf8, invlist, &only_utf8);
22662 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22665 if (only_utf8_locale_invlist) {
22667 /* Since this list is passed in, we have to make a copy before
22669 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22671 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22673 /* And, it can get really weird for us to try outputting an inverted
22674 * form of this list when it has things above the bitmap, so don't even
22676 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22677 inverting_allowed = FALSE;
22681 /* Calculate what the output would be if we take the input as-is */
22682 as_is_display = put_charclass_bitmap_innards_common(invlist,
22689 /* If have to take the output as-is, just do that */
22690 if (! inverting_allowed) {
22691 if (as_is_display) {
22692 sv_catsv(sv, as_is_display);
22693 SvREFCNT_dec_NN(as_is_display);
22696 else { /* But otherwise, create the output again on the inverted input, and
22697 use whichever version is shorter */
22699 int inverted_bias, as_is_bias;
22701 /* We will apply our bias to whichever of the the results doesn't have
22711 inverted_bias = bias;
22714 /* Now invert each of the lists that contribute to the output,
22715 * excluding from the result things outside the possible range */
22717 /* For the unconditional inversion list, we have to add in all the
22718 * conditional code points, so that when inverted, they will be gone
22720 _invlist_union(only_utf8, invlist, &invlist);
22721 _invlist_union(not_utf8, invlist, &invlist);
22722 _invlist_union(only_utf8_locale, invlist, &invlist);
22723 _invlist_invert(invlist);
22724 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22727 _invlist_invert(only_utf8);
22728 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22730 else if (not_utf8) {
22732 /* If a code point matches iff the target string is not in UTF-8,
22733 * then complementing the result has it not match iff not in UTF-8,
22734 * which is the same thing as matching iff it is UTF-8. */
22735 only_utf8 = not_utf8;
22739 if (only_utf8_locale) {
22740 _invlist_invert(only_utf8_locale);
22741 _invlist_intersection(only_utf8_locale,
22743 &only_utf8_locale);
22746 inverted_display = put_charclass_bitmap_innards_common(
22751 only_utf8_locale, invert);
22753 /* Use the shortest representation, taking into account our bias
22754 * against showing it inverted */
22755 if ( inverted_display
22756 && ( ! as_is_display
22757 || ( SvCUR(inverted_display) + inverted_bias
22758 < SvCUR(as_is_display) + as_is_bias)))
22760 sv_catsv(sv, inverted_display);
22762 else if (as_is_display) {
22763 sv_catsv(sv, as_is_display);
22766 SvREFCNT_dec(as_is_display);
22767 SvREFCNT_dec(inverted_display);
22770 SvREFCNT_dec_NN(invlist);
22771 SvREFCNT_dec(only_utf8);
22772 SvREFCNT_dec(not_utf8);
22773 SvREFCNT_dec(posixes);
22774 SvREFCNT_dec(only_utf8_locale);
22776 return SvCUR(sv) > orig_sv_cur;
22779 #define CLEAR_OPTSTART \
22780 if (optstart) STMT_START { \
22781 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22782 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22786 #define DUMPUNTIL(b,e) \
22788 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22790 STATIC const regnode *
22791 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22792 const regnode *last, const regnode *plast,
22793 SV* sv, I32 indent, U32 depth)
22795 U8 op = PSEUDO; /* Arbitrary non-END op. */
22796 const regnode *next;
22797 const regnode *optstart= NULL;
22799 RXi_GET_DECL(r, ri);
22800 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22802 PERL_ARGS_ASSERT_DUMPUNTIL;
22804 #ifdef DEBUG_DUMPUNTIL
22805 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22806 last ? last-start : 0, plast ? plast-start : 0);
22809 if (plast && plast < last)
22812 while (PL_regkind[op] != END && (!last || node < last)) {
22814 /* While that wasn't END last time... */
22817 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22819 next = regnext((regnode *)node);
22822 if (OP(node) == OPTIMIZED) {
22823 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22830 regprop(r, sv, node, NULL, NULL);
22831 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22832 (int)(2*indent + 1), "", SvPVX_const(sv));
22834 if (OP(node) != OPTIMIZED) {
22835 if (next == NULL) /* Next ptr. */
22836 Perl_re_printf( aTHX_ " (0)");
22837 else if (PL_regkind[(U8)op] == BRANCH
22838 && PL_regkind[OP(next)] != BRANCH )
22839 Perl_re_printf( aTHX_ " (FAIL)");
22841 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22842 Perl_re_printf( aTHX_ "\n");
22846 if (PL_regkind[(U8)op] == BRANCHJ) {
22849 const regnode *nnode = (OP(next) == LONGJMP
22850 ? regnext((regnode *)next)
22852 if (last && nnode > last)
22854 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22857 else if (PL_regkind[(U8)op] == BRANCH) {
22859 DUMPUNTIL(NEXTOPER(node), next);
22861 else if ( PL_regkind[(U8)op] == TRIE ) {
22862 const regnode *this_trie = node;
22863 const char op = OP(node);
22864 const U32 n = ARG(node);
22865 const reg_ac_data * const ac = op>=AHOCORASICK ?
22866 (reg_ac_data *)ri->data->data[n] :
22868 const reg_trie_data * const trie =
22869 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22871 AV *const trie_words
22872 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22874 const regnode *nextbranch= NULL;
22877 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22878 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22880 Perl_re_indentf( aTHX_ "%s ",
22883 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22884 SvCUR(*elem_ptr), PL_dump_re_max_len,
22885 PL_colors[0], PL_colors[1],
22887 ? PERL_PV_ESCAPE_UNI
22889 | PERL_PV_PRETTY_ELLIPSES
22890 | PERL_PV_PRETTY_LTGT
22895 U16 dist= trie->jump[word_idx+1];
22896 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22897 (UV)((dist ? this_trie + dist : next) - start));
22900 nextbranch= this_trie + trie->jump[0];
22901 DUMPUNTIL(this_trie + dist, nextbranch);
22903 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22904 nextbranch= regnext((regnode *)nextbranch);
22906 Perl_re_printf( aTHX_ "\n");
22909 if (last && next > last)
22914 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22915 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22916 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22918 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22920 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22922 else if ( op == PLUS || op == STAR) {
22923 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22925 else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22926 /* Literal string, where present. */
22927 node += NODE_SZ_STR(node) - 1;
22928 node = NEXTOPER(node);
22931 node = NEXTOPER(node);
22932 node += regarglen[(U8)op];
22934 if (op == CURLYX || op == OPEN || op == SROPEN)
22938 #ifdef DEBUG_DUMPUNTIL
22939 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22944 #endif /* DEBUGGING */
22946 #ifndef PERL_IN_XSUB_RE
22948 # include "uni_keywords.h"
22951 Perl_init_uniprops(pTHX)
22956 char * dump_len_string;
22958 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22959 if ( ! dump_len_string
22960 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22962 PL_dump_re_max_len = 60; /* A reasonable default */
22966 PL_user_def_props = newHV();
22968 # ifdef USE_ITHREADS
22970 HvSHAREKEYS_off(PL_user_def_props);
22971 PL_user_def_props_aTHX = aTHX;
22975 /* Set up the inversion list interpreter-level variables */
22977 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22978 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22979 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22980 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22981 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22982 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22983 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22984 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22985 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22986 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22987 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22988 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22989 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22990 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22991 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22992 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22994 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22995 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22996 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22997 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22998 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22999 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23000 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23001 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23002 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23003 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23004 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23005 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23006 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23007 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23008 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23009 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23011 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23012 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23013 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23014 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23015 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23017 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23018 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23019 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23020 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23022 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23024 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23025 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23027 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23028 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23030 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23031 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23032 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23033 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23034 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23035 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23036 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23037 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23038 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23039 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23040 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23041 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23042 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23043 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23046 /* The below are used only by deprecated functions. They could be removed */
23047 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23048 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23049 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23053 /* These four functions are compiled only in regcomp.c, where they have access
23054 * to the data they return. They are a way for re_comp.c to get access to that
23055 * data without having to compile the whole data structures. */
23058 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23060 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23062 return match_uniprop((U8 *) key, key_len);
23066 Perl_get_prop_definition(pTHX_ const int table_index)
23068 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23070 /* Create and return the inversion list */
23071 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23074 const char * const *
23075 Perl_get_prop_values(const int table_index)
23077 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23079 return UNI_prop_value_ptrs[table_index];
23083 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23085 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23087 return deprecated_property_msgs[warning_offset];
23092 This code was mainly added for backcompat to give a warning for non-portable
23093 code points in user-defined properties. But experiments showed that the
23094 warning in earlier perls were only omitted on overflow, which should be an
23095 error, so there really isnt a backcompat issue, and actually adding the
23096 warning when none was present before might cause breakage, for little gain. So
23097 khw left this code in, but not enabled. Tests were never added.
23100 Ei |const char *|get_extended_utf8_msg|const UV cp
23102 PERL_STATIC_INLINE const char *
23103 S_get_extended_utf8_msg(pTHX_ const UV cp)
23105 U8 dummy[UTF8_MAXBYTES + 1];
23109 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23112 msg = hv_fetchs(msgs, "text", 0);
23115 (void) sv_2mortal((SV *) msgs);
23117 return SvPVX(*msg);
23121 #endif /* end of ! PERL_IN_XSUB_RE */
23124 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23125 const bool ignore_case)
23127 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23128 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23129 * because nothing outside of ASCII will match. Use /m because the input
23130 * string may be a bunch of lines strung together.
23132 * Also sets up the debugging info */
23134 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23136 SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23137 REGEXP * subpattern_re;
23138 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23140 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23145 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23147 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23148 rx_flags = flags & RXf_PMf_COMPILETIME;
23150 #ifndef PERL_IN_XSUB_RE
23151 /* Use the core engine if this file is regcomp.c. That means no
23152 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23153 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23154 &PL_core_reg_engine,
23158 if (isDEBUG_WILDCARD) {
23159 /* Use the special debugging engine if this file is re_comp.c and wants
23160 * to output the wildcard matching. This uses whatever
23161 * 'use re "Debug ..." is in effect */
23162 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23168 /* Use the special wildcard engine if this file is re_comp.c and
23169 * doesn't want to output the wildcard matching. This uses whatever
23170 * 'use re "Debug ..." is in effect for compilation, but this engine
23171 * structure has been set up so that it uses the core engine for
23172 * execution, so no execution debugging as a result of re.pm will be
23174 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23178 /* XXX The above has the effect that any user-supplied regex engine
23179 * won't be called for matching wildcards. That might be good, or bad.
23180 * It could be changed in several ways. The reason it is done the
23181 * current way is to avoid having to save and restore
23182 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
23183 * could be used. Another suggestion is to keep the authoritative
23184 * value of the debug flags in a thread-local variable and add set/get
23185 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23186 * Still another is to pass a flag, say in the engine's intflags that
23187 * would be checked each time before doing the debug output */
23191 assert(subpattern_re); /* Should have died if didn't compile successfully */
23192 return subpattern_re;
23196 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23197 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23200 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23202 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23206 /* The compilation has set things up so that if the program doesn't want to
23207 * see the wildcard matching procedure, it will get the core execution
23208 * engine, which is subject only to -Dr. So we have to turn that off
23209 * around this procedure */
23210 if (! isDEBUG_WILDCARD) {
23211 /* Note! Casts away 'volatile' */
23213 PL_debug &= ~ DEBUG_r_FLAG;
23216 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23224 S_handle_user_defined_property(pTHX_
23226 /* Parses the contents of a user-defined property definition; returning the
23227 * expanded definition if possible. If so, the return is an inversion
23230 * If there are subroutines that are part of the expansion and which aren't
23231 * known at the time of the call to this function, this returns what
23232 * parse_uniprop_string() returned for the first one encountered.
23234 * If an error was found, NULL is returned, and 'msg' gets a suitable
23235 * message appended to it. (Appending allows the back trace of how we got
23236 * to the faulty definition to be displayed through nested calls of
23237 * user-defined subs.)
23239 * The caller IS responsible for freeing any returned SV.
23241 * The syntax of the contents is pretty much described in perlunicode.pod,
23242 * but we also allow comments on each line */
23244 const char * name, /* Name of property */
23245 const STRLEN name_len, /* The name's length in bytes */
23246 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23247 const bool to_fold, /* ? Is this under /i */
23248 const bool runtime, /* ? Are we in compile- or run-time */
23249 const bool deferrable, /* Is it ok for this property's full definition
23250 to be deferred until later? */
23251 SV* contents, /* The property's definition */
23252 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
23253 getting called unless this is thought to be
23254 a user-defined property */
23255 SV * msg, /* Any error or warning msg(s) are appended to
23257 const STRLEN level) /* Recursion level of this call */
23260 const char * string = SvPV_const(contents, len);
23261 const char * const e = string + len;
23262 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23263 const STRLEN msgs_length_on_entry = SvCUR(msg);
23265 const char * s0 = string; /* Points to first byte in the current line
23266 being parsed in 'string' */
23267 const char overflow_msg[] = "Code point too large in \"";
23268 SV* running_definition = NULL;
23270 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23272 *user_defined_ptr = TRUE;
23274 /* Look at each line */
23276 const char * s; /* Current byte */
23277 char op = '+'; /* Default operation is 'union' */
23278 IV min = 0; /* range begin code point */
23279 IV max = -1; /* and range end */
23280 SV* this_definition;
23282 /* Skip comment lines */
23284 s0 = strchr(s0, '\n');
23292 /* For backcompat, allow an empty first line */
23298 /* First character in the line may optionally be the operation */
23307 /* If the line is one or two hex digits separated by blank space, its
23308 * a range; otherwise it is either another user-defined property or an
23313 if (! isXDIGIT(*s)) {
23314 goto check_if_property;
23317 do { /* Each new hex digit will add 4 bits. */
23318 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23319 s = strchr(s, '\n');
23323 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23324 sv_catpv(msg, overflow_msg);
23325 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23326 UTF8fARG(is_contents_utf8, s - s0, s0));
23327 sv_catpvs(msg, "\"");
23328 goto return_failure;
23331 /* Accumulate this digit into the value */
23332 min = (min << 4) + READ_XDIGIT(s);
23333 } while (isXDIGIT(*s));
23335 while (isBLANK(*s)) { s++; }
23337 /* We allow comments at the end of the line */
23339 s = strchr(s, '\n');
23345 else if (s < e && *s != '\n') {
23346 if (! isXDIGIT(*s)) {
23347 goto check_if_property;
23350 /* Look for the high point of the range */
23353 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23354 s = strchr(s, '\n');
23358 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23359 sv_catpv(msg, overflow_msg);
23360 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23361 UTF8fARG(is_contents_utf8, s - s0, s0));
23362 sv_catpvs(msg, "\"");
23363 goto return_failure;
23366 max = (max << 4) + READ_XDIGIT(s);
23367 } while (isXDIGIT(*s));
23369 while (isBLANK(*s)) { s++; }
23372 s = strchr(s, '\n');
23377 else if (s < e && *s != '\n') {
23378 goto check_if_property;
23382 if (max == -1) { /* The line only had one entry */
23385 else if (max < min) {
23386 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23387 sv_catpvs(msg, "Illegal range in \"");
23388 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23389 UTF8fARG(is_contents_utf8, s - s0, s0));
23390 sv_catpvs(msg, "\"");
23391 goto return_failure;
23394 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
23396 if ( UNICODE_IS_PERL_EXTENDED(min)
23397 || UNICODE_IS_PERL_EXTENDED(max))
23399 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23401 /* If both code points are non-portable, warn only on the lower
23403 sv_catpv(msg, get_extended_utf8_msg(
23404 (UNICODE_IS_PERL_EXTENDED(min))
23406 sv_catpvs(msg, " in \"");
23407 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23408 UTF8fARG(is_contents_utf8, s - s0, s0));
23409 sv_catpvs(msg, "\"");
23414 /* Here, this line contains a legal range */
23415 this_definition = sv_2mortal(_new_invlist(2));
23416 this_definition = _add_range_to_invlist(this_definition, min, max);
23421 /* Here it isn't a legal range line. See if it is a legal property
23422 * line. First find the end of the meat of the line */
23423 s = strpbrk(s, "#\n");
23428 /* Ignore trailing blanks in keeping with the requirements of
23429 * parse_uniprop_string() */
23431 while (s > s0 && isBLANK_A(*s)) {
23436 this_definition = parse_uniprop_string(s0, s - s0,
23437 is_utf8, to_fold, runtime,
23440 user_defined_ptr, msg,
23442 ? level /* Don't increase level
23443 if input is empty */
23446 if (this_definition == NULL) {
23447 goto return_failure; /* 'msg' should have had the reason
23448 appended to it by the above call */
23451 if (! is_invlist(this_definition)) { /* Unknown at this time */
23452 return newSVsv(this_definition);
23456 s = strchr(s, '\n');
23466 _invlist_union(running_definition, this_definition,
23467 &running_definition);
23470 _invlist_subtract(running_definition, this_definition,
23471 &running_definition);
23474 _invlist_intersection(running_definition, this_definition,
23475 &running_definition);
23478 _invlist_union_complement_2nd(running_definition,
23479 this_definition, &running_definition);
23482 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23483 __FILE__, __LINE__, op);
23487 /* Position past the '\n' */
23489 } /* End of loop through the lines of 'contents' */
23491 /* Here, we processed all the lines in 'contents' without error. If we
23492 * didn't add any warnings, simply return success */
23493 if (msgs_length_on_entry == SvCUR(msg)) {
23495 /* If the expansion was empty, the answer isn't nothing: its an empty
23496 * inversion list */
23497 if (running_definition == NULL) {
23498 running_definition = _new_invlist(1);
23501 return running_definition;
23504 /* Otherwise, add some explanatory text, but we will return success */
23508 running_definition = NULL;
23512 if (name_len > 0) {
23513 sv_catpvs(msg, " in expansion of ");
23514 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23517 return running_definition;
23520 /* As explained below, certain operations need to take place in the first
23521 * thread created. These macros switch contexts */
23522 # ifdef USE_ITHREADS
23523 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23524 PerlInterpreter * save_aTHX = aTHX;
23525 # define SWITCH_TO_GLOBAL_CONTEXT \
23526 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23527 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23528 # define CUR_CONTEXT aTHX
23529 # define ORIGINAL_CONTEXT save_aTHX
23531 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
23532 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23533 # define RESTORE_CONTEXT NOOP
23534 # define CUR_CONTEXT NULL
23535 # define ORIGINAL_CONTEXT NULL
23539 S_delete_recursion_entry(pTHX_ void *key)
23541 /* Deletes the entry used to detect recursion when expanding user-defined
23542 * properties. This is a function so it can be set up to be called even if
23543 * the program unexpectedly quits */
23546 SV ** current_entry;
23547 const STRLEN key_len = strlen((const char *) key);
23548 DECLARATION_FOR_GLOBAL_CONTEXT;
23550 SWITCH_TO_GLOBAL_CONTEXT;
23552 /* If the entry is one of these types, it is a permanent entry, and not the
23553 * one used to detect recursions. This function should delete only the
23554 * recursion entry */
23555 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23557 && ! is_invlist(*current_entry)
23558 && ! SvPOK(*current_entry))
23560 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23568 S_get_fq_name(pTHX_
23569 const char * const name, /* The first non-blank in the \p{}, \P{} */
23570 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23571 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23572 const bool has_colon_colon
23575 /* Returns a mortal SV containing the fully qualified version of the input
23580 fq_name = newSVpvs_flags("", SVs_TEMP);
23582 /* Use the current package if it wasn't included in our input */
23583 if (! has_colon_colon) {
23584 const HV * pkg = (IN_PERL_COMPILETIME)
23586 : CopSTASH(PL_curcop);
23587 const char* pkgname = HvNAME(pkg);
23589 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23590 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23591 sv_catpvs(fq_name, "::");
23594 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23595 UTF8fARG(is_utf8, name_len, name));
23600 S_parse_uniprop_string(pTHX_
23602 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23603 * now. If so, the return is an inversion list.
23605 * If the property is user-defined, it is a subroutine, which in turn
23606 * may call other subroutines. This function will call the whole nest of
23607 * them to get the definition they return; if some aren't known at the time
23608 * of the call to this function, the fully qualified name of the highest
23609 * level sub is returned. It is an error to call this function at runtime
23610 * without every sub defined.
23612 * If an error was found, NULL is returned, and 'msg' gets a suitable
23613 * message appended to it. (Appending allows the back trace of how we got
23614 * to the faulty definition to be displayed through nested calls of
23615 * user-defined subs.)
23617 * The caller should NOT try to free any returned inversion list.
23619 * Other parameters will be set on return as described below */
23621 const char * const name, /* The first non-blank in the \p{}, \P{} */
23622 Size_t name_len, /* Its length in bytes, not including any
23624 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23625 const bool to_fold, /* ? Is this under /i */
23626 const bool runtime, /* TRUE if this is being called at run time */
23627 const bool deferrable, /* TRUE if it's ok for the definition to not be
23628 known at this call */
23629 AV ** strings, /* To return string property values, like named
23631 bool *user_defined_ptr, /* Upon return from this function it will be
23632 set to TRUE if any component is a
23633 user-defined property */
23634 SV * msg, /* Any error or warning msg(s) are appended to
23636 const STRLEN level) /* Recursion level of this call */
23639 char* lookup_name; /* normalized name for lookup in our tables */
23640 unsigned lookup_len; /* Its length */
23641 enum { Not_Strict = 0, /* Some properties have stricter name */
23642 Strict, /* normalization rules, which we decide */
23643 As_Is /* upon based on parsing */
23644 } stricter = Not_Strict;
23646 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23647 * (though it requires extra effort to download them from Unicode and
23648 * compile perl to know about them) */
23649 bool is_nv_type = FALSE;
23651 unsigned int i, j = 0;
23652 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23653 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23654 int table_index = 0; /* The entry number for this property in the table
23655 of all Unicode property names */
23656 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23657 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23658 the normalized name in certain situations */
23659 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23660 part of a package name */
23661 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
23662 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23663 property rather than a Unicode
23665 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23666 if an error. If it is an inversion list,
23667 it is the definition. Otherwise it is a
23668 string containing the fully qualified sub
23670 SV * fq_name = NULL; /* For user-defined properties, the fully
23672 bool invert_return = FALSE; /* ? Do we need to complement the result before
23674 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23675 explicit utf8:: package that we strip
23677 /* The expansion of properties that could be either user-defined or
23678 * official unicode ones is deferred until runtime, including a marker for
23679 * those that might be in the latter category. This boolean indicates if
23680 * we've seen that marker. If not, what we're parsing can't be such an
23681 * official Unicode property whose expansion was deferred */
23682 bool could_be_deferred_official = FALSE;
23684 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23686 /* The input will be normalized into 'lookup_name' */
23687 Newx(lookup_name, name_len, char);
23688 SAVEFREEPV(lookup_name);
23690 /* Parse the input. */
23691 for (i = 0; i < name_len; i++) {
23692 char cur = name[i];
23694 /* Most of the characters in the input will be of this ilk, being parts
23696 if (isIDCONT_A(cur)) {
23698 /* Case differences are ignored. Our lookup routine assumes
23699 * everything is lowercase, so normalize to that */
23700 if (isUPPER_A(cur)) {
23701 lookup_name[j++] = toLOWER_A(cur);
23705 if (cur == '_') { /* Don't include these in the normalized name */
23709 lookup_name[j++] = cur;
23711 /* The first character in a user-defined name must be of this type.
23713 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23714 could_be_user_defined = FALSE;
23720 /* Here, the character is not something typically in a name, But these
23721 * two types of characters (and the '_' above) can be freely ignored in
23722 * most situations. Later it may turn out we shouldn't have ignored
23723 * them, and we have to reparse, but we don't have enough information
23724 * yet to make that decision */
23725 if (cur == '-' || isSPACE_A(cur)) {
23726 could_be_user_defined = FALSE;
23730 /* An equals sign or single colon mark the end of the first part of
23731 * the property name */
23733 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23735 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23736 equals_pos = j; /* Note where it occurred in the input */
23737 could_be_user_defined = FALSE;
23741 /* If this looks like it is a marker we inserted at compile time,
23742 * set a flag and otherwise ignore it. If it isn't in the final
23743 * position, keep it as it would have been user input. */
23744 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23746 && could_be_user_defined
23747 && i == name_len - 1)
23750 could_be_deferred_official = TRUE;
23754 /* Otherwise, this character is part of the name. */
23755 lookup_name[j++] = cur;
23757 /* Here it isn't a single colon, so if it is a colon, it must be a
23761 /* A double colon should be a package qualifier. We note its
23762 * position and continue. Note that one could have
23763 * pkg1::pkg2::...::foo
23764 * so that the position at the end of the loop will be just after
23765 * the final qualifier */
23768 non_pkg_begin = i + 1;
23769 lookup_name[j++] = ':';
23770 lun_non_pkg_begin = j;
23772 else { /* Only word chars (and '::') can be in a user-defined name */
23773 could_be_user_defined = FALSE;
23775 } /* End of parsing through the lhs of the property name (or all of it if
23778 # define STRLENs(s) (sizeof("" s "") - 1)
23780 /* If there is a single package name 'utf8::', it is ambiguous. It could
23781 * be for a user-defined property, or it could be a Unicode property, as
23782 * all of them are considered to be for that package. For the purposes of
23783 * parsing the rest of the property, strip it off */
23784 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23785 lookup_name += STRLENs("utf8::");
23786 j -= STRLENs("utf8::");
23787 equals_pos -= STRLENs("utf8::");
23788 stripped_utf8_pkg = TRUE;
23791 /* Here, we are either done with the whole property name, if it was simple;
23792 * or are positioned just after the '=' if it is compound. */
23794 if (equals_pos >= 0) {
23795 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23797 /* Space immediately after the '=' is ignored */
23799 for (; i < name_len; i++) {
23800 if (! isSPACE_A(name[i])) {
23805 /* Most punctuation after the equals indicates a subpattern, like
23807 if ( isPUNCT_A(name[i])
23812 /* A backslash means the real delimitter is the next character,
23813 * but it must be punctuation */
23814 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23816 bool special_property = memEQs(lookup_name, j - 1, "name")
23817 || memEQs(lookup_name, j - 1, "na");
23818 if (! special_property) {
23819 /* Find the property. The table includes the equals sign, so
23820 * we use 'j' as-is */
23821 table_index = do_uniprop_match(lookup_name, j);
23823 if (special_property || table_index) {
23824 REGEXP * subpattern_re;
23825 char open = name[i++];
23827 const char * pos_in_brackets;
23828 const char * const * prop_values;
23831 /* Backslash => delimitter is the character following. We
23832 * already checked that it is punctuation */
23833 if (open == '\\') {
23838 /* This data structure is constructed so that the matching
23839 * closing bracket is 3 past its matching opening. The second
23840 * set of closing is so that if the opening is something like
23841 * ']', the closing will be that as well. Something similar is
23842 * done in toke.c */
23843 pos_in_brackets = memCHRs("([<)]>)]>", open);
23844 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23847 || name[name_len-1] != close
23848 || (escaped && name[name_len-2] != '\\')
23849 /* Also make sure that there are enough characters.
23850 * e.g., '\\\' would show up incorrectly as legal even
23851 * though it is too short */
23852 || (SSize_t) (name_len - i - 1 - escaped) < 0)
23854 sv_catpvs(msg, "Unicode property wildcard not terminated");
23855 goto append_name_to_msg;
23858 Perl_ck_warner_d(aTHX_
23859 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23860 "The Unicode property wildcards feature is experimental");
23862 if (special_property) {
23863 const char * error_msg;
23864 const char * revised_name = name + i;
23865 Size_t revised_name_len = name_len - (i + 1 + escaped);
23867 /* Currently, the only 'special_property' is name, which we
23868 * lookup in _charnames.pm */
23870 if (! load_charnames(newSVpvs("placeholder"),
23871 revised_name, revised_name_len,
23874 sv_catpv(msg, error_msg);
23875 goto append_name_to_msg;
23878 /* Farm this out to a function just to make the current
23879 * function less unwieldy */
23880 if (handle_names_wildcard(revised_name, revised_name_len,
23884 return prop_definition;
23890 prop_values = get_prop_values(table_index);
23892 /* Now create and compile the wildcard subpattern. Use /i
23893 * because the property values are supposed to match with case
23895 subpattern_re = compile_wildcard(name + i,
23896 name_len - i - 1 - escaped,
23900 /* For each legal property value, see if the supplied pattern
23902 while (*prop_values) {
23903 const char * const entry = *prop_values;
23904 const Size_t len = strlen(entry);
23905 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23907 if (execute_wildcard(subpattern_re,
23909 (char *) entry + len,
23913 { /* Here, matched. Add to the returned list */
23914 Size_t total_len = j + len;
23915 SV * sub_invlist = NULL;
23916 char * this_string;
23918 /* We know this is a legal \p{property=value}. Call
23919 * the function to return the list of code points that
23921 Newxz(this_string, total_len + 1, char);
23922 Copy(lookup_name, this_string, j, char);
23923 my_strlcat(this_string, entry, total_len + 1);
23924 SAVEFREEPV(this_string);
23925 sub_invlist = parse_uniprop_string(this_string,
23935 _invlist_union(prop_definition, sub_invlist,
23939 prop_values++; /* Next iteration, look at next propvalue */
23940 } /* End of looking through property values; (the data
23941 structure is terminated by a NULL ptr) */
23943 SvREFCNT_dec_NN(subpattern_re);
23945 if (prop_definition) {
23946 return prop_definition;
23949 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23950 goto append_name_to_msg;
23953 /* Here's how khw thinks we should proceed to handle the properties
23954 * not yet done: Bidi Mirroring Glyph can map to ""
23955 Bidi Paired Bracket can map to ""
23956 Case Folding (both full and simple)
23957 Shouldn't /i be good enough for Full
23958 Decomposition Mapping
23959 Equivalent Unified Ideograph can map to ""
23960 Lowercase Mapping (both full and simple)
23961 NFKC Case Fold can map to ""
23962 Titlecase Mapping (both full and simple)
23963 Uppercase Mapping (both full and simple)
23964 * Handle these the same way Name is done, using say, _wild.pm, but
23965 * having both loose and full, like in charclass_invlists.h.
23966 * Perhaps move block and script to that as they are somewhat large
23967 * in charclass_invlists.h.
23968 * For properties where the default is the code point itself, such
23969 * as any of the case changing mappings, the string would otherwise
23970 * consist of all Unicode code points in UTF-8 strung together.
23971 * This would be impractical. So instead, examine their compiled
23972 * pattern, looking at the ssc. If none, reject the pattern as an
23973 * error. Otherwise run the pattern against every code point in
23974 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23975 * And it might be good to create an API to return the ssc.
23976 * Or handle them like the algorithmic names are done
23978 } /* End of is a wildcard subppattern */
23980 /* \p{name=...} is handled specially. Instead of using the normal
23981 * mechanism involving charclass_invlists.h, it uses _charnames.pm
23982 * which has the necessary (huge) data accessible to it, and which
23983 * doesn't get loaded unless necessary. The legal syntax for names is
23984 * somewhat different than other properties due both to the vagaries of
23985 * a few outlier official names, and the fact that only a few ASCII
23986 * characters are permitted in them */
23987 if ( memEQs(lookup_name, j - 1, "name")
23988 || memEQs(lookup_name, j - 1, "na"))
23993 const char * error_msg;
23995 SV * character_name;
23996 STRLEN character_len;
24001 /* Since the RHS (after skipping initial space) is passed unchanged
24002 * to charnames, and there are different criteria for what are
24003 * legal characters in the name, just parse it here. A character
24004 * name must begin with an ASCII alphabetic */
24005 if (! isALPHA(name[i])) {
24008 lookup_name[j++] = name[i];
24010 for (++i; i < name_len; i++) {
24011 /* Official names can only be in the ASCII range, and only
24012 * certain characters */
24013 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24016 lookup_name[j++] = name[i];
24019 /* Finished parsing, save the name into an SV */
24020 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24022 /* Make sure _charnames is loaded. (The parameters give context
24023 * for any errors generated */
24024 table = load_charnames(character_name, name, name_len, &error_msg);
24025 if (table == NULL) {
24026 sv_catpv(msg, error_msg);
24027 goto append_name_to_msg;
24030 lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
24031 if (! lookup_loose) {
24033 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24036 PUSHSTACKi(PERLSI_REGCOMP);
24042 XPUSHs(character_name);
24044 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24049 SvREFCNT_inc_simple_void_NN(character);
24056 if (! SvOK(character)) {
24060 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24061 if (character_len == SvCUR(character)) {
24062 prop_definition = add_cp_to_invlist(NULL, cp);
24067 /* First of the remaining characters in the string. */
24068 char * remaining = SvPVX(character) + character_len;
24070 if (strings == NULL) {
24071 goto failed; /* XXX Perhaps a specific msg instead, like
24072 'not available here' */
24075 if (*strings == NULL) {
24076 *strings = newAV();
24079 this_string = newAV();
24080 av_push(this_string, newSVuv(cp));
24083 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24084 av_push(this_string, newSVuv(cp));
24085 remaining += character_len;
24086 } while (remaining < SvEND(character));
24088 av_push(*strings, (SV *) this_string);
24091 return prop_definition;
24094 /* Certain properties whose values are numeric need special handling.
24095 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24096 * purposes of checking if this is one of those properties */
24097 if (memBEGINPs(lookup_name, j, "is")) {
24101 /* Then check if it is one of these specially-handled properties. The
24102 * possibilities are hard-coded because easier this way, and the list
24103 * is unlikely to change.
24105 * All numeric value type properties are of this ilk, and are also
24106 * special in a different way later on. So find those first. There
24107 * are several numeric value type properties in the Unihan DB (which is
24108 * unlikely to be compiled with perl, but we handle it here in case it
24109 * does get compiled). They all end with 'numeric'. The interiors
24110 * aren't checked for the precise property. This would stop working if
24111 * a cjk property were to be created that ended with 'numeric' and
24112 * wasn't a numeric type */
24113 is_nv_type = memEQs(lookup_name + lookup_offset,
24114 j - 1 - lookup_offset, "numericvalue")
24115 || memEQs(lookup_name + lookup_offset,
24116 j - 1 - lookup_offset, "nv")
24117 || ( memENDPs(lookup_name + lookup_offset,
24118 j - 1 - lookup_offset, "numeric")
24119 && ( memBEGINPs(lookup_name + lookup_offset,
24120 j - 1 - lookup_offset, "cjk")
24121 || memBEGINPs(lookup_name + lookup_offset,
24122 j - 1 - lookup_offset, "k")));
24124 || memEQs(lookup_name + lookup_offset,
24125 j - 1 - lookup_offset, "canonicalcombiningclass")
24126 || memEQs(lookup_name + lookup_offset,
24127 j - 1 - lookup_offset, "ccc")
24128 || memEQs(lookup_name + lookup_offset,
24129 j - 1 - lookup_offset, "age")
24130 || memEQs(lookup_name + lookup_offset,
24131 j - 1 - lookup_offset, "in")
24132 || memEQs(lookup_name + lookup_offset,
24133 j - 1 - lookup_offset, "presentin"))
24137 /* Since the stuff after the '=' is a number, we can't throw away
24138 * '-' willy-nilly, as those could be a minus sign. Other stricter
24139 * rules also apply. However, these properties all can have the
24140 * rhs not be a number, in which case they contain at least one
24141 * alphabetic. In those cases, the stricter rules don't apply.
24142 * But the numeric type properties can have the alphas [Ee] to
24143 * signify an exponent, and it is still a number with stricter
24144 * rules. So look for an alpha that signifies not-strict */
24146 for (k = i; k < name_len; k++) {
24147 if ( isALPHA_A(name[k])
24148 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24150 stricter = Not_Strict;
24158 /* A number may have a leading '+' or '-'. The latter is retained
24160 if (name[i] == '+') {
24163 else if (name[i] == '-') {
24164 lookup_name[j++] = '-';
24168 /* Skip leading zeros including single underscores separating the
24169 * zeros, or between the final leading zero and the first other
24171 for (; i < name_len - 1; i++) {
24172 if ( name[i] != '0'
24173 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24180 else { /* No '=' */
24182 /* Only a few properties without an '=' should be parsed with stricter
24183 * rules. The list is unlikely to change. */
24184 if ( memBEGINPs(lookup_name, j, "perl")
24185 && memNEs(lookup_name + 4, j - 4, "space")
24186 && memNEs(lookup_name + 4, j - 4, "word"))
24190 /* We set the inputs back to 0 and the code below will reparse,
24196 /* Here, we have either finished the property, or are positioned to parse
24197 * the remainder, and we know if stricter rules apply. Finish out, if not
24199 for (; i < name_len; i++) {
24200 char cur = name[i];
24202 /* In all instances, case differences are ignored, and we normalize to
24204 if (isUPPER_A(cur)) {
24205 lookup_name[j++] = toLOWER(cur);
24209 /* An underscore is skipped, but not under strict rules unless it
24210 * separates two digits */
24213 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
24214 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24216 lookup_name[j++] = '_';
24221 /* Hyphens are skipped except under strict */
24222 if (cur == '-' && ! stricter) {
24226 /* XXX Bug in documentation. It says white space skipped adjacent to
24227 * non-word char. Maybe we should, but shouldn't skip it next to a dot
24229 if (isSPACE_A(cur) && ! stricter) {
24233 lookup_name[j++] = cur;
24235 /* Unless this is a non-trailing slash, we are done with it */
24236 if (i >= name_len - 1 || cur != '/') {
24242 /* A slash in the 'numeric value' property indicates that what follows
24243 * is a denominator. It can have a leading '+' and '0's that should be
24244 * skipped. But we have never allowed a negative denominator, so treat
24245 * a minus like every other character. (No need to rule out a second
24246 * '/', as that won't match anything anyway */
24249 if (i < name_len && name[i] == '+') {
24253 /* Skip leading zeros including underscores separating digits */
24254 for (; i < name_len - 1; i++) {
24255 if ( name[i] != '0'
24256 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24262 /* Store the first real character in the denominator */
24263 if (i < name_len) {
24264 lookup_name[j++] = name[i];
24269 /* Here are completely done parsing the input 'name', and 'lookup_name'
24270 * contains a copy, normalized.
24272 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24273 * different from without the underscores. */
24274 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
24275 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24276 && UNLIKELY(name[name_len-1] == '_'))
24278 lookup_name[j++] = '&';
24281 /* If the original input began with 'In' or 'Is', it could be a subroutine
24282 * call to a user-defined property instead of a Unicode property name. */
24283 if ( name_len - non_pkg_begin > 2
24284 && name[non_pkg_begin+0] == 'I'
24285 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24287 /* Names that start with In have different characterstics than those
24288 * that start with Is */
24289 if (name[non_pkg_begin+1] == 's') {
24290 starts_with_Is = TRUE;
24294 could_be_user_defined = FALSE;
24297 if (could_be_user_defined) {
24300 /* If the user defined property returns the empty string, it could
24301 * easily be because the pattern is being compiled before the data it
24302 * actually needs to compile is available. This could be argued to be
24303 * a bug in the perl code, but this is a change of behavior for Perl,
24304 * so we handle it. This means that intentionally returning nothing
24305 * will not be resolved until runtime */
24306 bool empty_return = FALSE;
24308 /* Here, the name could be for a user defined property, which are
24309 * implemented as subs. */
24310 user_sub = get_cvn_flags(name, name_len, 0);
24313 /* Here, the property name could be a user-defined one, but there
24314 * is no subroutine to handle it (as of now). Defer handling it
24315 * until runtime. Otherwise, a block defined by Unicode in a later
24316 * release would get the synonym InFoo added for it, and existing
24317 * code that used that name would suddenly break if it referred to
24318 * the property before the sub was declared. See [perl #134146] */
24320 goto definition_deferred;
24323 /* Here, we are at runtime, and didn't find the user property. It
24324 * could be an official property, but only if no package was
24325 * specified, or just the utf8:: package. */
24326 if (could_be_deferred_official) {
24327 lookup_name += lun_non_pkg_begin;
24328 j -= lun_non_pkg_begin;
24330 else if (! stripped_utf8_pkg) {
24331 goto unknown_user_defined;
24334 /* Drop down to look up in the official properties */
24337 const char insecure[] = "Insecure user-defined property";
24339 /* Here, there is a sub by the correct name. Normally we call it
24340 * to get the property definition */
24342 SV * user_sub_sv = MUTABLE_SV(user_sub);
24343 SV * error; /* Any error returned by calling 'user_sub' */
24344 SV * key; /* The key into the hash of user defined sub names
24347 SV ** saved_user_prop_ptr; /* Hash entry for this property */
24349 /* How many times to retry when another thread is in the middle of
24350 * expanding the same definition we want */
24351 PERL_INT_FAST8_T retry_countdown = 10;
24353 DECLARATION_FOR_GLOBAL_CONTEXT;
24355 /* If we get here, we know this property is user-defined */
24356 *user_defined_ptr = TRUE;
24358 /* We refuse to call a potentially tainted subroutine; returning an
24361 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24362 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24363 goto append_name_to_msg;
24366 /* In principal, we only call each subroutine property definition
24367 * once during the life of the program. This guarantees that the
24368 * property definition never changes. The results of the single
24369 * sub call are stored in a hash, which is used instead for future
24370 * references to this property. The property definition is thus
24371 * immutable. But, to allow the user to have a /i-dependent
24372 * definition, we call the sub once for non-/i, and once for /i,
24373 * should the need arise, passing the /i status as a parameter.
24375 * We start by constructing the hash key name, consisting of the
24376 * fully qualified subroutine name, preceded by the /i status, so
24377 * that there is a key for /i and a different key for non-/i */
24378 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24379 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24380 non_pkg_begin != 0);
24381 sv_catsv(key, fq_name);
24384 /* We only call the sub once throughout the life of the program
24385 * (with the /i, non-/i exception noted above). That means the
24386 * hash must be global and accessible to all threads. It is
24387 * created at program start-up, before any threads are created, so
24388 * is accessible to all children. But this creates some
24391 * 1) The keys can't be shared, or else problems arise; sharing is
24392 * turned off at hash creation time
24393 * 2) All SVs in it are there for the remainder of the life of the
24394 * program, and must be created in the same interpreter context
24395 * as the hash, or else they will be freed from the wrong pool
24396 * at global destruction time. This is handled by switching to
24397 * the hash's context to create each SV going into it, and then
24398 * immediately switching back
24399 * 3) All accesses to the hash must be controlled by a mutex, to
24400 * prevent two threads from getting an unstable state should
24401 * they simultaneously be accessing it. The code below is
24402 * crafted so that the mutex is locked whenever there is an
24403 * access and unlocked only when the next stable state is
24406 * The hash stores either the definition of the property if it was
24407 * valid, or, if invalid, the error message that was raised. We
24408 * use the type of SV to distinguish.
24410 * There's also the need to guard against the definition expansion
24411 * from infinitely recursing. This is handled by storing the aTHX
24412 * of the expanding thread during the expansion. Again the SV type
24413 * is used to distinguish this from the other two cases. If we
24414 * come to here and the hash entry for this property is our aTHX,
24415 * it means we have recursed, and the code assumes that we would
24416 * infinitely recurse, so instead stops and raises an error.
24417 * (Any recursion has always been treated as infinite recursion in
24420 * If instead, the entry is for a different aTHX, it means that
24421 * that thread has gotten here first, and hasn't finished expanding
24422 * the definition yet. We just have to wait until it is done. We
24423 * sleep and retry a few times, returning an error if the other
24424 * thread doesn't complete. */
24427 USER_PROP_MUTEX_LOCK;
24429 /* If we have an entry for this key, the subroutine has already
24430 * been called once with this /i status. */
24431 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24432 SvPVX(key), SvCUR(key), 0);
24433 if (saved_user_prop_ptr) {
24435 /* If the saved result is an inversion list, it is the valid
24436 * definition of this property */
24437 if (is_invlist(*saved_user_prop_ptr)) {
24438 prop_definition = *saved_user_prop_ptr;
24440 /* The SV in the hash won't be removed until global
24441 * destruction, so it is stable and we can unlock */
24442 USER_PROP_MUTEX_UNLOCK;
24444 /* The caller shouldn't try to free this SV */
24445 return prop_definition;
24448 /* Otherwise, if it is a string, it is the error message
24449 * that was returned when we first tried to evaluate this
24450 * property. Fail, and append the message */
24451 if (SvPOK(*saved_user_prop_ptr)) {
24452 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24453 sv_catsv(msg, *saved_user_prop_ptr);
24455 /* The SV in the hash won't be removed until global
24456 * destruction, so it is stable and we can unlock */
24457 USER_PROP_MUTEX_UNLOCK;
24462 assert(SvIOK(*saved_user_prop_ptr));
24464 /* Here, we have an unstable entry in the hash. Either another
24465 * thread is in the middle of expanding the property's
24466 * definition, or we are ourselves recursing. We use the aTHX
24467 * in it to distinguish */
24468 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24470 /* Here, it's another thread doing the expanding. We've
24471 * looked as much as we are going to at the contents of the
24472 * hash entry. It's safe to unlock. */
24473 USER_PROP_MUTEX_UNLOCK;
24475 /* Retry a few times */
24476 if (retry_countdown-- > 0) {
24481 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24482 sv_catpvs(msg, "Timeout waiting for another thread to "
24484 goto append_name_to_msg;
24487 /* Here, we are recursing; don't dig any deeper */
24488 USER_PROP_MUTEX_UNLOCK;
24490 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24492 "Infinite recursion in user-defined property");
24493 goto append_name_to_msg;
24496 /* Here, this thread has exclusive control, and there is no entry
24497 * for this property in the hash. So we have the go ahead to
24498 * expand the definition ourselves. */
24500 PUSHSTACKi(PERLSI_REGCOMP);
24503 /* Create a temporary placeholder in the hash to detect recursion
24505 SWITCH_TO_GLOBAL_CONTEXT;
24506 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24507 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24510 /* Now that we have a placeholder, we can let other threads
24512 USER_PROP_MUTEX_UNLOCK;
24514 /* Make sure the placeholder always gets destroyed */
24515 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24520 /* Call the user's function, with the /i status as a parameter.
24521 * Note that we have gone to a lot of trouble to keep this call
24522 * from being within the locked mutex region. */
24523 XPUSHs(boolSV(to_fold));
24526 /* The following block was taken from swash_init(). Presumably
24527 * they apply to here as well, though we no longer use a swash --
24531 /* We might get here via a subroutine signature which uses a utf8
24532 * parameter name, at which point PL_subname will have been set
24533 * but not yet used. */
24534 save_item(PL_subname);
24536 /* G_SCALAR guarantees a single return value */
24537 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24542 if (TAINT_get || SvTRUE(error)) {
24543 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24544 if (SvTRUE(error)) {
24545 sv_catpvs(msg, "Error \"");
24546 sv_catsv(msg, error);
24547 sv_catpvs(msg, "\"");
24550 if (SvTRUE(error)) sv_catpvs(msg, "; ");
24551 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24554 if (name_len > 0) {
24555 sv_catpvs(msg, " in expansion of ");
24556 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24562 prop_definition = NULL;
24565 SV * contents = POPs;
24567 /* The contents is supposed to be the expansion of the property
24568 * definition. If the definition is deferrable, and we got an
24569 * empty string back, set a flag to later defer it (after clean
24572 && (! SvPOK(contents) || SvCUR(contents) == 0))
24574 empty_return = TRUE;
24576 else { /* Otherwise, call a function to check for valid syntax,
24579 prop_definition = handle_user_defined_property(
24581 is_utf8, to_fold, runtime,
24583 contents, user_defined_ptr,
24589 /* Here, we have the results of the expansion. Delete the
24590 * placeholder, and if the definition is now known, replace it with
24591 * that definition. We need exclusive access to the hash, and we
24592 * can't let anyone else in, between when we delete the placeholder
24593 * and add the permanent entry */
24594 USER_PROP_MUTEX_LOCK;
24596 S_delete_recursion_entry(aTHX_ SvPVX(key));
24598 if ( ! empty_return
24599 && (! prop_definition || is_invlist(prop_definition)))
24601 /* If we got success we use the inversion list defining the
24602 * property; otherwise use the error message */
24603 SWITCH_TO_GLOBAL_CONTEXT;
24604 (void) hv_store_ent(PL_user_def_props,
24607 ? newSVsv(prop_definition)
24613 /* All done, and the hash now has a permanent entry for this
24614 * property. Give up exclusive control */
24615 USER_PROP_MUTEX_UNLOCK;
24621 if (empty_return) {
24622 goto definition_deferred;
24625 if (prop_definition) {
24627 /* If the definition is for something not known at this time,
24628 * we toss it, and go return the main property name, as that's
24629 * the one the user will be aware of */
24630 if (! is_invlist(prop_definition)) {
24631 SvREFCNT_dec_NN(prop_definition);
24632 goto definition_deferred;
24635 sv_2mortal(prop_definition);
24639 return prop_definition;
24641 } /* End of calling the subroutine for the user-defined property */
24642 } /* End of it could be a user-defined property */
24644 /* Here it wasn't a user-defined property that is known at this time. See
24645 * if it is a Unicode property */
24647 lookup_len = j; /* This is a more mnemonic name than 'j' */
24649 /* Get the index into our pointer table of the inversion list corresponding
24650 * to the property */
24651 table_index = do_uniprop_match(lookup_name, lookup_len);
24653 /* If it didn't find the property ... */
24654 if (table_index == 0) {
24656 /* Try again stripping off any initial 'Is'. This is because we
24657 * promise that an initial Is is optional. The same isn't true of
24658 * names that start with 'In'. Those can match only blocks, and the
24659 * lookup table already has those accounted for. */
24660 if (starts_with_Is) {
24666 table_index = do_uniprop_match(lookup_name, lookup_len);
24669 if (table_index == 0) {
24672 /* Here, we didn't find it. If not a numeric type property, and
24673 * can't be a user-defined one, it isn't a legal property */
24674 if (! is_nv_type) {
24675 if (! could_be_user_defined) {
24679 /* Here, the property name is legal as a user-defined one. At
24680 * compile time, it might just be that the subroutine for that
24681 * property hasn't been encountered yet, but at runtime, it's
24682 * an error to try to use an undefined one */
24683 if (! deferrable) {
24684 goto unknown_user_defined;;
24687 goto definition_deferred;
24688 } /* End of isn't a numeric type property */
24690 /* The numeric type properties need more work to decide. What we
24691 * do is make sure we have the number in canonical form and look
24694 if (slash_pos < 0) { /* No slash */
24696 /* When it isn't a rational, take the input, convert it to a
24697 * NV, then create a canonical string representation of that
24701 SSize_t value_len = lookup_len - equals_pos;
24703 /* Get the value */
24704 if ( value_len <= 0
24705 || my_atof3(lookup_name + equals_pos, &value,
24707 != lookup_name + lookup_len)
24712 /* If the value is an integer, the canonical value is integral
24714 if (Perl_ceil(value) == value) {
24715 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24716 equals_pos, lookup_name, value);
24718 else { /* Otherwise, it is %e with a known precision */
24721 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24722 equals_pos, lookup_name,
24723 PL_E_FORMAT_PRECISION, value);
24725 /* The exponent generated is expecting two digits, whereas
24726 * %e on some systems will generate three. Remove leading
24727 * zeros in excess of 2 from the exponent. We start
24728 * looking for them after the '=' */
24729 exp_ptr = strchr(canonical + equals_pos, 'e');
24731 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24732 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24734 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24736 if (excess_exponent_len > 0) {
24737 SSize_t leading_zeros = strspn(cur_ptr, "0");
24738 SSize_t excess_leading_zeros
24739 = MIN(leading_zeros, excess_exponent_len);
24740 if (excess_leading_zeros > 0) {
24741 Move(cur_ptr + excess_leading_zeros,
24743 strlen(cur_ptr) - excess_leading_zeros
24744 + 1, /* Copy the NUL as well */
24751 else { /* Has a slash. Create a rational in canonical form */
24752 UV numerator, denominator, gcd, trial;
24753 const char * end_ptr;
24754 const char * sign = "";
24756 /* We can't just find the numerator, denominator, and do the
24757 * division, then use the method above, because that is
24758 * inexact. And the input could be a rational that is within
24759 * epsilon (given our precision) of a valid rational, and would
24760 * then incorrectly compare valid.
24762 * We're only interested in the part after the '=' */
24763 const char * this_lookup_name = lookup_name + equals_pos;
24764 lookup_len -= equals_pos;
24765 slash_pos -= equals_pos;
24767 /* Handle any leading minus */
24768 if (this_lookup_name[0] == '-') {
24770 this_lookup_name++;
24775 /* Convert the numerator to numeric */
24776 end_ptr = this_lookup_name + slash_pos;
24777 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24781 /* It better have included all characters before the slash */
24782 if (*end_ptr != '/') {
24786 /* Set to look at just the denominator */
24787 this_lookup_name += slash_pos;
24788 lookup_len -= slash_pos;
24789 end_ptr = this_lookup_name + lookup_len;
24791 /* Convert the denominator to numeric */
24792 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24796 /* It better be the rest of the characters, and don't divide by
24798 if ( end_ptr != this_lookup_name + lookup_len
24799 || denominator == 0)
24804 /* Get the greatest common denominator using
24805 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24807 trial = denominator;
24808 while (trial != 0) {
24810 trial = gcd % trial;
24814 /* If already in lowest possible terms, we have already tried
24815 * looking this up */
24820 /* Reduce the rational, which should put it in canonical form
24823 denominator /= gcd;
24825 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24826 equals_pos, lookup_name, sign, numerator, denominator);
24829 /* Here, we have the number in canonical form. Try that */
24830 table_index = do_uniprop_match(canonical, strlen(canonical));
24831 if (table_index == 0) {
24834 } /* End of still didn't find the property in our table */
24835 } /* End of didn't find the property in our table */
24837 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24838 * A negative return signifies that the real index is the absolute value,
24839 * but the result needs to be inverted */
24840 if (table_index < 0) {
24841 invert_return = TRUE;
24842 table_index = -table_index;
24845 /* Out-of band indices indicate a deprecated property. The proper index is
24846 * modulo it with the table size. And dividing by the table size yields
24847 * an offset into a table constructed by regen/mk_invlists.pl to contain
24848 * the corresponding warning message */
24849 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24850 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24851 table_index %= MAX_UNI_KEYWORD_INDEX;
24852 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24853 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24854 (int) name_len, name,
24855 get_deprecated_property_msg(warning_offset));
24858 /* In a few properties, a different property is used under /i. These are
24859 * unlikely to change, so are hard-coded here. */
24861 if ( table_index == UNI_XPOSIXUPPER
24862 || table_index == UNI_XPOSIXLOWER
24863 || table_index == UNI_TITLE)
24865 table_index = UNI_CASED;
24867 else if ( table_index == UNI_UPPERCASELETTER
24868 || table_index == UNI_LOWERCASELETTER
24869 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24870 || table_index == UNI_TITLECASELETTER
24873 table_index = UNI_CASEDLETTER;
24875 else if ( table_index == UNI_POSIXUPPER
24876 || table_index == UNI_POSIXLOWER)
24878 table_index = UNI_POSIXALPHA;
24882 /* Create and return the inversion list */
24883 prop_definition = get_prop_definition(table_index);
24884 sv_2mortal(prop_definition);
24886 /* See if there is a private use override to add to this definition */
24888 COPHH * hinthash = (IN_PERL_COMPILETIME)
24889 ? CopHINTHASH_get(&PL_compiling)
24890 : CopHINTHASH_get(PL_curcop);
24891 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24893 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24895 /* See if there is an element in the hints hash for this table */
24896 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24897 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24901 SV * pu_definition;
24903 SV * expanded_prop_definition =
24904 sv_2mortal(invlist_clone(prop_definition, NULL));
24906 /* If so, it's definition is the string from here to the next
24907 * \a character. And its format is the same as a user-defined
24909 pos += SvCUR(pu_lookup);
24910 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24911 pu_invlist = handle_user_defined_property(lookup_name,
24914 0, /* Not folded */
24922 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24923 sv_catpvs(msg, "Insecure private-use override");
24924 goto append_name_to_msg;
24927 /* For now, as a safety measure, make sure that it doesn't
24928 * override non-private use code points */
24929 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24931 /* Add it to the list to be returned */
24932 _invlist_union(prop_definition, pu_invlist,
24933 &expanded_prop_definition);
24934 prop_definition = expanded_prop_definition;
24935 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24940 if (invert_return) {
24941 _invlist_invert(prop_definition);
24943 return prop_definition;
24945 unknown_user_defined:
24946 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24947 sv_catpvs(msg, "Unknown user-defined property name");
24948 goto append_name_to_msg;
24951 if (non_pkg_begin != 0) {
24952 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24953 sv_catpvs(msg, "Illegal user-defined property name");
24956 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24957 sv_catpvs(msg, "Can't find Unicode property definition");
24961 append_name_to_msg:
24963 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24964 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24966 sv_catpv(msg, prefix);
24967 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24968 sv_catpv(msg, suffix);
24973 definition_deferred:
24976 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
24978 /* Here it could yet to be defined, so defer evaluation of this until
24979 * its needed at runtime. We need the fully qualified property name to
24980 * avoid ambiguity */
24982 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24986 /* If it didn't come with a package, or the package is utf8::, this
24987 * actually could be an official Unicode property whose inclusion we
24988 * are deferring until runtime to make sure that it isn't overridden by
24989 * a user-defined property of the same name (which we haven't
24990 * encountered yet). Add a marker to indicate this possibility, for
24991 * use at such time when we first need the definition during pattern
24992 * matching execution */
24993 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
24994 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
24997 /* We also need a trailing newline */
24998 sv_catpvs(fq_name, "\n");
25000 *user_defined_ptr = TRUE;
25006 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25007 const STRLEN wname_len, /* Its length */
25008 SV ** prop_definition,
25011 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25012 * any matches, adding them to prop_definition */
25016 CV * get_names_info; /* entry to charnames.pm to get info we need */
25017 SV * names_string; /* Contains all character names, except algo */
25018 SV * algorithmic_names; /* Contains info about algorithmically
25019 generated character names */
25020 REGEXP * subpattern_re; /* The user's pattern to match with */
25021 struct regexp * prog; /* The compiled pattern */
25022 char * all_names_start; /* lib/unicore/Name.pl string of every
25023 (non-algorithmic) character name */
25024 char * cur_pos; /* We match, effectively using /gc; this is
25025 where we are now */
25026 bool found_matches = FALSE; /* Did any name match so far? */
25027 SV * empty; /* For matching zero length names */
25028 SV * must_sv; /* Contains the substring, if any, that must be
25029 in a name for the subpattern to match */
25030 const char * must; /* The PV of 'must' */
25031 STRLEN must_len; /* And its length */
25032 SV * syllable_name = NULL; /* For Hangul syllables */
25033 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25034 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25036 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25037 * syllable name, and these are immutable and guaranteed by the Unicode
25038 * standard to never be extended */
25039 const STRLEN syl_max_len = hangul_prefix_len + 7;
25043 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25045 /* Make sure _charnames is loaded. (The parameters give context
25046 * for any errors generated */
25047 get_names_info = get_cv("_charnames::_get_names_info", 0);
25048 if (! get_names_info) {
25049 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25052 /* Get the charnames data */
25053 PUSHSTACKi(PERLSI_REGCOMP);
25061 /* Special _charnames entry point that returns the info this routine
25063 call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25067 /* Data structure for names which end in their very own code points */
25068 algorithmic_names = POPs;
25069 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25071 /* The lib/unicore/Name.pl string */
25072 names_string = POPs;
25073 SvREFCNT_inc_simple_void_NN(names_string);
25080 if ( ! SvROK(names_string)
25081 || ! SvROK(algorithmic_names))
25082 { /* Perhaps should panic instead XXX */
25083 SvREFCNT_dec(names_string);
25084 SvREFCNT_dec(algorithmic_names);
25088 names_string = sv_2mortal(SvRV(names_string));
25089 all_names_start = SvPVX(names_string);
25090 cur_pos = all_names_start;
25092 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25094 /* Compile the subpattern consisting of the name being looked for */
25095 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25097 must_sv = re_intuit_string(subpattern_re);
25099 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25100 must_sv = sv_2mortal(newSVsv(must_sv));
25101 must = SvPV(must_sv, must_len);
25108 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25109 * This works because the NUL causes the function to return early, thus
25110 * showing that there are characters in it other than the acceptable ones,
25111 * which is our desired result.) */
25113 prog = ReANY(subpattern_re);
25115 /* If only nothing is matched, skip to where empty names are looked for */
25116 if (prog->maxlen == 0) {
25120 /* And match against the string of all names /gc. Don't even try if it
25121 * must match a character not found in any name. */
25122 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25124 while (execute_wildcard(subpattern_re,
25126 SvEND(names_string),
25127 all_names_start, 0,
25130 { /* Here, matched. */
25132 /* Note the string entries look like
25133 * 00001\nSTART OF HEADING\n\n
25134 * so we could match anywhere in that string. We have to rule out
25135 * matching a code point line */
25136 char * this_name_start = all_names_start
25137 + RX_OFFS(subpattern_re)->start;
25138 char * this_name_end = all_names_start
25139 + RX_OFFS(subpattern_re)->end;
25142 UV cp = 0; /* Silences some compilers */
25143 AV * this_string = NULL;
25144 bool is_multi = FALSE;
25146 /* If matched nothing, advance to next possible match */
25147 if (this_name_start == this_name_end) {
25148 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25149 SvEND(names_string) - this_name_end);
25150 if (cur_pos == NULL) {
25155 /* Position the next match to start beyond the current returned
25157 cur_pos = (char *) memchr(this_name_end, '\n',
25158 SvEND(names_string) - this_name_end);
25161 /* Back up to the \n just before the beginning of the character. */
25162 cp_end = (char *) my_memrchr(all_names_start,
25164 this_name_start - all_names_start);
25166 /* If we didn't find a \n, it means it matched somewhere in the
25167 * initial '00000' in the string, so isn't a real match */
25168 if (cp_end == NULL) {
25172 this_name_start = cp_end + 1; /* The name starts just after */
25173 cp_end--; /* the \n, and the code point */
25174 /* ends just before it */
25176 /* All code points are 5 digits long */
25177 cp_start = cp_end - 4;
25179 /* This shouldn't happen, as we found a \n, and the first \n is
25180 * further along than what we subtracted */
25181 assert(cp_start >= all_names_start);
25183 if (cp_start == all_names_start) {
25184 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25188 /* If the character is a blank, we either have a named sequence, or
25189 * something is wrong */
25190 if (*(cp_start - 1) == ' ') {
25191 cp_start = (char *) my_memrchr(all_names_start,
25193 cp_start - all_names_start);
25197 assert(cp_start != NULL && cp_start >= all_names_start + 2);
25199 /* Except for the first line in the string, the sequence before the
25200 * code point is \n\n. If that isn't the case here, we didn't
25201 * match the name of a character. (We could have matched a named
25202 * sequence, not currently handled */
25203 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25207 /* We matched! Add this to the list */
25208 found_matches = TRUE;
25210 /* Loop through all the code points in the sequence */
25211 while (cp_start < cp_end) {
25213 /* Calculate this code point from its 5 digits */
25214 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25215 + (XDIGIT_VALUE(cp_start[1]) << 12)
25216 + (XDIGIT_VALUE(cp_start[2]) << 8)
25217 + (XDIGIT_VALUE(cp_start[3]) << 4)
25218 + XDIGIT_VALUE(cp_start[4]);
25220 cp_start += 6; /* Go past any blank */
25222 if (cp_start < cp_end || is_multi) {
25223 if (this_string == NULL) {
25224 this_string = newAV();
25228 av_push(this_string, newSVuv(cp));
25232 if (is_multi) { /* Was more than one code point */
25233 if (*strings == NULL) {
25234 *strings = newAV();
25237 av_push(*strings, (SV *) this_string);
25239 else { /* Only a single code point */
25240 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25242 } /* End of loop through the non-algorithmic names string */
25245 /* There are also character names not in 'names_string'. These are
25246 * algorithmically generatable. Try this pattern on each possible one.
25247 * (khw originally planned to leave this out given the large number of
25248 * matches attempted; but the speed turned out to be quite acceptable
25250 * There are plenty of opportunities to optimize to skip many of the tests.
25251 * beyond the rudimentary ones already here */
25253 /* First see if the subpattern matches any of the algorithmic generatable
25254 * Hangul syllable names.
25256 * We know none of these syllable names will match if the input pattern
25257 * requires more bytes than any syllable has, or if the input pattern only
25258 * matches an empty name, or if the pattern has something it must match and
25259 * one of the characters in that isn't in any Hangul syllable. */
25260 if ( prog->minlen <= (SSize_t) syl_max_len
25261 && prog->maxlen > 0
25262 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25264 /* These constants, names, values, and algorithm are adapted from the
25265 * Unicode standard, version 5.1, section 3.12, and should never
25267 const char * JamoL[] = {
25268 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25269 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25271 const int LCount = C_ARRAY_LENGTH(JamoL);
25273 const char * JamoV[] = {
25274 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25275 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25278 const int VCount = C_ARRAY_LENGTH(JamoV);
25280 const char * JamoT[] = {
25281 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25282 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25283 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25285 const int TCount = C_ARRAY_LENGTH(JamoT);
25289 /* This is the initial Hangul syllable code point; each time through the
25290 * inner loop, it maps to the next higher code point. For more info,
25291 * see the Hangul syllable section of the Unicode standard. */
25294 syllable_name = sv_2mortal(newSV(syl_max_len));
25295 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25297 for (L = 0; L < LCount; L++) {
25298 for (V = 0; V < VCount; V++) {
25299 for (T = 0; T < TCount; T++) {
25301 /* Truncate back to the prefix, which is unvarying */
25302 SvCUR_set(syllable_name, hangul_prefix_len);
25304 sv_catpv(syllable_name, JamoL[L]);
25305 sv_catpv(syllable_name, JamoV[V]);
25306 sv_catpv(syllable_name, JamoT[T]);
25308 if (execute_wildcard(subpattern_re,
25309 SvPVX(syllable_name),
25310 SvEND(syllable_name),
25311 SvPVX(syllable_name), 0,
25315 *prop_definition = add_cp_to_invlist(*prop_definition,
25317 found_matches = TRUE;
25326 /* The rest of the algorithmically generatable names are of the form
25327 * "PREFIX-code_point". The prefixes and the code point limits of each
25328 * were returned to us in the array 'algorithmic_names' from data in
25329 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
25330 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25333 /* Each element of the array is a hash, giving the details for the
25334 * series of names it covers. There is the base name of the characters
25335 * in the series, and the low and high code points in the series. And,
25336 * for optimization purposes a string containing all the legal
25337 * characters that could possibly be in a name in this series. */
25338 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25339 SV * prefix = * hv_fetchs(this_series, "name", 0);
25340 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25341 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25342 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25344 /* Pre-allocate an SV with enough space */
25345 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25347 if (high >= 0x10000) {
25348 sv_catpvs(algo_name, "0");
25351 /* This series can be skipped entirely if the pattern requires
25352 * something longer than any name in the series, or can only match an
25353 * empty name, or contains a character not found in any name in the
25355 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
25356 && prog->maxlen > 0
25357 && (strspn(must, legal) == must_len))
25359 for (j = low; j <= high; j++) { /* For each code point in the series */
25361 /* Get its name, and see if it matches the subpattern */
25362 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25365 if (execute_wildcard(subpattern_re,
25368 SvPVX(algo_name), 0,
25372 *prop_definition = add_cp_to_invlist(*prop_definition, j);
25373 found_matches = TRUE;
25380 /* Finally, see if the subpattern matches an empty string */
25381 empty = newSVpvs("");
25382 if (execute_wildcard(subpattern_re,
25389 /* Many code points have empty names. Currently these are the \p{GC=C}
25390 * ones, minus CC and CF */
25392 SV * empty_names_ref = get_prop_definition(UNI_C);
25393 SV * empty_names = invlist_clone(empty_names_ref, NULL);
25395 SV * subtract = get_prop_definition(UNI_CC);
25397 _invlist_subtract(empty_names, subtract, &empty_names);
25398 SvREFCNT_dec_NN(empty_names_ref);
25399 SvREFCNT_dec_NN(subtract);
25401 subtract = get_prop_definition(UNI_CF);
25402 _invlist_subtract(empty_names, subtract, &empty_names);
25403 SvREFCNT_dec_NN(subtract);
25405 _invlist_union(*prop_definition, empty_names, prop_definition);
25406 found_matches = TRUE;
25407 SvREFCNT_dec_NN(empty_names);
25409 SvREFCNT_dec_NN(empty);
25412 /* If we ever were to accept aliases for, say private use names, we would
25413 * need to do something fancier to find empty names. The code below works
25414 * (at the time it was written), and is slower than the above */
25415 const char empties_pat[] = "^.";
25416 if (strNE(name, empties_pat)) {
25417 SV * empty = newSVpvs("");
25418 if (execute_wildcard(subpattern_re,
25425 SV * empties = NULL;
25427 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25429 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25430 SvREFCNT_dec_NN(empties);
25432 found_matches = TRUE;
25434 SvREFCNT_dec_NN(empty);
25438 SvREFCNT_dec_NN(subpattern_re);
25439 return found_matches;
25443 * ex: set ts=8 sts=4 sw=4 et: