5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
154 #define STATIC static
157 /* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
161 typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
167 struct scan_frame *this_prev_frame; /* this previous frame */
168 struct scan_frame *prev_frame; /* previous frame */
169 struct scan_frame *next_frame; /* next frame */
172 /* Certain characters are output as a sequence with the first being a
174 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
177 struct RExC_state_t {
178 U32 flags; /* RXf_* are we folding, multilining? */
179 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
180 char *precomp; /* uncompiled string. */
181 char *precomp_end; /* pointer to end of uncompiled string. */
182 REGEXP *rx_sv; /* The SV that is the regexp. */
183 regexp *rx; /* perl core regexp structure */
184 regexp_internal *rxi; /* internal data for regexp object
186 char *start; /* Start of input for compile */
187 char *end; /* End of input for compile */
188 char *parse; /* Input-scan pointer. */
189 char *copy_start; /* start of copy of input within
190 constructed parse string */
191 char *save_copy_start; /* Provides one level of saving
192 and restoring 'copy_start' */
193 char *copy_start_in_input; /* Position in input string
194 corresponding to copy_start */
195 SSize_t whilem_seen; /* number of WHILEM in this expr */
196 regnode *emit_start; /* Start of emitted-code area */
197 regnode_offset emit; /* Code-emit pointer */
198 I32 naughty; /* How bad is this pattern? */
199 I32 sawback; /* Did we see \1, ...? */
200 SSize_t size; /* Number of regnode equivalents in
202 Size_t sets_depth; /* Counts recursion depth of already-
203 compiled regex set patterns */
206 I32 parens_buf_size; /* #slots malloced open/close_parens */
207 regnode_offset *open_parens; /* offsets to open parens */
208 regnode_offset *close_parens; /* offsets to close parens */
209 HV *paren_names; /* Paren names */
211 /* position beyond 'precomp' of the warning message furthest away from
212 * 'precomp'. During the parse, no warnings are raised for any problems
213 * earlier in the parse than this position. This works if warnings are
214 * raised the first time a given spot is parsed, and if only one
215 * independent warning is raised for any given spot */
216 Size_t latest_warn_offset;
218 I32 npar; /* Capture buffer count so far in the
219 parse, (OPEN) plus one. ("par" 0 is
221 I32 total_par; /* During initial parse, is either 0,
222 or -1; the latter indicating a
223 reparse is needed. After that pass,
224 it is what 'npar' became after the
225 pass. Hence, it being > 0 indicates
226 we are in a reparse situation */
227 I32 nestroot; /* root parens we are in - used by
230 regnode *end_op; /* END node in program */
231 I32 utf8; /* whether the pattern is utf8 or not */
232 I32 orig_utf8; /* whether the pattern was originally in utf8 */
233 /* XXX use this for future optimisation of case
234 * where pattern must be upgraded to utf8. */
235 I32 uni_semantics; /* If a d charset modifier should use unicode
236 rules, even if the pattern is not in
239 I32 recurse_count; /* Number of recurse regops we have generated */
240 regnode **recurse; /* Recurse regops */
241 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
243 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
247 I32 override_recoding;
248 I32 recode_x_to_native;
249 I32 in_multi_char_class;
250 int code_index; /* next code_blocks[] slot */
251 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
253 SSize_t maxlen; /* mininum possible number of chars in string to match */
254 scan_frame *frame_head;
255 scan_frame *frame_last;
259 SV *runtime_code_qr; /* qr with the runtime code blocks */
261 const char *lastparse;
263 U32 study_chunk_recursed_count;
264 AV *paren_name_list; /* idx -> name */
268 #define RExC_lastparse (pRExC_state->lastparse)
269 #define RExC_lastnum (pRExC_state->lastnum)
270 #define RExC_paren_name_list (pRExC_state->paren_name_list)
271 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
272 #define RExC_mysv (pRExC_state->mysv1)
273 #define RExC_mysv1 (pRExC_state->mysv1)
274 #define RExC_mysv2 (pRExC_state->mysv2)
282 bool sWARN_EXPERIMENTAL__VLB;
283 bool sWARN_EXPERIMENTAL__REGEX_SETS;
286 #define RExC_flags (pRExC_state->flags)
287 #define RExC_pm_flags (pRExC_state->pm_flags)
288 #define RExC_precomp (pRExC_state->precomp)
289 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
290 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
291 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
292 #define RExC_precomp_end (pRExC_state->precomp_end)
293 #define RExC_rx_sv (pRExC_state->rx_sv)
294 #define RExC_rx (pRExC_state->rx)
295 #define RExC_rxi (pRExC_state->rxi)
296 #define RExC_start (pRExC_state->start)
297 #define RExC_end (pRExC_state->end)
298 #define RExC_parse (pRExC_state->parse)
299 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
300 #define RExC_whilem_seen (pRExC_state->whilem_seen)
301 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
302 under /d from /u ? */
304 #ifdef RE_TRACK_PATTERN_OFFSETS
305 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
308 #define RExC_emit (pRExC_state->emit)
309 #define RExC_emit_start (pRExC_state->emit_start)
310 #define RExC_sawback (pRExC_state->sawback)
311 #define RExC_seen (pRExC_state->seen)
312 #define RExC_size (pRExC_state->size)
313 #define RExC_maxlen (pRExC_state->maxlen)
314 #define RExC_npar (pRExC_state->npar)
315 #define RExC_total_parens (pRExC_state->total_par)
316 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
317 #define RExC_nestroot (pRExC_state->nestroot)
318 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
319 #define RExC_utf8 (pRExC_state->utf8)
320 #define RExC_uni_semantics (pRExC_state->uni_semantics)
321 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
322 #define RExC_open_parens (pRExC_state->open_parens)
323 #define RExC_close_parens (pRExC_state->close_parens)
324 #define RExC_end_op (pRExC_state->end_op)
325 #define RExC_paren_names (pRExC_state->paren_names)
326 #define RExC_recurse (pRExC_state->recurse)
327 #define RExC_recurse_count (pRExC_state->recurse_count)
328 #define RExC_sets_depth (pRExC_state->sets_depth)
329 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
330 #define RExC_study_chunk_recursed_bytes \
331 (pRExC_state->study_chunk_recursed_bytes)
332 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
333 #define RExC_in_lookahead (pRExC_state->in_lookahead)
334 #define RExC_contains_locale (pRExC_state->contains_locale)
335 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
338 # define SET_recode_x_to_native(x) \
339 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
341 # define SET_recode_x_to_native(x) NOOP
344 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
345 #define RExC_frame_head (pRExC_state->frame_head)
346 #define RExC_frame_last (pRExC_state->frame_last)
347 #define RExC_frame_count (pRExC_state->frame_count)
348 #define RExC_strict (pRExC_state->strict)
349 #define RExC_study_started (pRExC_state->study_started)
350 #define RExC_warn_text (pRExC_state->warn_text)
351 #define RExC_in_script_run (pRExC_state->in_script_run)
352 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
353 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
354 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
355 #define RExC_unlexed_names (pRExC_state->unlexed_names)
357 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
358 * a flag to disable back-off on the fixed/floating substrings - if it's
359 * a high complexity pattern we assume the benefit of avoiding a full match
360 * is worth the cost of checking for the substrings even if they rarely help.
362 #define RExC_naughty (pRExC_state->naughty)
363 #define TOO_NAUGHTY (10)
364 #define MARK_NAUGHTY(add) \
365 if (RExC_naughty < TOO_NAUGHTY) \
366 RExC_naughty += (add)
367 #define MARK_NAUGHTY_EXP(exp, add) \
368 if (RExC_naughty < TOO_NAUGHTY) \
369 RExC_naughty += RExC_naughty / (exp) + (add)
371 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
372 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
373 ((*s) == '{' && regcurly(s)))
376 * Flags to be passed up and down.
378 #define WORST 0 /* Worst case. */
379 #define HASWIDTH 0x01 /* Known to not match null strings, could match
382 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
383 * character. (There needs to be a case: in the switch statement in regexec.c
384 * for any node marked SIMPLE.) Note that this is not the same thing as
387 #define SPSTART 0x04 /* Starts with * or + */
388 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
389 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
390 #define RESTART_PARSE 0x20 /* Need to redo the parse */
391 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
392 calcuate sizes as UTF-8 */
394 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
396 /* whether trie related optimizations are enabled */
397 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
398 #define TRIE_STUDY_OPT
399 #define FULL_TRIE_STUDY
405 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
406 #define PBITVAL(paren) (1 << ((paren) & 7))
407 #define PAREN_OFFSET(depth) \
408 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
409 #define PAREN_TEST(depth, paren) \
410 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
411 #define PAREN_SET(depth, paren) \
412 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
413 #define PAREN_UNSET(depth, paren) \
414 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
416 #define REQUIRE_UTF8(flagp) STMT_START { \
418 *flagp = RESTART_PARSE|NEED_UTF8; \
423 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
424 * a flag that indicates we need to override /d with /u as a result of
425 * something in the pattern. It should only be used in regards to calling
426 * set_regex_charset() or get_regex_charset() */
427 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
429 if (DEPENDS_SEMANTICS) { \
430 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
431 RExC_uni_semantics = 1; \
432 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
433 /* No need to restart the parse if we haven't seen \
434 * anything that differs between /u and /d, and no need \
435 * to restart immediately if we're going to reparse \
436 * anyway to count parens */ \
437 *flagp |= RESTART_PARSE; \
438 return restart_retval; \
443 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
445 RExC_use_BRANCHJ = 1; \
446 *flagp |= RESTART_PARSE; \
447 return restart_retval; \
450 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
451 * less. After that, it must always be positive, because the whole re is
452 * considered to be surrounded by virtual parens. Setting it to negative
453 * indicates there is some construct that needs to know the actual number of
454 * parens to be properly handled. And that means an extra pass will be
455 * required after we've counted them all */
456 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
457 #define REQUIRE_PARENS_PASS \
458 STMT_START { /* No-op if have completed a pass */ \
459 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
461 #define IN_PARENS_PASS (RExC_total_parens < 0)
464 /* This is used to return failure (zero) early from the calling function if
465 * various flags in 'flags' are set. Two flags always cause a return:
466 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
467 * additional flags that should cause a return; 0 if none. If the return will
468 * be done, '*flagp' is first set to be all of the flags that caused the
470 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
472 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
473 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
478 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
480 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
481 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
482 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
483 if (MUST_RESTART(*(flagp))) return 0
485 /* This converts the named class defined in regcomp.h to its equivalent class
486 * number defined in handy.h. */
487 #define namedclass_to_classnum(class) ((int) ((class) / 2))
488 #define classnum_to_namedclass(classnum) ((classnum) * 2)
490 #define _invlist_union_complement_2nd(a, b, output) \
491 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
492 #define _invlist_intersection_complement_2nd(a, b, output) \
493 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
495 /* We add a marker if we are deferring expansion of a property that is both
496 * 1) potentiallly user-defined; and
497 * 2) could also be an official Unicode property.
499 * Without this marker, any deferred expansion can only be for a user-defined
500 * one. This marker shouldn't conflict with any that could be in a legal name,
501 * and is appended to its name to indicate this. There is a string and
503 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
504 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
506 /* What is infinity for optimization purposes */
507 #define OPTIMIZE_INFTY SSize_t_MAX
509 /* About scan_data_t.
511 During optimisation we recurse through the regexp program performing
512 various inplace (keyhole style) optimisations. In addition study_chunk
513 and scan_commit populate this data structure with information about
514 what strings MUST appear in the pattern. We look for the longest
515 string that must appear at a fixed location, and we look for the
516 longest string that may appear at a floating location. So for instance
521 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
522 strings (because they follow a .* construct). study_chunk will identify
523 both FOO and BAR as being the longest fixed and floating strings respectively.
525 The strings can be composites, for instance
529 will result in a composite fixed substring 'foo'.
531 For each string some basic information is maintained:
534 This is the position the string must appear at, or not before.
535 It also implicitly (when combined with minlenp) tells us how many
536 characters must match before the string we are searching for.
537 Likewise when combined with minlenp and the length of the string it
538 tells us how many characters must appear after the string we have
542 Only used for floating strings. This is the rightmost point that
543 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
544 string can occur infinitely far to the right.
545 For fixed strings, it is equal to min_offset.
548 A pointer to the minimum number of characters of the pattern that the
549 string was found inside. This is important as in the case of positive
550 lookahead or positive lookbehind we can have multiple patterns
555 The minimum length of the pattern overall is 3, the minimum length
556 of the lookahead part is 3, but the minimum length of the part that
557 will actually match is 1. So 'FOO's minimum length is 3, but the
558 minimum length for the F is 1. This is important as the minimum length
559 is used to determine offsets in front of and behind the string being
560 looked for. Since strings can be composites this is the length of the
561 pattern at the time it was committed with a scan_commit. Note that
562 the length is calculated by study_chunk, so that the minimum lengths
563 are not known until the full pattern has been compiled, thus the
564 pointer to the value.
568 In the case of lookbehind the string being searched for can be
569 offset past the start point of the final matching string.
570 If this value was just blithely removed from the min_offset it would
571 invalidate some of the calculations for how many chars must match
572 before or after (as they are derived from min_offset and minlen and
573 the length of the string being searched for).
574 When the final pattern is compiled and the data is moved from the
575 scan_data_t structure into the regexp structure the information
576 about lookbehind is factored in, with the information that would
577 have been lost precalculated in the end_shift field for the
580 The fields pos_min and pos_delta are used to store the minimum offset
581 and the delta to the maximum offset at the current point in the pattern.
585 struct scan_data_substrs {
586 SV *str; /* longest substring found in pattern */
587 SSize_t min_offset; /* earliest point in string it can appear */
588 SSize_t max_offset; /* latest point in string it can appear */
589 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
590 SSize_t lookbehind; /* is the pos of the string modified by LB */
591 I32 flags; /* per substring SF_* and SCF_* flags */
594 typedef struct scan_data_t {
595 /*I32 len_min; unused */
596 /*I32 len_delta; unused */
600 SSize_t last_end; /* min value, <0 unless valid. */
601 SSize_t last_start_min;
602 SSize_t last_start_max;
603 U8 cur_is_floating; /* whether the last_* values should be set as
604 * the next fixed (0) or floating (1)
607 /* [0] is longest fixed substring so far, [1] is longest float so far */
608 struct scan_data_substrs substrs[2];
610 I32 flags; /* common SF_* and SCF_* flags */
612 SSize_t *last_closep;
613 regnode_ssc *start_class;
617 * Forward declarations for pregcomp()'s friends.
620 static const scan_data_t zero_scan_data = {
621 0, 0, NULL, 0, 0, 0, 0,
623 { NULL, 0, 0, 0, 0, 0 },
624 { NULL, 0, 0, 0, 0, 0 },
631 #define SF_BEFORE_SEOL 0x0001
632 #define SF_BEFORE_MEOL 0x0002
633 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
635 #define SF_IS_INF 0x0040
636 #define SF_HAS_PAR 0x0080
637 #define SF_IN_PAR 0x0100
638 #define SF_HAS_EVAL 0x0200
641 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
642 * longest substring in the pattern. When it is not set the optimiser keeps
643 * track of position, but does not keep track of the actual strings seen,
645 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
648 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
649 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
650 * turned off because of the alternation (BRANCH). */
651 #define SCF_DO_SUBSTR 0x0400
653 #define SCF_DO_STCLASS_AND 0x0800
654 #define SCF_DO_STCLASS_OR 0x1000
655 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
656 #define SCF_WHILEM_VISITED_POS 0x2000
658 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
659 #define SCF_SEEN_ACCEPT 0x8000
660 #define SCF_TRIE_DOING_RESTUDY 0x10000
661 #define SCF_IN_DEFINE 0x20000
666 #define UTF cBOOL(RExC_utf8)
668 /* The enums for all these are ordered so things work out correctly */
669 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
670 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
671 == REGEX_DEPENDS_CHARSET)
672 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
673 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
674 >= REGEX_UNICODE_CHARSET)
675 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
676 == REGEX_ASCII_RESTRICTED_CHARSET)
677 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
678 >= REGEX_ASCII_RESTRICTED_CHARSET)
679 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
680 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
682 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
684 /* For programs that want to be strictly Unicode compatible by dying if any
685 * attempt is made to match a non-Unicode code point against a Unicode
687 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
689 #define OOB_NAMEDCLASS -1
691 /* There is no code point that is out-of-bounds, so this is problematic. But
692 * its only current use is to initialize a variable that is always set before
694 #define OOB_UNICODE 0xDEADBEEF
696 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
699 /* length of regex to show in messages that don't mark a position within */
700 #define RegexLengthToShowInErrorMessages 127
703 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
704 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
705 * op/pragma/warn/regcomp.
707 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
708 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
710 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
711 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
713 /* The code in this file in places uses one level of recursion with parsing
714 * rebased to an alternate string constructed by us in memory. This can take
715 * the form of something that is completely different from the input, or
716 * something that uses the input as part of the alternate. In the first case,
717 * there should be no possibility of an error, as we are in complete control of
718 * the alternate string. But in the second case we don't completely control
719 * the input portion, so there may be errors in that. Here's an example:
721 * is handled specially because \x{df} folds to a sequence of more than one
722 * character: 'ss'. What is done is to create and parse an alternate string,
723 * which looks like this:
724 * /(?:\x{DF}|[abc\x{DF}def])/ui
725 * where it uses the input unchanged in the middle of something it constructs,
726 * which is a branch for the DF outside the character class, and clustering
727 * parens around the whole thing. (It knows enough to skip the DF inside the
728 * class while in this substitute parse.) 'abc' and 'def' may have errors that
729 * need to be reported. The general situation looks like this:
731 * |<------- identical ------>|
733 * Input: ---------------------------------------------------------------
734 * Constructed: ---------------------------------------------------
736 * |<------- identical ------>|
738 * sI..eI is the portion of the input pattern we are concerned with here.
739 * sC..EC is the constructed substitute parse string.
740 * sC..tC is constructed by us
741 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
742 * In the diagram, these are vertically aligned.
743 * eC..EC is also constructed by us.
744 * xC is the position in the substitute parse string where we found a
746 * xI is the position in the original pattern corresponding to xC.
748 * We want to display a message showing the real input string. Thus we need to
749 * translate from xC to xI. We know that xC >= tC, since the portion of the
750 * string sC..tC has been constructed by us, and so shouldn't have errors. We
752 * xI = tI + (xC - tC)
754 * When the substitute parse is constructed, the code needs to set:
757 * RExC_copy_start_in_input (tI)
758 * RExC_copy_start_in_constructed (tC)
759 * and restore them when done.
761 * During normal processing of the input pattern, both
762 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
763 * sI, so that xC equals xI.
766 #define sI RExC_precomp
767 #define eI RExC_precomp_end
768 #define sC RExC_start
770 #define tI RExC_copy_start_in_input
771 #define tC RExC_copy_start_in_constructed
772 #define xI(xC) (tI + (xC - tC))
773 #define xI_offset(xC) (xI(xC) - sI)
775 #define REPORT_LOCATION_ARGS(xC) \
777 (xI(xC) > eI) /* Don't run off end */ \
778 ? eI - sI /* Length before the <--HERE */ \
779 : ((xI_offset(xC) >= 0) \
781 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
782 IVdf " trying to output message for " \
784 __FILE__, __LINE__, (IV) xI_offset(xC), \
785 ((int) (eC - sC)), sC), 0)), \
786 sI), /* The input pattern printed up to the <--HERE */ \
788 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
789 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
791 /* Used to point after bad bytes for an error message, but avoid skipping
792 * past a nul byte. */
793 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
795 /* Set up to clean up after our imminent demise */
796 #define PREPARE_TO_DIE \
799 SAVEFREESV(RExC_rx_sv); \
800 if (RExC_open_parens) \
801 SAVEFREEPV(RExC_open_parens); \
802 if (RExC_close_parens) \
803 SAVEFREEPV(RExC_close_parens); \
807 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
808 * arg. Show regex, up to a maximum length. If it's too long, chop and add
811 #define _FAIL(code) STMT_START { \
812 const char *ellipses = ""; \
813 IV len = RExC_precomp_end - RExC_precomp; \
816 if (len > RegexLengthToShowInErrorMessages) { \
817 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
818 len = RegexLengthToShowInErrorMessages - 10; \
824 #define FAIL(msg) _FAIL( \
825 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
826 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
828 #define FAIL2(msg,arg) _FAIL( \
829 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
830 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
832 #define FAIL3(msg,arg1,arg2) _FAIL( \
833 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
834 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
837 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
839 #define Simple_vFAIL(m) STMT_START { \
840 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
841 m, REPORT_LOCATION_ARGS(RExC_parse)); \
845 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
847 #define vFAIL(m) STMT_START { \
853 * Like Simple_vFAIL(), but accepts two arguments.
855 #define Simple_vFAIL2(m,a1) STMT_START { \
856 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
857 REPORT_LOCATION_ARGS(RExC_parse)); \
861 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
863 #define vFAIL2(m,a1) STMT_START { \
865 Simple_vFAIL2(m, a1); \
870 * Like Simple_vFAIL(), but accepts three arguments.
872 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
873 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
874 REPORT_LOCATION_ARGS(RExC_parse)); \
878 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
880 #define vFAIL3(m,a1,a2) STMT_START { \
882 Simple_vFAIL3(m, a1, a2); \
886 * Like Simple_vFAIL(), but accepts four arguments.
888 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
889 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
890 REPORT_LOCATION_ARGS(RExC_parse)); \
893 #define vFAIL4(m,a1,a2,a3) STMT_START { \
895 Simple_vFAIL4(m, a1, a2, a3); \
898 /* A specialized version of vFAIL2 that works with UTF8f */
899 #define vFAIL2utf8f(m, a1) STMT_START { \
901 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
902 REPORT_LOCATION_ARGS(RExC_parse)); \
905 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
907 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
908 REPORT_LOCATION_ARGS(RExC_parse)); \
911 /* Setting this to NULL is a signal to not output warnings */
912 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
914 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
915 RExC_copy_start_in_constructed = NULL; \
917 #define RESTORE_WARNINGS \
918 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
920 /* Since a warning can be generated multiple times as the input is reparsed, we
921 * output it the first time we come to that point in the parse, but suppress it
922 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
923 * generate any warnings */
924 #define TO_OUTPUT_WARNINGS(loc) \
925 ( RExC_copy_start_in_constructed \
926 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
928 /* After we've emitted a warning, we save the position in the input so we don't
930 #define UPDATE_WARNINGS_LOC(loc) \
932 if (TO_OUTPUT_WARNINGS(loc)) { \
933 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
938 /* 'warns' is the output of the packWARNx macro used in 'code' */
939 #define _WARN_HELPER(loc, warns, code) \
941 if (! RExC_copy_start_in_constructed) { \
942 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
943 " expected at '%s'", \
944 __FILE__, __LINE__, loc); \
946 if (TO_OUTPUT_WARNINGS(loc)) { \
950 UPDATE_WARNINGS_LOC(loc); \
954 /* m is not necessarily a "literal string", in this macro */
955 #define warn_non_literal_string(loc, packed_warn, m) \
956 _WARN_HELPER(loc, packed_warn, \
957 Perl_warner(aTHX_ packed_warn, \
958 "%s" REPORT_LOCATION, \
959 m, REPORT_LOCATION_ARGS(loc)))
960 #define reg_warn_non_literal_string(loc, m) \
961 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
963 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
966 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
967 Newx(format, format_size, char); \
968 my_strlcpy(format, m, format_size); \
969 my_strlcat(format, REPORT_LOCATION, format_size); \
970 SAVEFREEPV(format); \
971 _WARN_HELPER(loc, packwarn, \
972 Perl_ck_warner(aTHX_ packwarn, \
974 a1, REPORT_LOCATION_ARGS(loc))); \
977 #define ckWARNreg(loc,m) \
978 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
979 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
981 REPORT_LOCATION_ARGS(loc)))
983 #define vWARN(loc, m) \
984 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
985 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
987 REPORT_LOCATION_ARGS(loc))) \
989 #define vWARN_dep(loc, m) \
990 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
991 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
993 REPORT_LOCATION_ARGS(loc)))
995 #define ckWARNdep(loc,m) \
996 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
997 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
999 REPORT_LOCATION_ARGS(loc)))
1001 #define ckWARNregdep(loc,m) \
1002 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1003 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1005 m REPORT_LOCATION, \
1006 REPORT_LOCATION_ARGS(loc)))
1008 #define ckWARN2reg_d(loc,m, a1) \
1009 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1010 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1011 m REPORT_LOCATION, \
1012 a1, REPORT_LOCATION_ARGS(loc)))
1014 #define ckWARN2reg(loc, m, a1) \
1015 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1016 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1017 m REPORT_LOCATION, \
1018 a1, REPORT_LOCATION_ARGS(loc)))
1020 #define vWARN3(loc, m, a1, a2) \
1021 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1022 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1023 m REPORT_LOCATION, \
1024 a1, a2, REPORT_LOCATION_ARGS(loc)))
1026 #define ckWARN3reg(loc, m, a1, a2) \
1027 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1028 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1029 m REPORT_LOCATION, \
1031 REPORT_LOCATION_ARGS(loc)))
1033 #define vWARN4(loc, m, a1, a2, a3) \
1034 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1035 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1036 m REPORT_LOCATION, \
1038 REPORT_LOCATION_ARGS(loc)))
1040 #define ckWARN4reg(loc, m, a1, a2, a3) \
1041 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1042 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1043 m REPORT_LOCATION, \
1045 REPORT_LOCATION_ARGS(loc)))
1047 #define vWARN5(loc, m, a1, a2, a3, a4) \
1048 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1049 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1050 m REPORT_LOCATION, \
1052 REPORT_LOCATION_ARGS(loc)))
1054 #define ckWARNexperimental(loc, class, m) \
1056 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1057 RExC_warned_ ## class = 1; \
1058 _WARN_HELPER(loc, packWARN(class), \
1059 Perl_ck_warner_d(aTHX_ packWARN(class), \
1060 m REPORT_LOCATION, \
1061 REPORT_LOCATION_ARGS(loc)));\
1065 /* Convert between a pointer to a node and its offset from the beginning of the
1067 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1068 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1070 /* Macros for recording node offsets. 20001227 mjd@plover.com
1071 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1072 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1073 * Element 0 holds the number n.
1074 * Position is 1 indexed.
1076 #ifndef RE_TRACK_PATTERN_OFFSETS
1077 #define Set_Node_Offset_To_R(offset,byte)
1078 #define Set_Node_Offset(node,byte)
1079 #define Set_Cur_Node_Offset
1080 #define Set_Node_Length_To_R(node,len)
1081 #define Set_Node_Length(node,len)
1082 #define Set_Node_Cur_Length(node,start)
1083 #define Node_Offset(n)
1084 #define Node_Length(n)
1085 #define Set_Node_Offset_Length(node,offset,len)
1086 #define ProgLen(ri) ri->u.proglen
1087 #define SetProgLen(ri,x) ri->u.proglen = x
1088 #define Track_Code(code)
1090 #define ProgLen(ri) ri->u.offsets[0]
1091 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1092 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
1093 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
1094 __LINE__, (int)(offset), (int)(byte))); \
1095 if((offset) < 0) { \
1096 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
1099 RExC_offsets[2*(offset)-1] = (byte); \
1103 #define Set_Node_Offset(node,byte) \
1104 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1105 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1107 #define Set_Node_Length_To_R(node,len) STMT_START { \
1108 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1109 __LINE__, (int)(node), (int)(len))); \
1111 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1114 RExC_offsets[2*(node)] = (len); \
1118 #define Set_Node_Length(node,len) \
1119 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1120 #define Set_Node_Cur_Length(node, start) \
1121 Set_Node_Length(node, RExC_parse - start)
1123 /* Get offsets and lengths */
1124 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1125 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1127 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1128 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1129 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1132 #define Track_Code(code) STMT_START { code } STMT_END
1135 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1136 #define EXPERIMENTAL_INPLACESCAN
1137 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1141 Perl_re_printf(pTHX_ const char *fmt, ...)
1145 PerlIO *f= Perl_debug_log;
1146 PERL_ARGS_ASSERT_RE_PRINTF;
1148 result = PerlIO_vprintf(f, fmt, ap);
1154 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1158 PerlIO *f= Perl_debug_log;
1159 PERL_ARGS_ASSERT_RE_INDENTF;
1160 va_start(ap, depth);
1161 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1162 result = PerlIO_vprintf(f, fmt, ap);
1166 #endif /* DEBUGGING */
1168 #define DEBUG_RExC_seen() \
1169 DEBUG_OPTIMISE_MORE_r({ \
1170 Perl_re_printf( aTHX_ "RExC_seen: "); \
1172 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1173 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1175 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1176 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1178 if (RExC_seen & REG_GPOS_SEEN) \
1179 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1181 if (RExC_seen & REG_RECURSE_SEEN) \
1182 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1184 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1185 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1187 if (RExC_seen & REG_VERBARG_SEEN) \
1188 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1190 if (RExC_seen & REG_CUTGROUP_SEEN) \
1191 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1193 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1194 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1196 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1197 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1199 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1200 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1202 Perl_re_printf( aTHX_ "\n"); \
1205 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1206 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1211 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1212 const char *close_str)
1217 Perl_re_printf( aTHX_ "%s", open_str);
1218 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1219 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1221 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1227 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1228 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1229 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1230 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1231 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1232 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1233 Perl_re_printf( aTHX_ "%s", close_str);
1238 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1239 U32 depth, int is_inf)
1241 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1243 DEBUG_OPTIMISE_MORE_r({
1246 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1250 (IV)data->pos_delta,
1254 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1256 Perl_re_printf( aTHX_
1257 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1259 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1260 is_inf ? "INF " : ""
1263 if (data->last_found) {
1265 Perl_re_printf(aTHX_
1266 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1267 SvPVX_const(data->last_found),
1269 (IV)data->last_start_min,
1270 (IV)data->last_start_max
1273 for (i = 0; i < 2; i++) {
1274 Perl_re_printf(aTHX_
1275 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1276 data->cur_is_floating == i ? "*" : "",
1277 i ? "Float" : "Fixed",
1278 SvPVX_const(data->substrs[i].str),
1279 (IV)data->substrs[i].min_offset,
1280 (IV)data->substrs[i].max_offset
1282 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1286 Perl_re_printf( aTHX_ "\n");
1292 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1293 regnode *scan, U32 depth, U32 flags)
1295 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1302 Next = regnext(scan);
1303 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1304 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1307 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1308 Next ? (REG_NODE_NUM(Next)) : 0 );
1309 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1310 Perl_re_printf( aTHX_ "\n");
1315 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1316 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1318 # define DEBUG_PEEP(str, scan, depth, flags) \
1319 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1322 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1323 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1327 /* =========================================================
1328 * BEGIN edit_distance stuff.
1330 * This calculates how many single character changes of any type are needed to
1331 * transform a string into another one. It is taken from version 3.1 of
1333 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1336 /* Our unsorted dictionary linked list. */
1337 /* Note we use UVs, not chars. */
1342 struct dictionary* next;
1344 typedef struct dictionary item;
1347 PERL_STATIC_INLINE item*
1348 push(UV key, item* curr)
1351 Newx(head, 1, item);
1359 PERL_STATIC_INLINE item*
1360 find(item* head, UV key)
1362 item* iterator = head;
1364 if (iterator->key == key){
1367 iterator = iterator->next;
1373 PERL_STATIC_INLINE item*
1374 uniquePush(item* head, UV key)
1376 item* iterator = head;
1379 if (iterator->key == key) {
1382 iterator = iterator->next;
1385 return push(key, head);
1388 PERL_STATIC_INLINE void
1389 dict_free(item* head)
1391 item* iterator = head;
1394 item* temp = iterator;
1395 iterator = iterator->next;
1402 /* End of Dictionary Stuff */
1404 /* All calculations/work are done here */
1406 S_edit_distance(const UV* src,
1408 const STRLEN x, /* length of src[] */
1409 const STRLEN y, /* length of tgt[] */
1410 const SSize_t maxDistance
1414 UV swapCount, swapScore, targetCharCount, i, j;
1416 UV score_ceil = x + y;
1418 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1420 /* intialize matrix start values */
1421 Newx(scores, ( (x + 2) * (y + 2)), UV);
1422 scores[0] = score_ceil;
1423 scores[1 * (y + 2) + 0] = score_ceil;
1424 scores[0 * (y + 2) + 1] = score_ceil;
1425 scores[1 * (y + 2) + 1] = 0;
1426 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1431 for (i=1;i<=x;i++) {
1433 head = uniquePush(head, src[i]);
1434 scores[(i+1) * (y + 2) + 1] = i;
1435 scores[(i+1) * (y + 2) + 0] = score_ceil;
1438 for (j=1;j<=y;j++) {
1441 head = uniquePush(head, tgt[j]);
1442 scores[1 * (y + 2) + (j + 1)] = j;
1443 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1446 targetCharCount = find(head, tgt[j-1])->value;
1447 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1449 if (src[i-1] != tgt[j-1]){
1450 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1454 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1458 find(head, src[i-1])->value = i;
1462 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1465 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1469 /* END of edit_distance() stuff
1470 * ========================================================= */
1472 /* Mark that we cannot extend a found fixed substring at this point.
1473 Update the longest found anchored substring or the longest found
1474 floating substrings if needed. */
1477 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1478 SSize_t *minlenp, int is_inf)
1480 const STRLEN l = CHR_SVLEN(data->last_found);
1481 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1482 const STRLEN old_l = CHR_SVLEN(longest_sv);
1483 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1485 PERL_ARGS_ASSERT_SCAN_COMMIT;
1487 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1488 const U8 i = data->cur_is_floating;
1489 SvSetMagicSV(longest_sv, data->last_found);
1490 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1493 data->substrs[0].max_offset = data->substrs[0].min_offset;
1495 data->substrs[1].max_offset =
1499 ? data->last_start_max
1500 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1502 : data->pos_min + data->pos_delta));
1505 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1506 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1507 data->substrs[i].minlenp = minlenp;
1508 data->substrs[i].lookbehind = 0;
1511 SvCUR_set(data->last_found, 0);
1513 SV * const sv = data->last_found;
1514 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1515 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1520 data->last_end = -1;
1521 data->flags &= ~SF_BEFORE_EOL;
1522 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1525 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1526 * list that describes which code points it matches */
1529 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1531 /* Set the SSC 'ssc' to match an empty string or any code point */
1533 PERL_ARGS_ASSERT_SSC_ANYTHING;
1535 assert(is_ANYOF_SYNTHETIC(ssc));
1537 /* mortalize so won't leak */
1538 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1539 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1543 S_ssc_is_anything(const regnode_ssc *ssc)
1545 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1546 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1547 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1548 * in any way, so there's no point in using it */
1553 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1555 assert(is_ANYOF_SYNTHETIC(ssc));
1557 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1561 /* See if the list consists solely of the range 0 - Infinity */
1562 invlist_iterinit(ssc->invlist);
1563 ret = invlist_iternext(ssc->invlist, &start, &end)
1567 invlist_iterfinish(ssc->invlist);
1573 /* If e.g., both \w and \W are set, matches everything */
1574 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1576 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1577 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1587 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1589 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1590 * string, any code point, or any posix class under locale */
1592 PERL_ARGS_ASSERT_SSC_INIT;
1594 Zero(ssc, 1, regnode_ssc);
1595 set_ANYOF_SYNTHETIC(ssc);
1596 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1599 /* If any portion of the regex is to operate under locale rules that aren't
1600 * fully known at compile time, initialization includes it. The reason
1601 * this isn't done for all regexes is that the optimizer was written under
1602 * the assumption that locale was all-or-nothing. Given the complexity and
1603 * lack of documentation in the optimizer, and that there are inadequate
1604 * test cases for locale, many parts of it may not work properly, it is
1605 * safest to avoid locale unless necessary. */
1606 if (RExC_contains_locale) {
1607 ANYOF_POSIXL_SETALL(ssc);
1610 ANYOF_POSIXL_ZERO(ssc);
1615 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1616 const regnode_ssc *ssc)
1618 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1619 * to the list of code points matched, and locale posix classes; hence does
1620 * not check its flags) */
1625 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1627 assert(is_ANYOF_SYNTHETIC(ssc));
1629 invlist_iterinit(ssc->invlist);
1630 ret = invlist_iternext(ssc->invlist, &start, &end)
1634 invlist_iterfinish(ssc->invlist);
1640 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1647 #define INVLIST_INDEX 0
1648 #define ONLY_LOCALE_MATCHES_INDEX 1
1649 #define DEFERRED_USER_DEFINED_INDEX 2
1652 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1653 const regnode_charclass* const node)
1655 /* Returns a mortal inversion list defining which code points are matched
1656 * by 'node', which is of type ANYOF. Handles complementing the result if
1657 * appropriate. If some code points aren't knowable at this time, the
1658 * returned list must, and will, contain every code point that is a
1663 SV* only_utf8_locale_invlist = NULL;
1665 const U32 n = ARG(node);
1666 bool new_node_has_latin1 = FALSE;
1667 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1669 : ANYOF_FLAGS(node);
1671 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1673 /* Look at the data structure created by S_set_ANYOF_arg() */
1674 if (n != ANYOF_ONLY_HAS_BITMAP) {
1675 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1676 AV * const av = MUTABLE_AV(SvRV(rv));
1677 SV **const ary = AvARRAY(av);
1678 assert(RExC_rxi->data->what[n] == 's');
1680 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1682 /* Here there are things that won't be known until runtime -- we
1683 * have to assume it could be anything */
1684 invlist = sv_2mortal(_new_invlist(1));
1685 return _add_range_to_invlist(invlist, 0, UV_MAX);
1687 else if (ary[INVLIST_INDEX]) {
1689 /* Use the node's inversion list */
1690 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1693 /* Get the code points valid only under UTF-8 locales */
1694 if ( (flags & ANYOFL_FOLD)
1695 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1697 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1702 invlist = sv_2mortal(_new_invlist(0));
1705 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1706 * code points, and an inversion list for the others, but if there are code
1707 * points that should match only conditionally on the target string being
1708 * UTF-8, those are placed in the inversion list, and not the bitmap.
1709 * Since there are circumstances under which they could match, they are
1710 * included in the SSC. But if the ANYOF node is to be inverted, we have
1711 * to exclude them here, so that when we invert below, the end result
1712 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1713 * have to do this here before we add the unconditionally matched code
1715 if (flags & ANYOF_INVERT) {
1716 _invlist_intersection_complement_2nd(invlist,
1721 /* Add in the points from the bit map */
1722 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1723 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1724 if (ANYOF_BITMAP_TEST(node, i)) {
1725 unsigned int start = i++;
1727 for (; i < NUM_ANYOF_CODE_POINTS
1728 && ANYOF_BITMAP_TEST(node, i); ++i)
1732 invlist = _add_range_to_invlist(invlist, start, i-1);
1733 new_node_has_latin1 = TRUE;
1738 /* If this can match all upper Latin1 code points, have to add them
1739 * as well. But don't add them if inverting, as when that gets done below,
1740 * it would exclude all these characters, including the ones it shouldn't
1741 * that were added just above */
1742 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1743 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1745 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1748 /* Similarly for these */
1749 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1750 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1753 if (flags & ANYOF_INVERT) {
1754 _invlist_invert(invlist);
1756 else if (flags & ANYOFL_FOLD) {
1757 if (new_node_has_latin1) {
1759 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1760 * the locale. We can skip this if there are no 0-255 at all. */
1761 _invlist_union(invlist, PL_Latin1, &invlist);
1763 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1764 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1767 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1768 invlist = add_cp_to_invlist(invlist, 'I');
1770 if (_invlist_contains_cp(invlist,
1771 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1773 invlist = add_cp_to_invlist(invlist, 'i');
1778 /* Similarly add the UTF-8 locale possible matches. These have to be
1779 * deferred until after the non-UTF-8 locale ones are taken care of just
1780 * above, or it leads to wrong results under ANYOF_INVERT */
1781 if (only_utf8_locale_invlist) {
1782 _invlist_union_maybe_complement_2nd(invlist,
1783 only_utf8_locale_invlist,
1784 flags & ANYOF_INVERT,
1791 /* These two functions currently do the exact same thing */
1792 #define ssc_init_zero ssc_init
1794 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1795 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1797 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1798 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1799 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1802 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1803 const regnode_charclass *and_with)
1805 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1806 * another SSC or a regular ANYOF class. Can create false positives. */
1809 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1811 : ANYOF_FLAGS(and_with);
1814 PERL_ARGS_ASSERT_SSC_AND;
1816 assert(is_ANYOF_SYNTHETIC(ssc));
1818 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1819 * the code point inversion list and just the relevant flags */
1820 if (is_ANYOF_SYNTHETIC(and_with)) {
1821 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1822 anded_flags = and_with_flags;
1824 /* XXX This is a kludge around what appears to be deficiencies in the
1825 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1826 * there are paths through the optimizer where it doesn't get weeded
1827 * out when it should. And if we don't make some extra provision for
1828 * it like the code just below, it doesn't get added when it should.
1829 * This solution is to add it only when AND'ing, which is here, and
1830 * only when what is being AND'ed is the pristine, original node
1831 * matching anything. Thus it is like adding it to ssc_anything() but
1832 * only when the result is to be AND'ed. Probably the same solution
1833 * could be adopted for the same problem we have with /l matching,
1834 * which is solved differently in S_ssc_init(), and that would lead to
1835 * fewer false positives than that solution has. But if this solution
1836 * creates bugs, the consequences are only that a warning isn't raised
1837 * that should be; while the consequences for having /l bugs is
1838 * incorrect matches */
1839 if (ssc_is_anything((regnode_ssc *)and_with)) {
1840 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1844 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1845 if (OP(and_with) == ANYOFD) {
1846 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1849 anded_flags = and_with_flags
1850 &( ANYOF_COMMON_FLAGS
1851 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1852 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1853 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1855 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1860 ANYOF_FLAGS(ssc) &= anded_flags;
1862 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1863 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1864 * 'and_with' may be inverted. When not inverted, we have the situation of
1866 * (C1 | P1) & (C2 | P2)
1867 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1868 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1869 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1870 * <= ((C1 & C2) | P1 | P2)
1871 * Alternatively, the last few steps could be:
1872 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1873 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1874 * <= (C1 | C2 | (P1 & P2))
1875 * We favor the second approach if either P1 or P2 is non-empty. This is
1876 * because these components are a barrier to doing optimizations, as what
1877 * they match cannot be known until the moment of matching as they are
1878 * dependent on the current locale, 'AND"ing them likely will reduce or
1880 * But we can do better if we know that C1,P1 are in their initial state (a
1881 * frequent occurrence), each matching everything:
1882 * (<everything>) & (C2 | P2) = C2 | P2
1883 * Similarly, if C2,P2 are in their initial state (again a frequent
1884 * occurrence), the result is a no-op
1885 * (C1 | P1) & (<everything>) = C1 | P1
1888 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1889 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1890 * <= (C1 & ~C2) | (P1 & ~P2)
1893 if ((and_with_flags & ANYOF_INVERT)
1894 && ! is_ANYOF_SYNTHETIC(and_with))
1898 ssc_intersection(ssc,
1900 FALSE /* Has already been inverted */
1903 /* If either P1 or P2 is empty, the intersection will be also; can skip
1905 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1906 ANYOF_POSIXL_ZERO(ssc);
1908 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1910 /* Note that the Posix class component P from 'and_with' actually
1912 * P = Pa | Pb | ... | Pn
1913 * where each component is one posix class, such as in [\w\s].
1915 * ~P = ~(Pa | Pb | ... | Pn)
1916 * = ~Pa & ~Pb & ... & ~Pn
1917 * <= ~Pa | ~Pb | ... | ~Pn
1918 * The last is something we can easily calculate, but unfortunately
1919 * is likely to have many false positives. We could do better
1920 * in some (but certainly not all) instances if two classes in
1921 * P have known relationships. For example
1922 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1924 * :lower: & :print: = :lower:
1925 * And similarly for classes that must be disjoint. For example,
1926 * since \s and \w can have no elements in common based on rules in
1927 * the POSIX standard,
1928 * \w & ^\S = nothing
1929 * Unfortunately, some vendor locales do not meet the Posix
1930 * standard, in particular almost everything by Microsoft.
1931 * The loop below just changes e.g., \w into \W and vice versa */
1933 regnode_charclass_posixl temp;
1934 int add = 1; /* To calculate the index of the complement */
1936 Zero(&temp, 1, regnode_charclass_posixl);
1937 ANYOF_POSIXL_ZERO(&temp);
1938 for (i = 0; i < ANYOF_MAX; i++) {
1940 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1941 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1943 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1944 ANYOF_POSIXL_SET(&temp, i + add);
1946 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1948 ANYOF_POSIXL_AND(&temp, ssc);
1950 } /* else ssc already has no posixes */
1951 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1952 in its initial state */
1953 else if (! is_ANYOF_SYNTHETIC(and_with)
1954 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1956 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1957 * copy it over 'ssc' */
1958 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1959 if (is_ANYOF_SYNTHETIC(and_with)) {
1960 StructCopy(and_with, ssc, regnode_ssc);
1963 ssc->invlist = anded_cp_list;
1964 ANYOF_POSIXL_ZERO(ssc);
1965 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1966 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1970 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1971 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1973 /* One or the other of P1, P2 is non-empty. */
1974 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1975 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1977 ssc_union(ssc, anded_cp_list, FALSE);
1979 else { /* P1 = P2 = empty */
1980 ssc_intersection(ssc, anded_cp_list, FALSE);
1986 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1987 const regnode_charclass *or_with)
1989 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1990 * another SSC or a regular ANYOF class. Can create false positives if
1991 * 'or_with' is to be inverted. */
1995 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1997 : ANYOF_FLAGS(or_with);
1999 PERL_ARGS_ASSERT_SSC_OR;
2001 assert(is_ANYOF_SYNTHETIC(ssc));
2003 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2004 * the code point inversion list and just the relevant flags */
2005 if (is_ANYOF_SYNTHETIC(or_with)) {
2006 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2007 ored_flags = or_with_flags;
2010 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2011 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2012 if (OP(or_with) != ANYOFD) {
2015 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2016 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2017 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2019 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2024 ANYOF_FLAGS(ssc) |= ored_flags;
2026 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2027 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2028 * 'or_with' may be inverted. When not inverted, we have the simple
2029 * situation of computing:
2030 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2031 * If P1|P2 yields a situation with both a class and its complement are
2032 * set, like having both \w and \W, this matches all code points, and we
2033 * can delete these from the P component of the ssc going forward. XXX We
2034 * might be able to delete all the P components, but I (khw) am not certain
2035 * about this, and it is better to be safe.
2038 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2039 * <= (C1 | P1) | ~C2
2040 * <= (C1 | ~C2) | P1
2041 * (which results in actually simpler code than the non-inverted case)
2044 if ((or_with_flags & ANYOF_INVERT)
2045 && ! is_ANYOF_SYNTHETIC(or_with))
2047 /* We ignore P2, leaving P1 going forward */
2048 } /* else Not inverted */
2049 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2050 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2051 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2053 for (i = 0; i < ANYOF_MAX; i += 2) {
2054 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2056 ssc_match_all_cp(ssc);
2057 ANYOF_POSIXL_CLEAR(ssc, i);
2058 ANYOF_POSIXL_CLEAR(ssc, i+1);
2066 FALSE /* Already has been inverted */
2071 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2073 PERL_ARGS_ASSERT_SSC_UNION;
2075 assert(is_ANYOF_SYNTHETIC(ssc));
2077 _invlist_union_maybe_complement_2nd(ssc->invlist,
2084 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2086 const bool invert2nd)
2088 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2090 assert(is_ANYOF_SYNTHETIC(ssc));
2092 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2099 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2101 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2103 assert(is_ANYOF_SYNTHETIC(ssc));
2105 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2109 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2111 /* AND just the single code point 'cp' into the SSC 'ssc' */
2113 SV* cp_list = _new_invlist(2);
2115 PERL_ARGS_ASSERT_SSC_CP_AND;
2117 assert(is_ANYOF_SYNTHETIC(ssc));
2119 cp_list = add_cp_to_invlist(cp_list, cp);
2120 ssc_intersection(ssc, cp_list,
2121 FALSE /* Not inverted */
2123 SvREFCNT_dec_NN(cp_list);
2127 S_ssc_clear_locale(regnode_ssc *ssc)
2129 /* Set the SSC 'ssc' to not match any locale things */
2130 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2132 assert(is_ANYOF_SYNTHETIC(ssc));
2134 ANYOF_POSIXL_ZERO(ssc);
2135 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2138 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2141 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2143 /* The synthetic start class is used to hopefully quickly winnow down
2144 * places where a pattern could start a match in the target string. If it
2145 * doesn't really narrow things down that much, there isn't much point to
2146 * having the overhead of using it. This function uses some very crude
2147 * heuristics to decide if to use the ssc or not.
2149 * It returns TRUE if 'ssc' rules out more than half what it considers to
2150 * be the "likely" possible matches, but of course it doesn't know what the
2151 * actual things being matched are going to be; these are only guesses
2153 * For /l matches, it assumes that the only likely matches are going to be
2154 * in the 0-255 range, uniformly distributed, so half of that is 127
2155 * For /a and /d matches, it assumes that the likely matches will be just
2156 * the ASCII range, so half of that is 63
2157 * For /u and there isn't anything matching above the Latin1 range, it
2158 * assumes that that is the only range likely to be matched, and uses
2159 * half that as the cut-off: 127. If anything matches above Latin1,
2160 * it assumes that all of Unicode could match (uniformly), except for
2161 * non-Unicode code points and things in the General Category "Other"
2162 * (unassigned, private use, surrogates, controls and formats). This
2163 * is a much large number. */
2165 U32 count = 0; /* Running total of number of code points matched by
2167 UV start, end; /* Start and end points of current range in inversion
2168 XXX outdated. UTF-8 locales are common, what about invert? list */
2169 const U32 max_code_points = (LOC)
2171 : (( ! UNI_SEMANTICS
2172 || invlist_highest(ssc->invlist) < 256)
2175 const U32 max_match = max_code_points / 2;
2177 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2179 invlist_iterinit(ssc->invlist);
2180 while (invlist_iternext(ssc->invlist, &start, &end)) {
2181 if (start >= max_code_points) {
2184 end = MIN(end, max_code_points - 1);
2185 count += end - start + 1;
2186 if (count >= max_match) {
2187 invlist_iterfinish(ssc->invlist);
2197 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2199 /* The inversion list in the SSC is marked mortal; now we need a more
2200 * permanent copy, which is stored the same way that is done in a regular
2201 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2204 SV* invlist = invlist_clone(ssc->invlist, NULL);
2206 PERL_ARGS_ASSERT_SSC_FINALIZE;
2208 assert(is_ANYOF_SYNTHETIC(ssc));
2210 /* The code in this file assumes that all but these flags aren't relevant
2211 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2212 * by the time we reach here */
2213 assert(! (ANYOF_FLAGS(ssc)
2214 & ~( ANYOF_COMMON_FLAGS
2215 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2216 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2218 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2220 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2221 SvREFCNT_dec(invlist);
2223 /* Make sure is clone-safe */
2224 ssc->invlist = NULL;
2226 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2227 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2228 OP(ssc) = ANYOFPOSIXL;
2230 else if (RExC_contains_locale) {
2234 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2237 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2238 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2239 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2240 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2241 ? (TRIE_LIST_CUR( idx ) - 1) \
2247 dump_trie(trie,widecharmap,revcharmap)
2248 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2249 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2251 These routines dump out a trie in a somewhat readable format.
2252 The _interim_ variants are used for debugging the interim
2253 tables that are used to generate the final compressed
2254 representation which is what dump_trie expects.
2256 Part of the reason for their existence is to provide a form
2257 of documentation as to how the different representations function.
2262 Dumps the final compressed table form of the trie to Perl_debug_log.
2263 Used for debugging make_trie().
2267 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2268 AV *revcharmap, U32 depth)
2271 SV *sv=sv_newmortal();
2272 int colwidth= widecharmap ? 6 : 4;
2274 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2276 PERL_ARGS_ASSERT_DUMP_TRIE;
2278 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2279 depth+1, "Match","Base","Ofs" );
2281 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2282 SV ** const tmp = av_fetch( revcharmap, state, 0);
2284 Perl_re_printf( aTHX_ "%*s",
2286 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2287 PL_colors[0], PL_colors[1],
2288 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2289 PERL_PV_ESCAPE_FIRSTCHAR
2294 Perl_re_printf( aTHX_ "\n");
2295 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2297 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2298 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2299 Perl_re_printf( aTHX_ "\n");
2301 for( state = 1 ; state < trie->statecount ; state++ ) {
2302 const U32 base = trie->states[ state ].trans.base;
2304 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2306 if ( trie->states[ state ].wordnum ) {
2307 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2309 Perl_re_printf( aTHX_ "%6s", "" );
2312 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2317 while( ( base + ofs < trie->uniquecharcount ) ||
2318 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2319 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2323 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2325 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2326 if ( ( base + ofs >= trie->uniquecharcount )
2327 && ( base + ofs - trie->uniquecharcount
2329 && trie->trans[ base + ofs
2330 - trie->uniquecharcount ].check == state )
2332 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2333 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2336 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2340 Perl_re_printf( aTHX_ "]");
2343 Perl_re_printf( aTHX_ "\n" );
2345 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2347 for (word=1; word <= trie->wordcount; word++) {
2348 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2349 (int)word, (int)(trie->wordinfo[word].prev),
2350 (int)(trie->wordinfo[word].len));
2352 Perl_re_printf( aTHX_ "\n" );
2355 Dumps a fully constructed but uncompressed trie in list form.
2356 List tries normally only are used for construction when the number of
2357 possible chars (trie->uniquecharcount) is very high.
2358 Used for debugging make_trie().
2361 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2362 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2366 SV *sv=sv_newmortal();
2367 int colwidth= widecharmap ? 6 : 4;
2368 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2370 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2372 /* print out the table precompression. */
2373 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2375 Perl_re_indentf( aTHX_ "%s",
2376 depth+1, "------:-----+-----------------\n" );
2378 for( state=1 ; state < next_alloc ; state ++ ) {
2381 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2382 depth+1, (UV)state );
2383 if ( ! trie->states[ state ].wordnum ) {
2384 Perl_re_printf( aTHX_ "%5s| ","");
2386 Perl_re_printf( aTHX_ "W%4x| ",
2387 trie->states[ state ].wordnum
2390 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2391 SV ** const tmp = av_fetch( revcharmap,
2392 TRIE_LIST_ITEM(state, charid).forid, 0);
2394 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2396 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2398 PL_colors[0], PL_colors[1],
2399 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2400 | PERL_PV_ESCAPE_FIRSTCHAR
2402 TRIE_LIST_ITEM(state, charid).forid,
2403 (UV)TRIE_LIST_ITEM(state, charid).newstate
2406 Perl_re_printf( aTHX_ "\n%*s| ",
2407 (int)((depth * 2) + 14), "");
2410 Perl_re_printf( aTHX_ "\n");
2415 Dumps a fully constructed but uncompressed trie in table form.
2416 This is the normal DFA style state transition table, with a few
2417 twists to facilitate compression later.
2418 Used for debugging make_trie().
2421 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2422 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2427 SV *sv=sv_newmortal();
2428 int colwidth= widecharmap ? 6 : 4;
2429 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2431 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2434 print out the table precompression so that we can do a visual check
2435 that they are identical.
2438 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2440 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2441 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2443 Perl_re_printf( aTHX_ "%*s",
2445 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2446 PL_colors[0], PL_colors[1],
2447 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2448 PERL_PV_ESCAPE_FIRSTCHAR
2454 Perl_re_printf( aTHX_ "\n");
2455 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2457 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2458 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2461 Perl_re_printf( aTHX_ "\n" );
2463 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2465 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2467 (UV)TRIE_NODENUM( state ) );
2469 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2470 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2472 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2474 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2476 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2477 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2478 (UV)trie->trans[ state ].check );
2480 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2481 (UV)trie->trans[ state ].check,
2482 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2490 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2491 startbranch: the first branch in the whole branch sequence
2492 first : start branch of sequence of branch-exact nodes.
2493 May be the same as startbranch
2494 last : Thing following the last branch.
2495 May be the same as tail.
2496 tail : item following the branch sequence
2497 count : words in the sequence
2498 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2499 depth : indent depth
2501 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2503 A trie is an N'ary tree where the branches are determined by digital
2504 decomposition of the key. IE, at the root node you look up the 1st character and
2505 follow that branch repeat until you find the end of the branches. Nodes can be
2506 marked as "accepting" meaning they represent a complete word. Eg:
2510 would convert into the following structure. Numbers represent states, letters
2511 following numbers represent valid transitions on the letter from that state, if
2512 the number is in square brackets it represents an accepting state, otherwise it
2513 will be in parenthesis.
2515 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2519 (1) +-i->(6)-+-s->[7]
2521 +-s->(3)-+-h->(4)-+-e->[5]
2523 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2525 This shows that when matching against the string 'hers' we will begin at state 1
2526 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2527 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2528 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2529 single traverse. We store a mapping from accepting to state to which word was
2530 matched, and then when we have multiple possibilities we try to complete the
2531 rest of the regex in the order in which they occurred in the alternation.
2533 The only prior NFA like behaviour that would be changed by the TRIE support is
2534 the silent ignoring of duplicate alternations which are of the form:
2536 / (DUPE|DUPE) X? (?{ ... }) Y /x
2538 Thus EVAL blocks following a trie may be called a different number of times with
2539 and without the optimisation. With the optimisations dupes will be silently
2540 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2541 the following demonstrates:
2543 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2545 which prints out 'word' three times, but
2547 'words'=~/(word|word|word)(?{ print $1 })S/
2549 which doesnt print it out at all. This is due to other optimisations kicking in.
2551 Example of what happens on a structural level:
2553 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2555 1: CURLYM[1] {1,32767}(18)
2566 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2567 and should turn into:
2569 1: CURLYM[1] {1,32767}(18)
2571 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2579 Cases where tail != last would be like /(?foo|bar)baz/:
2589 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2590 and would end up looking like:
2593 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2600 d = uvchr_to_utf8_flags(d, uv, 0);
2602 is the recommended Unicode-aware way of saying
2607 #define TRIE_STORE_REVCHAR(val) \
2610 SV *zlopp = newSV(UTF8_MAXBYTES); \
2611 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2612 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2614 SvCUR_set(zlopp, kapow - flrbbbbb); \
2617 av_push(revcharmap, zlopp); \
2619 char ooooff = (char)val; \
2620 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2624 /* This gets the next character from the input, folding it if not already
2626 #define TRIE_READ_CHAR STMT_START { \
2629 /* if it is UTF then it is either already folded, or does not need \
2631 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2633 else if (folder == PL_fold_latin1) { \
2634 /* This folder implies Unicode rules, which in the range expressible \
2635 * by not UTF is the lower case, with the two exceptions, one of \
2636 * which should have been taken care of before calling this */ \
2637 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2638 uvc = toLOWER_L1(*uc); \
2639 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2642 /* raw data, will be folded later if needed */ \
2650 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2651 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2652 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2653 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2654 TRIE_LIST_LEN( state ) = ging; \
2656 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2657 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2658 TRIE_LIST_CUR( state )++; \
2661 #define TRIE_LIST_NEW(state) STMT_START { \
2662 Newx( trie->states[ state ].trans.list, \
2663 4, reg_trie_trans_le ); \
2664 TRIE_LIST_CUR( state ) = 1; \
2665 TRIE_LIST_LEN( state ) = 4; \
2668 #define TRIE_HANDLE_WORD(state) STMT_START { \
2669 U16 dupe= trie->states[ state ].wordnum; \
2670 regnode * const noper_next = regnext( noper ); \
2673 /* store the word for dumping */ \
2675 if (OP(noper) != NOTHING) \
2676 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2678 tmp = newSVpvn_utf8( "", 0, UTF ); \
2679 av_push( trie_words, tmp ); \
2683 trie->wordinfo[curword].prev = 0; \
2684 trie->wordinfo[curword].len = wordlen; \
2685 trie->wordinfo[curword].accept = state; \
2687 if ( noper_next < tail ) { \
2689 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2691 trie->jump[curword] = (U16)(noper_next - convert); \
2693 jumper = noper_next; \
2695 nextbranch= regnext(cur); \
2699 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2700 /* chain, so that when the bits of chain are later */\
2701 /* linked together, the dups appear in the chain */\
2702 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2703 trie->wordinfo[dupe].prev = curword; \
2705 /* we haven't inserted this word yet. */ \
2706 trie->states[ state ].wordnum = curword; \
2711 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2712 ( ( base + charid >= ucharcount \
2713 && base + charid < ubound \
2714 && state == trie->trans[ base - ucharcount + charid ].check \
2715 && trie->trans[ base - ucharcount + charid ].next ) \
2716 ? trie->trans[ base - ucharcount + charid ].next \
2717 : ( state==1 ? special : 0 ) \
2720 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2722 TRIE_BITMAP_SET(trie, uvc); \
2723 /* store the folded codepoint */ \
2725 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2728 /* store first byte of utf8 representation of */ \
2729 /* variant codepoints */ \
2730 if (! UVCHR_IS_INVARIANT(uvc)) { \
2731 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2736 #define MADE_JUMP_TRIE 2
2737 #define MADE_EXACT_TRIE 4
2740 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2741 regnode *first, regnode *last, regnode *tail,
2742 U32 word_count, U32 flags, U32 depth)
2744 /* first pass, loop through and scan words */
2745 reg_trie_data *trie;
2746 HV *widecharmap = NULL;
2747 AV *revcharmap = newAV();
2753 regnode *jumper = NULL;
2754 regnode *nextbranch = NULL;
2755 regnode *convert = NULL;
2756 U32 *prev_states; /* temp array mapping each state to previous one */
2757 /* we just use folder as a flag in utf8 */
2758 const U8 * folder = NULL;
2760 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2761 * which stands for one trie structure, one hash, optionally followed
2764 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2765 AV *trie_words = NULL;
2766 /* along with revcharmap, this only used during construction but both are
2767 * useful during debugging so we store them in the struct when debugging.
2770 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2771 STRLEN trie_charcount=0;
2773 SV *re_trie_maxbuff;
2774 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2776 PERL_ARGS_ASSERT_MAKE_TRIE;
2778 PERL_UNUSED_ARG(depth);
2782 case EXACT: case EXACT_REQ8: case EXACTL: break;
2786 case EXACTFLU8: folder = PL_fold_latin1; break;
2787 case EXACTF: folder = PL_fold; break;
2788 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2791 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2793 trie->startstate = 1;
2794 trie->wordcount = word_count;
2795 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2796 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2797 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2798 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2799 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2800 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2803 trie_words = newAV();
2806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2807 assert(re_trie_maxbuff);
2808 if (!SvIOK(re_trie_maxbuff)) {
2809 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2811 DEBUG_TRIE_COMPILE_r({
2812 Perl_re_indentf( aTHX_
2813 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2815 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2816 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2819 /* Find the node we are going to overwrite */
2820 if ( first == startbranch && OP( last ) != BRANCH ) {
2821 /* whole branch chain */
2824 /* branch sub-chain */
2825 convert = NEXTOPER( first );
2828 /* -- First loop and Setup --
2830 We first traverse the branches and scan each word to determine if it
2831 contains widechars, and how many unique chars there are, this is
2832 important as we have to build a table with at least as many columns as we
2835 We use an array of integers to represent the character codes 0..255
2836 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2837 the native representation of the character value as the key and IV's for
2840 *TODO* If we keep track of how many times each character is used we can
2841 remap the columns so that the table compression later on is more
2842 efficient in terms of memory by ensuring the most common value is in the
2843 middle and the least common are on the outside. IMO this would be better
2844 than a most to least common mapping as theres a decent chance the most
2845 common letter will share a node with the least common, meaning the node
2846 will not be compressible. With a middle is most common approach the worst
2847 case is when we have the least common nodes twice.
2851 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2852 regnode *noper = NEXTOPER( cur );
2856 U32 wordlen = 0; /* required init */
2857 STRLEN minchars = 0;
2858 STRLEN maxchars = 0;
2859 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2862 if (OP(noper) == NOTHING) {
2863 /* skip past a NOTHING at the start of an alternation
2864 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2866 * If the next node is not something we are supposed to process
2867 * we will just ignore it due to the condition guarding the
2871 regnode *noper_next= regnext(noper);
2872 if (noper_next < tail)
2877 && ( OP(noper) == flags
2878 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2879 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2880 || OP(noper) == EXACTFUP))))
2882 uc= (U8*)STRING(noper);
2883 e= uc + STR_LEN(noper);
2890 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2891 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2892 regardless of encoding */
2893 if (OP( noper ) == EXACTFUP) {
2894 /* false positives are ok, so just set this */
2895 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2899 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2901 TRIE_CHARCOUNT(trie)++;
2904 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2905 * is in effect. Under /i, this character can match itself, or
2906 * anything that folds to it. If not under /i, it can match just
2907 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2908 * all fold to k, and all are single characters. But some folds
2909 * expand to more than one character, so for example LATIN SMALL
2910 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2911 * the string beginning at 'uc' is 'ffi', it could be matched by
2912 * three characters, or just by the one ligature character. (It
2913 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2914 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2915 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2916 * match.) The trie needs to know the minimum and maximum number
2917 * of characters that could match so that it can use size alone to
2918 * quickly reject many match attempts. The max is simple: it is
2919 * the number of folded characters in this branch (since a fold is
2920 * never shorter than what folds to it. */
2924 /* And the min is equal to the max if not under /i (indicated by
2925 * 'folder' being NULL), or there are no multi-character folds. If
2926 * there is a multi-character fold, the min is incremented just
2927 * once, for the character that folds to the sequence. Each
2928 * character in the sequence needs to be added to the list below of
2929 * characters in the trie, but we count only the first towards the
2930 * min number of characters needed. This is done through the
2931 * variable 'foldlen', which is returned by the macros that look
2932 * for these sequences as the number of bytes the sequence
2933 * occupies. Each time through the loop, we decrement 'foldlen' by
2934 * how many bytes the current char occupies. Only when it reaches
2935 * 0 do we increment 'minchars' or look for another multi-character
2937 if (folder == NULL) {
2940 else if (foldlen > 0) {
2941 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2946 /* See if *uc is the beginning of a multi-character fold. If
2947 * so, we decrement the length remaining to look at, to account
2948 * for the current character this iteration. (We can use 'uc'
2949 * instead of the fold returned by TRIE_READ_CHAR because for
2950 * non-UTF, the latin1_safe macro is smart enough to account
2951 * for all the unfolded characters, and because for UTF, the
2952 * string will already have been folded earlier in the
2953 * compilation process */
2955 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2956 foldlen -= UTF8SKIP(uc);
2959 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2964 /* The current character (and any potential folds) should be added
2965 * to the possible matching characters for this position in this
2969 U8 folded= folder[ (U8) uvc ];
2970 if ( !trie->charmap[ folded ] ) {
2971 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2972 TRIE_STORE_REVCHAR( folded );
2975 if ( !trie->charmap[ uvc ] ) {
2976 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2977 TRIE_STORE_REVCHAR( uvc );
2980 /* store the codepoint in the bitmap, and its folded
2982 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2983 set_bit = 0; /* We've done our bit :-) */
2987 /* XXX We could come up with the list of code points that fold
2988 * to this using PL_utf8_foldclosures, except not for
2989 * multi-char folds, as there may be multiple combinations
2990 * there that could work, which needs to wait until runtime to
2991 * resolve (The comment about LIGATURE FFI above is such an
2996 widecharmap = newHV();
2998 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
3001 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3003 if ( !SvTRUE( *svpp ) ) {
3004 sv_setiv( *svpp, ++trie->uniquecharcount );
3005 TRIE_STORE_REVCHAR(uvc);
3008 } /* end loop through characters in this branch of the trie */
3010 /* We take the min and max for this branch and combine to find the min
3011 * and max for all branches processed so far */
3012 if( cur == first ) {
3013 trie->minlen = minchars;
3014 trie->maxlen = maxchars;
3015 } else if (minchars < trie->minlen) {
3016 trie->minlen = minchars;
3017 } else if (maxchars > trie->maxlen) {
3018 trie->maxlen = maxchars;
3020 } /* end first pass */
3021 DEBUG_TRIE_COMPILE_r(
3022 Perl_re_indentf( aTHX_
3023 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3025 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3026 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3027 (int)trie->minlen, (int)trie->maxlen )
3031 We now know what we are dealing with in terms of unique chars and
3032 string sizes so we can calculate how much memory a naive
3033 representation using a flat table will take. If it's over a reasonable
3034 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3035 conservative but potentially much slower representation using an array
3038 At the end we convert both representations into the same compressed
3039 form that will be used in regexec.c for matching with. The latter
3040 is a form that cannot be used to construct with but has memory
3041 properties similar to the list form and access properties similar
3042 to the table form making it both suitable for fast searches and
3043 small enough that its feasable to store for the duration of a program.
3045 See the comment in the code where the compressed table is produced
3046 inplace from the flat tabe representation for an explanation of how
3047 the compression works.
3052 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3055 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3056 > SvIV(re_trie_maxbuff) )
3059 Second Pass -- Array Of Lists Representation
3061 Each state will be represented by a list of charid:state records
3062 (reg_trie_trans_le) the first such element holds the CUR and LEN
3063 points of the allocated array. (See defines above).
3065 We build the initial structure using the lists, and then convert
3066 it into the compressed table form which allows faster lookups
3067 (but cant be modified once converted).
3070 STRLEN transcount = 1;
3072 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3075 trie->states = (reg_trie_state *)
3076 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3077 sizeof(reg_trie_state) );
3081 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3083 regnode *noper = NEXTOPER( cur );
3084 U32 state = 1; /* required init */
3085 U16 charid = 0; /* sanity init */
3086 U32 wordlen = 0; /* required init */
3088 if (OP(noper) == NOTHING) {
3089 regnode *noper_next= regnext(noper);
3090 if (noper_next < tail)
3092 /* we will undo this assignment if noper does not
3093 * point at a trieable type in the else clause of
3094 * the following statement. */
3098 && ( OP(noper) == flags
3099 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3100 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3101 || OP(noper) == EXACTFUP))))
3103 const U8 *uc= (U8*)STRING(noper);
3104 const U8 *e= uc + STR_LEN(noper);
3106 for ( ; uc < e ; uc += len ) {
3111 charid = trie->charmap[ uvc ];
3113 SV** const svpp = hv_fetch( widecharmap,
3120 charid=(U16)SvIV( *svpp );
3123 /* charid is now 0 if we dont know the char read, or
3124 * nonzero if we do */
3131 if ( !trie->states[ state ].trans.list ) {
3132 TRIE_LIST_NEW( state );
3135 check <= TRIE_LIST_USED( state );
3138 if ( TRIE_LIST_ITEM( state, check ).forid
3141 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3146 newstate = next_alloc++;
3147 prev_states[newstate] = state;
3148 TRIE_LIST_PUSH( state, charid, newstate );
3153 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3157 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3158 * on a trieable type. So we need to reset noper back to point at the first regop
3159 * in the branch before we call TRIE_HANDLE_WORD()
3161 noper= NEXTOPER(cur);
3163 TRIE_HANDLE_WORD(state);
3165 } /* end second pass */
3167 /* next alloc is the NEXT state to be allocated */
3168 trie->statecount = next_alloc;
3169 trie->states = (reg_trie_state *)
3170 PerlMemShared_realloc( trie->states,
3172 * sizeof(reg_trie_state) );
3174 /* and now dump it out before we compress it */
3175 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3176 revcharmap, next_alloc,
3180 trie->trans = (reg_trie_trans *)
3181 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3188 for( state=1 ; state < next_alloc ; state ++ ) {
3192 DEBUG_TRIE_COMPILE_MORE_r(
3193 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3197 if (trie->states[state].trans.list) {
3198 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3202 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3203 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3204 if ( forid < minid ) {
3206 } else if ( forid > maxid ) {
3210 if ( transcount < tp + maxid - minid + 1) {
3212 trie->trans = (reg_trie_trans *)
3213 PerlMemShared_realloc( trie->trans,
3215 * sizeof(reg_trie_trans) );
3216 Zero( trie->trans + (transcount / 2),
3220 base = trie->uniquecharcount + tp - minid;
3221 if ( maxid == minid ) {
3223 for ( ; zp < tp ; zp++ ) {
3224 if ( ! trie->trans[ zp ].next ) {
3225 base = trie->uniquecharcount + zp - minid;
3226 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3228 trie->trans[ zp ].check = state;
3234 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3236 trie->trans[ tp ].check = state;
3241 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3242 const U32 tid = base
3243 - trie->uniquecharcount
3244 + TRIE_LIST_ITEM( state, idx ).forid;
3245 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3247 trie->trans[ tid ].check = state;
3249 tp += ( maxid - minid + 1 );
3251 Safefree(trie->states[ state ].trans.list);
3254 DEBUG_TRIE_COMPILE_MORE_r(
3255 Perl_re_printf( aTHX_ " base: %d\n",base);
3258 trie->states[ state ].trans.base=base;
3260 trie->lasttrans = tp + 1;
3264 Second Pass -- Flat Table Representation.
3266 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3267 each. We know that we will need Charcount+1 trans at most to store
3268 the data (one row per char at worst case) So we preallocate both
3269 structures assuming worst case.
3271 We then construct the trie using only the .next slots of the entry
3274 We use the .check field of the first entry of the node temporarily
3275 to make compression both faster and easier by keeping track of how
3276 many non zero fields are in the node.
3278 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3281 There are two terms at use here: state as a TRIE_NODEIDX() which is
3282 a number representing the first entry of the node, and state as a
3283 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3284 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3285 if there are 2 entrys per node. eg:
3293 The table is internally in the right hand, idx form. However as we
3294 also have to deal with the states array which is indexed by nodenum
3295 we have to use TRIE_NODENUM() to convert.
3298 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3301 trie->trans = (reg_trie_trans *)
3302 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3303 * trie->uniquecharcount + 1,
3304 sizeof(reg_trie_trans) );
3305 trie->states = (reg_trie_state *)
3306 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3307 sizeof(reg_trie_state) );
3308 next_alloc = trie->uniquecharcount + 1;
3311 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3313 regnode *noper = NEXTOPER( cur );
3315 U32 state = 1; /* required init */
3317 U16 charid = 0; /* sanity init */
3318 U32 accept_state = 0; /* sanity init */
3320 U32 wordlen = 0; /* required init */
3322 if (OP(noper) == NOTHING) {
3323 regnode *noper_next= regnext(noper);
3324 if (noper_next < tail)
3326 /* we will undo this assignment if noper does not
3327 * point at a trieable type in the else clause of
3328 * the following statement. */
3332 && ( OP(noper) == flags
3333 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3334 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3335 || OP(noper) == EXACTFUP))))
3337 const U8 *uc= (U8*)STRING(noper);
3338 const U8 *e= uc + STR_LEN(noper);
3340 for ( ; uc < e ; uc += len ) {
3345 charid = trie->charmap[ uvc ];
3347 SV* const * const svpp = hv_fetch( widecharmap,
3351 charid = svpp ? (U16)SvIV(*svpp) : 0;
3355 if ( !trie->trans[ state + charid ].next ) {
3356 trie->trans[ state + charid ].next = next_alloc;
3357 trie->trans[ state ].check++;
3358 prev_states[TRIE_NODENUM(next_alloc)]
3359 = TRIE_NODENUM(state);
3360 next_alloc += trie->uniquecharcount;
3362 state = trie->trans[ state + charid ].next;
3364 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3366 /* charid is now 0 if we dont know the char read, or
3367 * nonzero if we do */
3370 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3371 * on a trieable type. So we need to reset noper back to point at the first regop
3372 * in the branch before we call TRIE_HANDLE_WORD().
3374 noper= NEXTOPER(cur);
3376 accept_state = TRIE_NODENUM( state );
3377 TRIE_HANDLE_WORD(accept_state);
3379 } /* end second pass */
3381 /* and now dump it out before we compress it */
3382 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3384 next_alloc, depth+1));
3388 * Inplace compress the table.*
3390 For sparse data sets the table constructed by the trie algorithm will
3391 be mostly 0/FAIL transitions or to put it another way mostly empty.
3392 (Note that leaf nodes will not contain any transitions.)
3394 This algorithm compresses the tables by eliminating most such
3395 transitions, at the cost of a modest bit of extra work during lookup:
3397 - Each states[] entry contains a .base field which indicates the
3398 index in the state[] array wheres its transition data is stored.
3400 - If .base is 0 there are no valid transitions from that node.
3402 - If .base is nonzero then charid is added to it to find an entry in
3405 -If trans[states[state].base+charid].check!=state then the
3406 transition is taken to be a 0/Fail transition. Thus if there are fail
3407 transitions at the front of the node then the .base offset will point
3408 somewhere inside the previous nodes data (or maybe even into a node
3409 even earlier), but the .check field determines if the transition is
3413 The following process inplace converts the table to the compressed
3414 table: We first do not compress the root node 1,and mark all its
3415 .check pointers as 1 and set its .base pointer as 1 as well. This
3416 allows us to do a DFA construction from the compressed table later,
3417 and ensures that any .base pointers we calculate later are greater
3420 - We set 'pos' to indicate the first entry of the second node.
3422 - We then iterate over the columns of the node, finding the first and
3423 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3424 and set the .check pointers accordingly, and advance pos
3425 appropriately and repreat for the next node. Note that when we copy
3426 the next pointers we have to convert them from the original
3427 NODEIDX form to NODENUM form as the former is not valid post
3430 - If a node has no transitions used we mark its base as 0 and do not
3431 advance the pos pointer.
3433 - If a node only has one transition we use a second pointer into the
3434 structure to fill in allocated fail transitions from other states.
3435 This pointer is independent of the main pointer and scans forward
3436 looking for null transitions that are allocated to a state. When it
3437 finds one it writes the single transition into the "hole". If the
3438 pointer doesnt find one the single transition is appended as normal.
3440 - Once compressed we can Renew/realloc the structures to release the
3443 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3444 specifically Fig 3.47 and the associated pseudocode.
3448 const U32 laststate = TRIE_NODENUM( next_alloc );
3451 trie->statecount = laststate;
3453 for ( state = 1 ; state < laststate ; state++ ) {
3455 const U32 stateidx = TRIE_NODEIDX( state );
3456 const U32 o_used = trie->trans[ stateidx ].check;
3457 U32 used = trie->trans[ stateidx ].check;
3458 trie->trans[ stateidx ].check = 0;
3461 used && charid < trie->uniquecharcount;
3464 if ( flag || trie->trans[ stateidx + charid ].next ) {
3465 if ( trie->trans[ stateidx + charid ].next ) {
3467 for ( ; zp < pos ; zp++ ) {
3468 if ( ! trie->trans[ zp ].next ) {
3472 trie->states[ state ].trans.base
3474 + trie->uniquecharcount
3476 trie->trans[ zp ].next
3477 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3479 trie->trans[ zp ].check = state;
3480 if ( ++zp > pos ) pos = zp;
3487 trie->states[ state ].trans.base
3488 = pos + trie->uniquecharcount - charid ;
3490 trie->trans[ pos ].next
3491 = SAFE_TRIE_NODENUM(
3492 trie->trans[ stateidx + charid ].next );
3493 trie->trans[ pos ].check = state;
3498 trie->lasttrans = pos + 1;
3499 trie->states = (reg_trie_state *)
3500 PerlMemShared_realloc( trie->states, laststate
3501 * sizeof(reg_trie_state) );
3502 DEBUG_TRIE_COMPILE_MORE_r(
3503 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3505 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3509 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3512 } /* end table compress */
3514 DEBUG_TRIE_COMPILE_MORE_r(
3515 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3517 (UV)trie->statecount,
3518 (UV)trie->lasttrans)
3520 /* resize the trans array to remove unused space */
3521 trie->trans = (reg_trie_trans *)
3522 PerlMemShared_realloc( trie->trans, trie->lasttrans
3523 * sizeof(reg_trie_trans) );
3525 { /* Modify the program and insert the new TRIE node */
3526 U8 nodetype =(U8)(flags & 0xFF);
3530 regnode *optimize = NULL;
3531 #ifdef RE_TRACK_PATTERN_OFFSETS
3534 U32 mjd_nodelen = 0;
3535 #endif /* RE_TRACK_PATTERN_OFFSETS */
3536 #endif /* DEBUGGING */
3538 This means we convert either the first branch or the first Exact,
3539 depending on whether the thing following (in 'last') is a branch
3540 or not and whther first is the startbranch (ie is it a sub part of
3541 the alternation or is it the whole thing.)
3542 Assuming its a sub part we convert the EXACT otherwise we convert
3543 the whole branch sequence, including the first.
3545 /* Find the node we are going to overwrite */
3546 if ( first != startbranch || OP( last ) == BRANCH ) {
3547 /* branch sub-chain */
3548 NEXT_OFF( first ) = (U16)(last - first);
3549 #ifdef RE_TRACK_PATTERN_OFFSETS
3551 mjd_offset= Node_Offset((convert));
3552 mjd_nodelen= Node_Length((convert));
3555 /* whole branch chain */
3557 #ifdef RE_TRACK_PATTERN_OFFSETS
3560 const regnode *nop = NEXTOPER( convert );
3561 mjd_offset= Node_Offset((nop));
3562 mjd_nodelen= Node_Length((nop));
3566 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3568 (UV)mjd_offset, (UV)mjd_nodelen)
3571 /* But first we check to see if there is a common prefix we can
3572 split out as an EXACT and put in front of the TRIE node. */
3573 trie->startstate= 1;
3574 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3575 /* we want to find the first state that has more than
3576 * one transition, if that state is not the first state
3577 * then we have a common prefix which we can remove.
3580 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3582 I32 first_ofs = -1; /* keeps track of the ofs of the first
3583 transition, -1 means none */
3585 const U32 base = trie->states[ state ].trans.base;
3587 /* does this state terminate an alternation? */
3588 if ( trie->states[state].wordnum )
3591 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3592 if ( ( base + ofs >= trie->uniquecharcount ) &&
3593 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3594 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3596 if ( ++count > 1 ) {
3597 /* we have more than one transition */
3600 /* if this is the first state there is no common prefix
3601 * to extract, so we can exit */
3602 if ( state == 1 ) break;
3603 tmp = av_fetch( revcharmap, ofs, 0);
3604 ch = (U8*)SvPV_nolen_const( *tmp );
3606 /* if we are on count 2 then we need to initialize the
3607 * bitmap, and store the previous char if there was one
3610 /* clear the bitmap */
3611 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3613 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3616 if (first_ofs >= 0) {
3617 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3618 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3620 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3622 Perl_re_printf( aTHX_ "%s", (char*)ch)
3626 /* store the current firstchar in the bitmap */
3627 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3628 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3634 /* This state has only one transition, its transition is part
3635 * of a common prefix - we need to concatenate the char it
3636 * represents to what we have so far. */
3637 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3639 char *ch = SvPV( *tmp, len );
3641 SV *sv=sv_newmortal();
3642 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3644 (UV)state, (UV)first_ofs,
3645 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3646 PL_colors[0], PL_colors[1],
3647 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3648 PERL_PV_ESCAPE_FIRSTCHAR
3653 OP( convert ) = nodetype;
3654 str=STRING(convert);
3655 setSTR_LEN(convert, 0);
3657 assert( ( STR_LEN(convert) + len ) < 256 );
3658 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3664 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3669 trie->prefixlen = (state-1);
3671 regnode *n = convert+NODE_SZ_STR(convert);
3672 assert( NODE_SZ_STR(convert) <= U16_MAX );
3673 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3674 trie->startstate = state;
3675 trie->minlen -= (state - 1);
3676 trie->maxlen -= (state - 1);
3678 /* At least the UNICOS C compiler choked on this
3679 * being argument to DEBUG_r(), so let's just have
3682 #ifdef PERL_EXT_RE_BUILD
3688 regnode *fix = convert;
3689 U32 word = trie->wordcount;
3690 #ifdef RE_TRACK_PATTERN_OFFSETS
3693 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3694 while( ++fix < n ) {
3695 Set_Node_Offset_Length(fix, 0, 0);
3698 SV ** const tmp = av_fetch( trie_words, word, 0 );
3700 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3701 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3703 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3711 NEXT_OFF(convert) = (U16)(tail - convert);
3712 DEBUG_r(optimize= n);
3718 if ( trie->maxlen ) {
3719 NEXT_OFF( convert ) = (U16)(tail - convert);
3720 ARG_SET( convert, data_slot );
3721 /* Store the offset to the first unabsorbed branch in
3722 jump[0], which is otherwise unused by the jump logic.
3723 We use this when dumping a trie and during optimisation. */
3725 trie->jump[0] = (U16)(nextbranch - convert);
3727 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3728 * and there is a bitmap
3729 * and the first "jump target" node we found leaves enough room
3730 * then convert the TRIE node into a TRIEC node, with the bitmap
3731 * embedded inline in the opcode - this is hypothetically faster.
3733 if ( !trie->states[trie->startstate].wordnum
3735 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3737 OP( convert ) = TRIEC;
3738 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3739 PerlMemShared_free(trie->bitmap);
3742 OP( convert ) = TRIE;
3744 /* store the type in the flags */
3745 convert->flags = nodetype;
3749 + regarglen[ OP( convert ) ];
3751 /* XXX We really should free up the resource in trie now,
3752 as we won't use them - (which resources?) dmq */
3754 /* needed for dumping*/
3755 DEBUG_r(if (optimize) {
3756 regnode *opt = convert;
3758 while ( ++opt < optimize) {
3759 Set_Node_Offset_Length(opt, 0, 0);
3762 Try to clean up some of the debris left after the
3765 while( optimize < jumper ) {
3766 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3767 OP( optimize ) = OPTIMIZED;
3768 Set_Node_Offset_Length(optimize, 0, 0);
3771 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3773 } /* end node insert */
3775 /* Finish populating the prev field of the wordinfo array. Walk back
3776 * from each accept state until we find another accept state, and if
3777 * so, point the first word's .prev field at the second word. If the
3778 * second already has a .prev field set, stop now. This will be the
3779 * case either if we've already processed that word's accept state,
3780 * or that state had multiple words, and the overspill words were
3781 * already linked up earlier.
3788 for (word=1; word <= trie->wordcount; word++) {
3790 if (trie->wordinfo[word].prev)
3792 state = trie->wordinfo[word].accept;
3794 state = prev_states[state];
3797 prev = trie->states[state].wordnum;
3801 trie->wordinfo[word].prev = prev;
3803 Safefree(prev_states);
3807 /* and now dump out the compressed format */
3808 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3810 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3812 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3813 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3815 SvREFCNT_dec_NN(revcharmap);
3819 : trie->startstate>1
3825 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3827 /* The Trie is constructed and compressed now so we can build a fail array if
3830 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3832 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3836 We find the fail state for each state in the trie, this state is the longest
3837 proper suffix of the current state's 'word' that is also a proper prefix of
3838 another word in our trie. State 1 represents the word '' and is thus the
3839 default fail state. This allows the DFA not to have to restart after its
3840 tried and failed a word at a given point, it simply continues as though it
3841 had been matching the other word in the first place.
3843 'abcdgu'=~/abcdefg|cdgu/
3844 When we get to 'd' we are still matching the first word, we would encounter
3845 'g' which would fail, which would bring us to the state representing 'd' in
3846 the second word where we would try 'g' and succeed, proceeding to match
3849 /* add a fail transition */
3850 const U32 trie_offset = ARG(source);
3851 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3853 const U32 ucharcount = trie->uniquecharcount;
3854 const U32 numstates = trie->statecount;
3855 const U32 ubound = trie->lasttrans + ucharcount;
3859 U32 base = trie->states[ 1 ].trans.base;
3862 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3864 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3866 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3867 PERL_UNUSED_CONTEXT;
3869 PERL_UNUSED_ARG(depth);
3872 if ( OP(source) == TRIE ) {
3873 struct regnode_1 *op = (struct regnode_1 *)
3874 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3875 StructCopy(source, op, struct regnode_1);
3876 stclass = (regnode *)op;
3878 struct regnode_charclass *op = (struct regnode_charclass *)
3879 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3880 StructCopy(source, op, struct regnode_charclass);
3881 stclass = (regnode *)op;
3883 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3885 ARG_SET( stclass, data_slot );
3886 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3887 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3888 aho->trie=trie_offset;
3889 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3890 Copy( trie->states, aho->states, numstates, reg_trie_state );
3891 Newx( q, numstates, U32);
3892 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3895 /* initialize fail[0..1] to be 1 so that we always have
3896 a valid final fail state */
3897 fail[ 0 ] = fail[ 1 ] = 1;
3899 for ( charid = 0; charid < ucharcount ; charid++ ) {
3900 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3902 q[ q_write ] = newstate;
3903 /* set to point at the root */
3904 fail[ q[ q_write++ ] ]=1;
3907 while ( q_read < q_write) {
3908 const U32 cur = q[ q_read++ % numstates ];
3909 base = trie->states[ cur ].trans.base;
3911 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3912 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3914 U32 fail_state = cur;
3917 fail_state = fail[ fail_state ];
3918 fail_base = aho->states[ fail_state ].trans.base;
3919 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3921 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3922 fail[ ch_state ] = fail_state;
3923 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3925 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3927 q[ q_write++ % numstates] = ch_state;
3931 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3932 when we fail in state 1, this allows us to use the
3933 charclass scan to find a valid start char. This is based on the principle
3934 that theres a good chance the string being searched contains lots of stuff
3935 that cant be a start char.
3937 fail[ 0 ] = fail[ 1 ] = 0;
3938 DEBUG_TRIE_COMPILE_r({
3939 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3940 depth, (UV)numstates
3942 for( q_read=1; q_read<numstates; q_read++ ) {
3943 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3945 Perl_re_printf( aTHX_ "\n");
3948 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3953 /* The below joins as many adjacent EXACTish nodes as possible into a single
3954 * one. The regop may be changed if the node(s) contain certain sequences that
3955 * require special handling. The joining is only done if:
3956 * 1) there is room in the current conglomerated node to entirely contain the
3958 * 2) they are compatible node types
3960 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3961 * these get optimized out
3963 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3964 * as possible, even if that means splitting an existing node so that its first
3965 * part is moved to the preceeding node. This would maximise the efficiency of
3966 * memEQ during matching.
3968 * If a node is to match under /i (folded), the number of characters it matches
3969 * can be different than its character length if it contains a multi-character
3970 * fold. *min_subtract is set to the total delta number of characters of the
3973 * And *unfolded_multi_char is set to indicate whether or not the node contains
3974 * an unfolded multi-char fold. This happens when it won't be known until
3975 * runtime whether the fold is valid or not; namely
3976 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3977 * target string being matched against turns out to be UTF-8 is that fold
3979 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3981 * (Multi-char folds whose components are all above the Latin1 range are not
3982 * run-time locale dependent, and have already been folded by the time this
3983 * function is called.)
3985 * This is as good a place as any to discuss the design of handling these
3986 * multi-character fold sequences. It's been wrong in Perl for a very long
3987 * time. There are three code points in Unicode whose multi-character folds
3988 * were long ago discovered to mess things up. The previous designs for
3989 * dealing with these involved assigning a special node for them. This
3990 * approach doesn't always work, as evidenced by this example:
3991 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3992 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3993 * would match just the \xDF, it won't be able to handle the case where a
3994 * successful match would have to cross the node's boundary. The new approach
3995 * that hopefully generally solves the problem generates an EXACTFUP node
3996 * that is "sss" in this case.
3998 * It turns out that there are problems with all multi-character folds, and not
3999 * just these three. Now the code is general, for all such cases. The
4000 * approach taken is:
4001 * 1) This routine examines each EXACTFish node that could contain multi-
4002 * character folded sequences. Since a single character can fold into
4003 * such a sequence, the minimum match length for this node is less than
4004 * the number of characters in the node. This routine returns in
4005 * *min_subtract how many characters to subtract from the the actual
4006 * length of the string to get a real minimum match length; it is 0 if
4007 * there are no multi-char foldeds. This delta is used by the caller to
4008 * adjust the min length of the match, and the delta between min and max,
4009 * so that the optimizer doesn't reject these possibilities based on size
4012 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4013 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4014 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4015 * EXACTFU nodes. The node type of such nodes is then changed to
4016 * EXACTFUP, indicating it is problematic, and needs careful handling.
4017 * (The procedures in step 1) above are sufficient to handle this case in
4018 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4019 * the only case where there is a possible fold length change in non-UTF-8
4020 * patterns. By reserving a special node type for problematic cases, the
4021 * far more common regular EXACTFU nodes can be processed faster.
4022 * regexec.c takes advantage of this.
4024 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4025 * problematic cases. These all only occur when the pattern is not
4026 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4027 * length change, it handles the situation where the string cannot be
4028 * entirely folded. The strings in an EXACTFish node are folded as much
4029 * as possible during compilation in regcomp.c. This saves effort in
4030 * regex matching. By using an EXACTFUP node when it is not possible to
4031 * fully fold at compile time, regexec.c can know that everything in an
4032 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4033 * case where folding in EXACTFU nodes can't be done at compile time is
4034 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4035 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4036 * handle two very different cases. Alternatively, there could have been
4037 * a node type where there are length changes, one for unfolded, and one
4038 * for both. If yet another special case needed to be created, the number
4039 * of required node types would have to go to 7. khw figures that even
4040 * though there are plenty of node types to spare, that the maintenance
4041 * cost wasn't worth the small speedup of doing it that way, especially
4042 * since he thinks the MICRO SIGN is rarely encountered in practice.
4044 * There are other cases where folding isn't done at compile time, but
4045 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4046 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4047 * changes. Some folds in EXACTF depend on if the runtime target string
4048 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4049 * when no fold in it depends on the UTF-8ness of the target string.)
4051 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4052 * validity of the fold won't be known until runtime, and so must remain
4053 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4054 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4055 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4056 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4057 * The reason this is a problem is that the optimizer part of regexec.c
4058 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4059 * that a character in the pattern corresponds to at most a single
4060 * character in the target string. (And I do mean character, and not byte
4061 * here, unlike other parts of the documentation that have never been
4062 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4063 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4064 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4065 * EXACTFL nodes, violate the assumption, and they are the only instances
4066 * where it is violated. I'm reluctant to try to change the assumption,
4067 * as the code involved is impenetrable to me (khw), so instead the code
4068 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4069 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4070 * boolean indicating whether or not the node contains such a fold. When
4071 * it is true, the caller sets a flag that later causes the optimizer in
4072 * this file to not set values for the floating and fixed string lengths,
4073 * and thus avoids the optimizer code in regexec.c that makes the invalid
4074 * assumption. Thus, there is no optimization based on string lengths for
4075 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4076 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4077 * assumption is wrong only in these cases is that all other non-UTF-8
4078 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4079 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4080 * EXACTF nodes because we don't know at compile time if it actually
4081 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4082 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4083 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4084 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4085 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4086 * string would require the pattern to be forced into UTF-8, the overhead
4087 * of which we want to avoid. Similarly the unfolded multi-char folds in
4088 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4091 * Similarly, the code that generates tries doesn't currently handle
4092 * not-already-folded multi-char folds, and it looks like a pain to change
4093 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4094 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4095 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4096 * using /iaa matching will be doing so almost entirely with ASCII
4097 * strings, so this should rarely be encountered in practice */
4100 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4101 UV *min_subtract, bool *unfolded_multi_char,
4102 U32 flags, regnode *val, U32 depth)
4104 /* Merge several consecutive EXACTish nodes into one. */
4106 regnode *n = regnext(scan);
4108 regnode *next = scan + NODE_SZ_STR(scan);
4112 regnode *stop = scan;
4113 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4115 PERL_UNUSED_ARG(depth);
4118 PERL_ARGS_ASSERT_JOIN_EXACT;
4119 #ifndef EXPERIMENTAL_INPLACESCAN
4120 PERL_UNUSED_ARG(flags);
4121 PERL_UNUSED_ARG(val);
4123 DEBUG_PEEP("join", scan, depth, 0);
4125 assert(PL_regkind[OP(scan)] == EXACT);
4127 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4128 * EXACT ones that are mergeable to the current one. */
4130 && ( PL_regkind[OP(n)] == NOTHING
4131 || (stringok && PL_regkind[OP(n)] == EXACT))
4133 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4136 if (OP(n) == TAIL || n > next)
4138 if (PL_regkind[OP(n)] == NOTHING) {
4139 DEBUG_PEEP("skip:", n, depth, 0);
4140 NEXT_OFF(scan) += NEXT_OFF(n);
4141 next = n + NODE_STEP_REGNODE;
4148 else if (stringok) {
4149 const unsigned int oldl = STR_LEN(scan);
4150 regnode * const nnext = regnext(n);
4152 /* XXX I (khw) kind of doubt that this works on platforms (should
4153 * Perl ever run on one) where U8_MAX is above 255 because of lots
4154 * of other assumptions */
4155 /* Don't join if the sum can't fit into a single node */
4156 if (oldl + STR_LEN(n) > U8_MAX)
4159 /* Joining something that requires UTF-8 with something that
4160 * doesn't, means the result requires UTF-8. */
4161 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4162 OP(scan) = EXACT_REQ8;
4164 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4165 ; /* join is compatible, no need to change OP */
4167 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4168 OP(scan) = EXACTFU_REQ8;
4170 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4171 ; /* join is compatible, no need to change OP */
4173 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4174 ; /* join is compatible, no need to change OP */
4176 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4178 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4179 * which can join with EXACTFU ones. We check for this case
4180 * here. These need to be resolved to either EXACTFU or
4181 * EXACTF at joining time. They have nothing in them that
4182 * would forbid them from being the more desirable EXACTFU
4183 * nodes except that they begin and/or end with a single [Ss].
4184 * The reason this is problematic is because they could be
4185 * joined in this loop with an adjacent node that ends and/or
4186 * begins with [Ss] which would then form the sequence 'ss',
4187 * which matches differently under /di than /ui, in which case
4188 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4189 * formed, the nodes get absorbed into any adjacent EXACTFU
4190 * node. And if the only adjacent node is EXACTF, they get
4191 * absorbed into that, under the theory that a longer node is
4192 * better than two shorter ones, even if one is EXACTFU. Note
4193 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4194 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4196 if (STRING(n)[STR_LEN(n)-1] == 's') {
4198 /* Here the joined node would end with 's'. If the node
4199 * following the combination is an EXACTF one, it's better to
4200 * join this trailing edge 's' node with that one, leaving the
4201 * current one in 'scan' be the more desirable EXACTFU */
4202 if (OP(nnext) == EXACTF) {
4206 OP(scan) = EXACTFU_S_EDGE;
4208 } /* Otherwise, the beginning 's' of the 2nd node just
4209 becomes an interior 's' in 'scan' */
4211 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4212 ; /* join is compatible, no need to change OP */
4214 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4216 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4217 * nodes. But the latter nodes can be also joined with EXACTFU
4218 * ones, and that is a better outcome, so if the node following
4219 * 'n' is EXACTFU, quit now so that those two can be joined
4221 if (OP(nnext) == EXACTFU) {
4225 /* The join is compatible, and the combined node will be
4226 * EXACTF. (These don't care if they begin or end with 's' */
4228 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4229 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4230 && STRING(n)[0] == 's')
4232 /* When combined, we have the sequence 'ss', which means we
4233 * have to remain /di */
4237 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4238 if (STRING(n)[0] == 's') {
4239 ; /* Here the join is compatible and the combined node
4240 starts with 's', no need to change OP */
4242 else { /* Now the trailing 's' is in the interior */
4246 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4248 /* The join is compatible, and the combined node will be
4249 * EXACTF. (These don't care if they begin or end with 's' */
4252 else if (OP(scan) != OP(n)) {
4254 /* The only other compatible joinings are the same node type */
4258 DEBUG_PEEP("merg", n, depth, 0);
4261 NEXT_OFF(scan) += NEXT_OFF(n);
4262 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4263 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4264 next = n + NODE_SZ_STR(n);
4265 /* Now we can overwrite *n : */
4266 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4274 #ifdef EXPERIMENTAL_INPLACESCAN
4275 if (flags && !NEXT_OFF(n)) {
4276 DEBUG_PEEP("atch", val, depth, 0);
4277 if (reg_off_by_arg[OP(n)]) {
4278 ARG_SET(n, val - n);
4281 NEXT_OFF(n) = val - n;
4288 /* This temporary node can now be turned into EXACTFU, and must, as
4289 * regexec.c doesn't handle it */
4290 if (OP(scan) == EXACTFU_S_EDGE) {
4295 *unfolded_multi_char = FALSE;
4297 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4298 * can now analyze for sequences of problematic code points. (Prior to
4299 * this final joining, sequences could have been split over boundaries, and
4300 * hence missed). The sequences only happen in folding, hence for any
4301 * non-EXACT EXACTish node */
4302 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4303 U8* s0 = (U8*) STRING(scan);
4305 U8* s_end = s0 + STR_LEN(scan);
4307 int total_count_delta = 0; /* Total delta number of characters that
4308 multi-char folds expand to */
4310 /* One pass is made over the node's string looking for all the
4311 * possibilities. To avoid some tests in the loop, there are two main
4312 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4317 if (OP(scan) == EXACTFL) {
4320 /* An EXACTFL node would already have been changed to another
4321 * node type unless there is at least one character in it that
4322 * is problematic; likely a character whose fold definition
4323 * won't be known until runtime, and so has yet to be folded.
4324 * For all but the UTF-8 locale, folds are 1-1 in length, but
4325 * to handle the UTF-8 case, we need to create a temporary
4326 * folded copy using UTF-8 locale rules in order to analyze it.
4327 * This is because our macros that look to see if a sequence is
4328 * a multi-char fold assume everything is folded (otherwise the
4329 * tests in those macros would be too complicated and slow).
4330 * Note that here, the non-problematic folds will have already
4331 * been done, so we can just copy such characters. We actually
4332 * don't completely fold the EXACTFL string. We skip the
4333 * unfolded multi-char folds, as that would just create work
4334 * below to figure out the size they already are */
4336 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4339 STRLEN s_len = UTF8SKIP(s);
4340 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4341 Copy(s, d, s_len, U8);
4344 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4345 *unfolded_multi_char = TRUE;
4346 Copy(s, d, s_len, U8);
4349 else if (isASCII(*s)) {
4350 *(d++) = toFOLD(*s);
4354 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4360 /* Point the remainder of the routine to look at our temporary
4364 } /* End of creating folded copy of EXACTFL string */
4366 /* Examine the string for a multi-character fold sequence. UTF-8
4367 * patterns have all characters pre-folded by the time this code is
4369 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4370 length sequence we are looking for is 2 */
4372 int count = 0; /* How many characters in a multi-char fold */
4373 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4374 if (! len) { /* Not a multi-char fold: get next char */
4379 { /* Here is a generic multi-char fold. */
4380 U8* multi_end = s + len;
4382 /* Count how many characters are in it. In the case of
4383 * /aa, no folds which contain ASCII code points are
4384 * allowed, so check for those, and skip if found. */
4385 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4386 count = utf8_length(s, multi_end);
4390 while (s < multi_end) {
4393 goto next_iteration;
4403 /* The delta is how long the sequence is minus 1 (1 is how long
4404 * the character that folds to the sequence is) */
4405 total_count_delta += count - 1;
4409 /* We created a temporary folded copy of the string in EXACTFL
4410 * nodes. Therefore we need to be sure it doesn't go below zero,
4411 * as the real string could be shorter */
4412 if (OP(scan) == EXACTFL) {
4413 int total_chars = utf8_length((U8*) STRING(scan),
4414 (U8*) STRING(scan) + STR_LEN(scan));
4415 if (total_count_delta > total_chars) {
4416 total_count_delta = total_chars;
4420 *min_subtract += total_count_delta;
4423 else if (OP(scan) == EXACTFAA) {
4425 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4426 * fold to the ASCII range (and there are no existing ones in the
4427 * upper latin1 range). But, as outlined in the comments preceding
4428 * this function, we need to flag any occurrences of the sharp s.
4429 * This character forbids trie formation (because of added
4431 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4432 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4433 || UNICODE_DOT_DOT_VERSION > 0)
4435 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4436 OP(scan) = EXACTFAA_NO_TRIE;
4437 *unfolded_multi_char = TRUE;
4443 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4445 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4446 * folds that are all Latin1. As explained in the comments
4447 * preceding this function, we look also for the sharp s in EXACTF
4448 * and EXACTFL nodes; it can be in the final position. Otherwise
4449 * we can stop looking 1 byte earlier because have to find at least
4450 * two characters for a multi-fold */
4451 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4456 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4457 if (! len) { /* Not a multi-char fold. */
4458 if (*s == LATIN_SMALL_LETTER_SHARP_S
4459 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4461 *unfolded_multi_char = TRUE;
4468 && isALPHA_FOLD_EQ(*s, 's')
4469 && isALPHA_FOLD_EQ(*(s+1), 's'))
4472 /* EXACTF nodes need to know that the minimum length
4473 * changed so that a sharp s in the string can match this
4474 * ss in the pattern, but they remain EXACTF nodes, as they
4475 * won't match this unless the target string is is UTF-8,
4476 * which we don't know until runtime. EXACTFL nodes can't
4477 * transform into EXACTFU nodes */
4478 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4479 OP(scan) = EXACTFUP;
4483 *min_subtract += len - 1;
4491 /* Allow dumping but overwriting the collection of skipped
4492 * ops and/or strings with fake optimized ops */
4493 n = scan + NODE_SZ_STR(scan);
4501 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4505 /* REx optimizer. Converts nodes into quicker variants "in place".
4506 Finds fixed substrings. */
4508 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4509 to the position after last scanned or to NULL. */
4511 #define INIT_AND_WITHP \
4512 assert(!and_withp); \
4513 Newx(and_withp, 1, regnode_ssc); \
4514 SAVEFREEPV(and_withp)
4518 S_unwind_scan_frames(pTHX_ const void *p)
4520 scan_frame *f= (scan_frame *)p;
4522 scan_frame *n= f->next_frame;
4528 /* the return from this sub is the minimum length that could possibly match */
4530 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4531 SSize_t *minlenp, SSize_t *deltap,
4536 regnode_ssc *and_withp,
4537 U32 flags, U32 depth)
4538 /* scanp: Start here (read-write). */
4539 /* deltap: Write maxlen-minlen here. */
4540 /* last: Stop before this one. */
4541 /* data: string data about the pattern */
4542 /* stopparen: treat close N as END */
4543 /* recursed: which subroutines have we recursed into */
4544 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4547 SSize_t final_minlen;
4548 /* There must be at least this number of characters to match */
4551 regnode *scan = *scanp, *next;
4553 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4554 int is_inf_internal = 0; /* The studied chunk is infinite */
4555 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4556 scan_data_t data_fake;
4557 SV *re_trie_maxbuff = NULL;
4558 regnode *first_non_open = scan;
4559 SSize_t stopmin = OPTIMIZE_INFTY;
4560 scan_frame *frame = NULL;
4561 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4563 PERL_ARGS_ASSERT_STUDY_CHUNK;
4564 RExC_study_started= 1;
4566 Zero(&data_fake, 1, scan_data_t);
4569 while (first_non_open && OP(first_non_open) == OPEN)
4570 first_non_open=regnext(first_non_open);
4576 RExC_study_chunk_recursed_count++;
4578 DEBUG_OPTIMISE_MORE_r(
4580 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4581 depth, (long)stopparen,
4582 (unsigned long)RExC_study_chunk_recursed_count,
4583 (unsigned long)depth, (unsigned long)recursed_depth,
4586 if (recursed_depth) {
4589 for ( j = 0 ; j < recursed_depth ; j++ ) {
4590 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4591 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4592 Perl_re_printf( aTHX_ " %d",(int)i);
4596 if ( j + 1 < recursed_depth ) {
4597 Perl_re_printf( aTHX_ ",");
4601 Perl_re_printf( aTHX_ "\n");
4604 while ( scan && OP(scan) != END && scan < last ){
4605 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4606 node length to get a real minimum (because
4607 the folded version may be shorter) */
4608 bool unfolded_multi_char = FALSE;
4609 /* Peephole optimizer: */
4610 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4611 DEBUG_PEEP("Peep", scan, depth, flags);
4614 /* The reason we do this here is that we need to deal with things like
4615 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4616 * parsing code, as each (?:..) is handled by a different invocation of
4619 if (PL_regkind[OP(scan)] == EXACT
4620 && OP(scan) != LEXACT
4621 && OP(scan) != LEXACT_REQ8
4624 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4625 0, NULL, depth + 1);
4628 /* Follow the next-chain of the current node and optimize
4629 away all the NOTHINGs from it. */
4630 if (OP(scan) != CURLYX) {
4631 const int max = (reg_off_by_arg[OP(scan)]
4633 /* I32 may be smaller than U16 on CRAYs! */
4634 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4635 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4639 /* Skip NOTHING and LONGJMP. */
4640 while ( (n = regnext(n))
4641 && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4642 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4643 && off + noff < max)
4645 if (reg_off_by_arg[OP(scan)])
4648 NEXT_OFF(scan) = off;
4651 /* The principal pseudo-switch. Cannot be a switch, since we look into
4652 * several different things. */
4653 if ( OP(scan) == DEFINEP ) {
4655 SSize_t deltanext = 0;
4656 SSize_t fake_last_close = 0;
4657 I32 f = SCF_IN_DEFINE;
4659 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4660 scan = regnext(scan);
4661 assert( OP(scan) == IFTHEN );
4662 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4664 data_fake.last_closep= &fake_last_close;
4666 next = regnext(scan);
4667 scan = NEXTOPER(NEXTOPER(scan));
4668 DEBUG_PEEP("scan", scan, depth, flags);
4669 DEBUG_PEEP("next", next, depth, flags);
4671 /* we suppose the run is continuous, last=next...
4672 * NOTE we dont use the return here! */
4673 /* DEFINEP study_chunk() recursion */
4674 (void)study_chunk(pRExC_state, &scan, &minlen,
4675 &deltanext, next, &data_fake, stopparen,
4676 recursed_depth, NULL, f, depth+1);
4681 OP(scan) == BRANCH ||
4682 OP(scan) == BRANCHJ ||
4685 next = regnext(scan);
4688 /* The op(next)==code check below is to see if we
4689 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4690 * IFTHEN is special as it might not appear in pairs.
4691 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4692 * we dont handle it cleanly. */
4693 if (OP(next) == code || code == IFTHEN) {
4694 /* NOTE - There is similar code to this block below for
4695 * handling TRIE nodes on a re-study. If you change stuff here
4696 * check there too. */
4697 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4699 regnode * const startbranch=scan;
4701 if (flags & SCF_DO_SUBSTR) {
4702 /* Cannot merge strings after this. */
4703 scan_commit(pRExC_state, data, minlenp, is_inf);
4706 if (flags & SCF_DO_STCLASS)
4707 ssc_init_zero(pRExC_state, &accum);
4709 while (OP(scan) == code) {
4710 SSize_t deltanext, minnext, fake;
4712 regnode_ssc this_class;
4714 DEBUG_PEEP("Branch", scan, depth, flags);
4717 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4719 data_fake.whilem_c = data->whilem_c;
4720 data_fake.last_closep = data->last_closep;
4723 data_fake.last_closep = &fake;
4725 data_fake.pos_delta = delta;
4726 next = regnext(scan);
4728 scan = NEXTOPER(scan); /* everything */
4729 if (code != BRANCH) /* everything but BRANCH */
4730 scan = NEXTOPER(scan);
4732 if (flags & SCF_DO_STCLASS) {
4733 ssc_init(pRExC_state, &this_class);
4734 data_fake.start_class = &this_class;
4735 f = SCF_DO_STCLASS_AND;
4737 if (flags & SCF_WHILEM_VISITED_POS)
4738 f |= SCF_WHILEM_VISITED_POS;
4740 /* we suppose the run is continuous, last=next...*/
4741 /* recurse study_chunk() for each BRANCH in an alternation */
4742 minnext = study_chunk(pRExC_state, &scan, minlenp,
4743 &deltanext, next, &data_fake, stopparen,
4744 recursed_depth, NULL, f, depth+1);
4748 if (deltanext == OPTIMIZE_INFTY) {
4749 is_inf = is_inf_internal = 1;
4750 max1 = OPTIMIZE_INFTY;
4751 } else if (max1 < minnext + deltanext)
4752 max1 = minnext + deltanext;
4754 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4756 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4757 if ( stopmin > minnext)
4758 stopmin = min + min1;
4759 flags &= ~SCF_DO_SUBSTR;
4761 data->flags |= SCF_SEEN_ACCEPT;
4764 if (data_fake.flags & SF_HAS_EVAL)
4765 data->flags |= SF_HAS_EVAL;
4766 data->whilem_c = data_fake.whilem_c;
4768 if (flags & SCF_DO_STCLASS)
4769 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4771 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4773 if (flags & SCF_DO_SUBSTR) {
4774 data->pos_min += min1;
4775 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4776 data->pos_delta = OPTIMIZE_INFTY;
4778 data->pos_delta += max1 - min1;
4779 if (max1 != min1 || is_inf)
4780 data->cur_is_floating = 1;
4783 if (delta == OPTIMIZE_INFTY
4784 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4785 delta = OPTIMIZE_INFTY;
4787 delta += max1 - min1;
4788 if (flags & SCF_DO_STCLASS_OR) {
4789 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4791 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4792 flags &= ~SCF_DO_STCLASS;
4795 else if (flags & SCF_DO_STCLASS_AND) {
4797 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4798 flags &= ~SCF_DO_STCLASS;
4801 /* Switch to OR mode: cache the old value of
4802 * data->start_class */
4804 StructCopy(data->start_class, and_withp, regnode_ssc);
4805 flags &= ~SCF_DO_STCLASS_AND;
4806 StructCopy(&accum, data->start_class, regnode_ssc);
4807 flags |= SCF_DO_STCLASS_OR;
4811 if (PERL_ENABLE_TRIE_OPTIMISATION
4812 && OP(startbranch) == BRANCH
4817 Assuming this was/is a branch we are dealing with: 'scan'
4818 now points at the item that follows the branch sequence,
4819 whatever it is. We now start at the beginning of the
4820 sequence and look for subsequences of
4826 which would be constructed from a pattern like
4829 If we can find such a subsequence we need to turn the first
4830 element into a trie and then add the subsequent branch exact
4831 strings to the trie.
4835 1. patterns where the whole set of branches can be
4838 2. patterns where only a subset can be converted.
4840 In case 1 we can replace the whole set with a single regop
4841 for the trie. In case 2 we need to keep the start and end
4844 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4845 becomes BRANCH TRIE; BRANCH X;
4847 There is an additional case, that being where there is a
4848 common prefix, which gets split out into an EXACT like node
4849 preceding the TRIE node.
4851 If x(1..n)==tail then we can do a simple trie, if not we make
4852 a "jump" trie, such that when we match the appropriate word
4853 we "jump" to the appropriate tail node. Essentially we turn
4854 a nested if into a case structure of sorts.
4859 if (!re_trie_maxbuff) {
4860 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4861 if (!SvIOK(re_trie_maxbuff))
4862 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4864 if ( SvIV(re_trie_maxbuff)>=0 ) {
4866 regnode *first = (regnode *)NULL;
4867 regnode *prev = (regnode *)NULL;
4868 regnode *tail = scan;
4872 /* var tail is used because there may be a TAIL
4873 regop in the way. Ie, the exacts will point to the
4874 thing following the TAIL, but the last branch will
4875 point at the TAIL. So we advance tail. If we
4876 have nested (?:) we may have to move through several
4880 while ( OP( tail ) == TAIL ) {
4881 /* this is the TAIL generated by (?:) */
4882 tail = regnext( tail );
4886 DEBUG_TRIE_COMPILE_r({
4887 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4888 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4890 "Looking for TRIE'able sequences. Tail node is ",
4891 (UV) REGNODE_OFFSET(tail),
4892 SvPV_nolen_const( RExC_mysv )
4898 Step through the branches
4899 cur represents each branch,
4900 noper is the first thing to be matched as part
4902 noper_next is the regnext() of that node.
4904 We normally handle a case like this
4905 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4906 support building with NOJUMPTRIE, which restricts
4907 the trie logic to structures like /FOO|BAR/.
4909 If noper is a trieable nodetype then the branch is
4910 a possible optimization target. If we are building
4911 under NOJUMPTRIE then we require that noper_next is
4912 the same as scan (our current position in the regex
4915 Once we have two or more consecutive such branches
4916 we can create a trie of the EXACT's contents and
4917 stitch it in place into the program.
4919 If the sequence represents all of the branches in
4920 the alternation we replace the entire thing with a
4923 Otherwise when it is a subsequence we need to
4924 stitch it in place and replace only the relevant
4925 branches. This means the first branch has to remain
4926 as it is used by the alternation logic, and its
4927 next pointer, and needs to be repointed at the item
4928 on the branch chain following the last branch we
4929 have optimized away.
4931 This could be either a BRANCH, in which case the
4932 subsequence is internal, or it could be the item
4933 following the branch sequence in which case the
4934 subsequence is at the end (which does not
4935 necessarily mean the first node is the start of the
4938 TRIE_TYPE(X) is a define which maps the optype to a
4942 ----------------+-----------
4947 EXACTFU_REQ8 | EXACTFU
4951 EXACTFLU8 | EXACTFLU8
4955 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4957 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4959 : ( EXACTFU == (X) \
4960 || EXACTFU_REQ8 == (X) \
4961 || EXACTFUP == (X) ) \
4963 : ( EXACTFAA == (X) ) \
4965 : ( EXACTL == (X) ) \
4967 : ( EXACTFLU8 == (X) ) \
4971 /* dont use tail as the end marker for this traverse */
4972 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4973 regnode * const noper = NEXTOPER( cur );
4974 U8 noper_type = OP( noper );
4975 U8 noper_trietype = TRIE_TYPE( noper_type );
4976 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4977 regnode * const noper_next = regnext( noper );
4978 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4979 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4982 DEBUG_TRIE_COMPILE_r({
4983 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4984 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4986 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4988 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4989 Perl_re_printf( aTHX_ " -> %d:%s",
4990 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4993 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4994 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4995 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4997 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4998 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
4999 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5003 /* Is noper a trieable nodetype that can be merged
5004 * with the current trie (if there is one)? */
5008 ( noper_trietype == NOTHING )
5009 || ( trietype == NOTHING )
5010 || ( trietype == noper_trietype )
5013 && noper_next >= tail
5017 /* Handle mergable triable node Either we are
5018 * the first node in a new trieable sequence,
5019 * in which case we do some bookkeeping,
5020 * otherwise we update the end pointer. */
5023 if ( noper_trietype == NOTHING ) {
5024 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5025 regnode * const noper_next = regnext( noper );
5026 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5027 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5030 if ( noper_next_trietype ) {
5031 trietype = noper_next_trietype;
5032 } else if (noper_next_type) {
5033 /* a NOTHING regop is 1 regop wide.
5034 * We need at least two for a trie
5035 * so we can't merge this in */
5039 trietype = noper_trietype;
5042 if ( trietype == NOTHING )
5043 trietype = noper_trietype;
5048 } /* end handle mergable triable node */
5050 /* handle unmergable node -
5051 * noper may either be a triable node which can
5052 * not be tried together with the current trie,
5053 * or a non triable node */
5055 /* If last is set and trietype is not
5056 * NOTHING then we have found at least two
5057 * triable branch sequences in a row of a
5058 * similar trietype so we can turn them
5059 * into a trie. If/when we allow NOTHING to
5060 * start a trie sequence this condition
5061 * will be required, and it isn't expensive
5062 * so we leave it in for now. */
5063 if ( trietype && trietype != NOTHING )
5064 make_trie( pRExC_state,
5065 startbranch, first, cur, tail,
5066 count, trietype, depth+1 );
5067 prev = NULL; /* note: we clear/update
5068 first, trietype etc below,
5069 so we dont do it here */
5073 && noper_next >= tail
5076 /* noper is triable, so we can start a new
5080 trietype = noper_trietype;
5082 /* if we already saw a first but the
5083 * current node is not triable then we have
5084 * to reset the first information. */
5089 } /* end handle unmergable node */
5090 } /* loop over branches */
5091 DEBUG_TRIE_COMPILE_r({
5092 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5093 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5094 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5095 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5096 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5097 PL_reg_name[trietype]
5101 if ( prev && trietype ) {
5102 if ( trietype != NOTHING ) {
5103 /* the last branch of the sequence was part of
5104 * a trie, so we have to construct it here
5105 * outside of the loop */
5106 made= make_trie( pRExC_state, startbranch,
5107 first, scan, tail, count,
5108 trietype, depth+1 );
5109 #ifdef TRIE_STUDY_OPT
5110 if ( ((made == MADE_EXACT_TRIE &&
5111 startbranch == first)
5112 || ( first_non_open == first )) &&
5114 flags |= SCF_TRIE_RESTUDY;
5115 if ( startbranch == first
5118 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5123 /* at this point we know whatever we have is a
5124 * NOTHING sequence/branch AND if 'startbranch'
5125 * is 'first' then we can turn the whole thing
5128 if ( startbranch == first ) {
5130 /* the entire thing is a NOTHING sequence,
5131 * something like this: (?:|) So we can
5132 * turn it into a plain NOTHING op. */
5133 DEBUG_TRIE_COMPILE_r({
5134 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5135 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5137 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5140 OP(startbranch)= NOTHING;
5141 NEXT_OFF(startbranch)= tail - startbranch;
5142 for ( opt= startbranch + 1; opt < tail ; opt++ )
5146 } /* end if ( prev) */
5147 } /* TRIE_MAXBUF is non zero */
5151 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5152 scan = NEXTOPER(NEXTOPER(scan));
5153 } else /* single branch is optimized. */
5154 scan = NEXTOPER(scan);
5156 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5158 regnode *start = NULL;
5159 regnode *end = NULL;
5160 U32 my_recursed_depth= recursed_depth;
5162 if (OP(scan) != SUSPEND) { /* GOSUB */
5163 /* Do setup, note this code has side effects beyond
5164 * the rest of this block. Specifically setting
5165 * RExC_recurse[] must happen at least once during
5168 RExC_recurse[ARG2L(scan)] = scan;
5169 start = REGNODE_p(RExC_open_parens[paren]);
5170 end = REGNODE_p(RExC_close_parens[paren]);
5172 /* NOTE we MUST always execute the above code, even
5173 * if we do nothing with a GOSUB */
5175 ( flags & SCF_IN_DEFINE )
5178 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5180 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5183 /* no need to do anything here if we are in a define. */
5184 /* or we are after some kind of infinite construct
5185 * so we can skip recursing into this item.
5186 * Since it is infinite we will not change the maxlen
5187 * or delta, and if we miss something that might raise
5188 * the minlen it will merely pessimise a little.
5190 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5191 * might result in a minlen of 1 and not of 4,
5192 * but this doesn't make us mismatch, just try a bit
5193 * harder than we should.
5195 scan= regnext(scan);
5201 || !PAREN_TEST(recursed_depth - 1, paren)
5203 /* it is quite possible that there are more efficient ways
5204 * to do this. We maintain a bitmap per level of recursion
5205 * of which patterns we have entered so we can detect if a
5206 * pattern creates a possible infinite loop. When we
5207 * recurse down a level we copy the previous levels bitmap
5208 * down. When we are at recursion level 0 we zero the top
5209 * level bitmap. It would be nice to implement a different
5210 * more efficient way of doing this. In particular the top
5211 * level bitmap may be unnecessary.
5213 if (!recursed_depth) {
5214 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5216 Copy(PAREN_OFFSET(recursed_depth - 1),
5217 PAREN_OFFSET(recursed_depth),
5218 RExC_study_chunk_recursed_bytes, U8);
5220 /* we havent recursed into this paren yet, so recurse into it */
5221 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5222 PAREN_SET(recursed_depth, paren);
5223 my_recursed_depth= recursed_depth + 1;
5225 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5226 /* some form of infinite recursion, assume infinite length
5228 if (flags & SCF_DO_SUBSTR) {
5229 scan_commit(pRExC_state, data, minlenp, is_inf);
5230 data->cur_is_floating = 1;
5232 is_inf = is_inf_internal = 1;
5233 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5234 ssc_anything(data->start_class);
5235 flags &= ~SCF_DO_STCLASS;
5237 start= NULL; /* reset start so we dont recurse later on. */
5242 end = regnext(scan);
5245 scan_frame *newframe;
5247 if (!RExC_frame_last) {
5248 Newxz(newframe, 1, scan_frame);
5249 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5250 RExC_frame_head= newframe;
5252 } else if (!RExC_frame_last->next_frame) {
5253 Newxz(newframe, 1, scan_frame);
5254 RExC_frame_last->next_frame= newframe;
5255 newframe->prev_frame= RExC_frame_last;
5258 newframe= RExC_frame_last->next_frame;
5260 RExC_frame_last= newframe;
5262 newframe->next_regnode = regnext(scan);
5263 newframe->last_regnode = last;
5264 newframe->stopparen = stopparen;
5265 newframe->prev_recursed_depth = recursed_depth;
5266 newframe->this_prev_frame= frame;
5268 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5269 DEBUG_PEEP("fnew", scan, depth, flags);
5276 recursed_depth= my_recursed_depth;
5281 else if ( OP(scan) == EXACT
5282 || OP(scan) == LEXACT
5283 || OP(scan) == EXACT_REQ8
5284 || OP(scan) == LEXACT_REQ8
5285 || OP(scan) == EXACTL)
5287 SSize_t bytelen = STR_LEN(scan), charlen;
5291 const U8 * const s = (U8*)STRING(scan);
5292 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5293 charlen = utf8_length(s, s + bytelen);
5295 uc = *((U8*)STRING(scan));
5299 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5300 /* The code below prefers earlier match for fixed
5301 offset, later match for variable offset. */
5302 if (data->last_end == -1) { /* Update the start info. */
5303 data->last_start_min = data->pos_min;
5304 data->last_start_max = is_inf
5305 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5307 sv_catpvn(data->last_found, STRING(scan), bytelen);
5309 SvUTF8_on(data->last_found);
5311 SV * const sv = data->last_found;
5312 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5313 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5314 if (mg && mg->mg_len >= 0)
5315 mg->mg_len += charlen;
5317 data->last_end = data->pos_min + charlen;
5318 data->pos_min += charlen; /* As in the first entry. */
5319 data->flags &= ~SF_BEFORE_EOL;
5322 /* ANDing the code point leaves at most it, and not in locale, and
5323 * can't match null string */
5324 if (flags & SCF_DO_STCLASS_AND) {
5325 ssc_cp_and(data->start_class, uc);
5326 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5327 ssc_clear_locale(data->start_class);
5329 else if (flags & SCF_DO_STCLASS_OR) {
5330 ssc_add_cp(data->start_class, uc);
5331 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5333 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5334 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5336 flags &= ~SCF_DO_STCLASS;
5338 else if (PL_regkind[OP(scan)] == EXACT) {
5339 /* But OP != EXACT!, so is EXACTFish */
5340 SSize_t bytelen = STR_LEN(scan), charlen;
5341 const U8 * s = (U8*)STRING(scan);
5343 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5344 * with the mask set to the complement of the bit that differs
5345 * between upper and lower case, and the lowest code point of the
5346 * pair (which the '&' forces) */
5349 && ( OP(scan) == EXACTFAA
5350 || ( OP(scan) == EXACTFU
5351 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5354 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5357 ARG_SET(scan, *s & mask);
5359 /* we're not EXACTFish any more, so restudy */
5363 /* Search for fixed substrings supports EXACT only. */
5364 if (flags & SCF_DO_SUBSTR) {
5366 scan_commit(pRExC_state, data, minlenp, is_inf);
5368 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5369 if (unfolded_multi_char) {
5370 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5372 min += charlen - min_subtract;
5374 delta += min_subtract;
5375 if (flags & SCF_DO_SUBSTR) {
5376 data->pos_min += charlen - min_subtract;
5377 if (data->pos_min < 0) {
5380 data->pos_delta += min_subtract;
5382 data->cur_is_floating = 1; /* float */
5386 if (flags & SCF_DO_STCLASS) {
5387 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5389 assert(EXACTF_invlist);
5390 if (flags & SCF_DO_STCLASS_AND) {
5391 if (OP(scan) != EXACTFL)
5392 ssc_clear_locale(data->start_class);
5393 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5394 ANYOF_POSIXL_ZERO(data->start_class);
5395 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5397 else { /* SCF_DO_STCLASS_OR */
5398 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5399 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5401 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5402 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5404 flags &= ~SCF_DO_STCLASS;
5405 SvREFCNT_dec(EXACTF_invlist);
5408 else if (REGNODE_VARIES(OP(scan))) {
5409 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5410 I32 fl = 0, f = flags;
5411 regnode * const oscan = scan;
5412 regnode_ssc this_class;
5413 regnode_ssc *oclass = NULL;
5414 I32 next_is_eval = 0;
5416 switch (PL_regkind[OP(scan)]) {
5417 case WHILEM: /* End of (?:...)* . */
5418 scan = NEXTOPER(scan);
5421 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5422 next = NEXTOPER(scan);
5423 if ( OP(next) == EXACT
5424 || OP(next) == LEXACT
5425 || OP(next) == EXACT_REQ8
5426 || OP(next) == LEXACT_REQ8
5427 || OP(next) == EXACTL
5428 || (flags & SCF_DO_STCLASS))
5431 maxcount = REG_INFTY;
5432 next = regnext(scan);
5433 scan = NEXTOPER(scan);
5437 if (flags & SCF_DO_SUBSTR)
5442 next = NEXTOPER(scan);
5444 /* This temporary node can now be turned into EXACTFU, and
5445 * must, as regexec.c doesn't handle it */
5446 if (OP(next) == EXACTFU_S_EDGE && !frame) {
5450 if ( STR_LEN(next) == 1
5451 && isALPHA_A(* STRING(next))
5452 && ( OP(next) == EXACTFAA
5453 || ( OP(next) == EXACTFU
5454 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5457 /* These differ in just one bit */
5458 U8 mask = ~ ('A' ^ 'a');
5460 assert(isALPHA_A(* STRING(next)));
5462 /* Then replace it by an ANYOFM node, with
5463 * the mask set to the complement of the
5464 * bit that differs between upper and lower
5465 * case, and the lowest code point of the
5466 * pair (which the '&' forces) */
5468 ARG_SET(next, *STRING(next) & mask);
5472 if (flags & SCF_DO_STCLASS) {
5474 maxcount = REG_INFTY;
5475 next = regnext(scan);
5476 scan = NEXTOPER(scan);
5479 if (flags & SCF_DO_SUBSTR) {
5480 scan_commit(pRExC_state, data, minlenp, is_inf);
5481 /* Cannot extend fixed substrings */
5482 data->cur_is_floating = 1; /* float */
5484 is_inf = is_inf_internal = 1;
5485 scan = regnext(scan);
5486 goto optimize_curly_tail;
5488 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5489 && (scan->flags == stopparen))
5494 mincount = ARG1(scan);
5495 maxcount = ARG2(scan);
5497 next = regnext(scan);
5498 if (OP(scan) == CURLYX) {
5499 I32 lp = (data ? *(data->last_closep) : 0);
5500 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5502 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5503 next_is_eval = (OP(scan) == EVAL);
5505 if (flags & SCF_DO_SUBSTR) {
5507 scan_commit(pRExC_state, data, minlenp, is_inf);
5508 /* Cannot extend fixed substrings */
5509 pos_before = data->pos_min;
5513 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5515 data->flags |= SF_IS_INF;
5517 if (flags & SCF_DO_STCLASS) {
5518 ssc_init(pRExC_state, &this_class);
5519 oclass = data->start_class;
5520 data->start_class = &this_class;
5521 f |= SCF_DO_STCLASS_AND;
5522 f &= ~SCF_DO_STCLASS_OR;
5524 /* Exclude from super-linear cache processing any {n,m}
5525 regops for which the combination of input pos and regex
5526 pos is not enough information to determine if a match
5529 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5530 regex pos at the \s*, the prospects for a match depend not
5531 only on the input position but also on how many (bar\s*)
5532 repeats into the {4,8} we are. */
5533 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5534 f &= ~SCF_WHILEM_VISITED_POS;
5536 /* This will finish on WHILEM, setting scan, or on NULL: */
5537 /* recurse study_chunk() on loop bodies */
5538 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5539 last, data, stopparen, recursed_depth, NULL,
5541 ? (f & ~SCF_DO_SUBSTR)
5545 if (flags & SCF_DO_STCLASS)
5546 data->start_class = oclass;
5547 if (mincount == 0 || minnext == 0) {
5548 if (flags & SCF_DO_STCLASS_OR) {
5549 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5551 else if (flags & SCF_DO_STCLASS_AND) {
5552 /* Switch to OR mode: cache the old value of
5553 * data->start_class */
5555 StructCopy(data->start_class, and_withp, regnode_ssc);
5556 flags &= ~SCF_DO_STCLASS_AND;
5557 StructCopy(&this_class, data->start_class, regnode_ssc);
5558 flags |= SCF_DO_STCLASS_OR;
5559 ANYOF_FLAGS(data->start_class)
5560 |= SSC_MATCHES_EMPTY_STRING;
5562 } else { /* Non-zero len */
5563 if (flags & SCF_DO_STCLASS_OR) {
5564 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5565 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5567 else if (flags & SCF_DO_STCLASS_AND)
5568 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5569 flags &= ~SCF_DO_STCLASS;
5571 if (!scan) /* It was not CURLYX, but CURLY. */
5573 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5574 /* ? quantifier ok, except for (?{ ... }) */
5575 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5576 && (minnext == 0) && (deltanext == 0)
5577 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5578 && maxcount <= REG_INFTY/3) /* Complement check for big
5581 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5582 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5583 "Quantifier unexpected on zero-length expression "
5584 "in regex m/%" UTF8f "/",
5585 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5589 min += minnext * mincount;
5590 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5591 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5592 is_inf |= is_inf_internal;
5594 delta = OPTIMIZE_INFTY;
5596 delta += (minnext + deltanext) * maxcount
5597 - minnext * mincount;
5599 /* Try powerful optimization CURLYX => CURLYN. */
5600 if ( OP(oscan) == CURLYX && data
5601 && data->flags & SF_IN_PAR
5602 && !(data->flags & SF_HAS_EVAL)
5603 && !deltanext && minnext == 1
5606 /* Try to optimize to CURLYN. */
5607 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5608 regnode * const nxt1 = nxt;
5615 if (!REGNODE_SIMPLE(OP(nxt))
5616 && !(PL_regkind[OP(nxt)] == EXACT
5617 && STR_LEN(nxt) == 1))
5623 if (OP(nxt) != CLOSE)
5625 if (RExC_open_parens) {
5628 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5631 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5633 /* Now we know that nxt2 is the only contents: */
5634 oscan->flags = (U8)ARG(nxt);
5636 OP(nxt1) = NOTHING; /* was OPEN. */
5639 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5640 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5641 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5642 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5643 OP(nxt + 1) = OPTIMIZED; /* was count. */
5644 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5649 /* Try optimization CURLYX => CURLYM. */
5650 if ( OP(oscan) == CURLYX && data
5651 && !(data->flags & SF_HAS_PAR)
5652 && !(data->flags & SF_HAS_EVAL)
5653 && !deltanext /* atom is fixed width */
5654 && minnext != 0 /* CURLYM can't handle zero width */
5655 /* Nor characters whose fold at run-time may be
5656 * multi-character */
5657 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5660 /* XXXX How to optimize if data == 0? */
5661 /* Optimize to a simpler form. */
5662 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5666 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5667 && (OP(nxt2) != WHILEM))
5669 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5670 /* Need to optimize away parenths. */
5671 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5672 /* Set the parenth number. */
5673 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5675 oscan->flags = (U8)ARG(nxt);
5676 if (RExC_open_parens) {
5678 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5681 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5684 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5685 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5688 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5689 OP(nxt + 1) = OPTIMIZED; /* was count. */
5690 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5691 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5694 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5695 regnode *nnxt = regnext(nxt1);
5697 if (reg_off_by_arg[OP(nxt1)])
5698 ARG_SET(nxt1, nxt2 - nxt1);
5699 else if (nxt2 - nxt1 < U16_MAX)
5700 NEXT_OFF(nxt1) = nxt2 - nxt1;
5702 OP(nxt) = NOTHING; /* Cannot beautify */
5707 /* Optimize again: */
5708 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5709 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5710 NULL, stopparen, recursed_depth, NULL, 0,
5716 else if ((OP(oscan) == CURLYX)
5717 && (flags & SCF_WHILEM_VISITED_POS)
5718 /* See the comment on a similar expression above.
5719 However, this time it's not a subexpression
5720 we care about, but the expression itself. */
5721 && (maxcount == REG_INFTY)
5723 /* This stays as CURLYX, we can put the count/of pair. */
5724 /* Find WHILEM (as in regexec.c) */
5725 regnode *nxt = oscan + NEXT_OFF(oscan);
5727 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5729 nxt = PREVOPER(nxt);
5730 if (nxt->flags & 0xf) {
5731 /* we've already set whilem count on this node */
5732 } else if (++data->whilem_c < 16) {
5733 assert(data->whilem_c <= RExC_whilem_seen);
5734 nxt->flags = (U8)(data->whilem_c
5735 | (RExC_whilem_seen << 4)); /* On WHILEM */
5738 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5740 if (flags & SCF_DO_SUBSTR) {
5741 SV *last_str = NULL;
5742 STRLEN last_chrs = 0;
5743 int counted = mincount != 0;
5745 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5747 SSize_t b = pos_before >= data->last_start_min
5748 ? pos_before : data->last_start_min;
5750 const char * const s = SvPV_const(data->last_found, l);
5751 SSize_t old = b - data->last_start_min;
5755 old = utf8_hop_forward((U8*)s, old,
5756 (U8 *) SvEND(data->last_found))
5759 /* Get the added string: */
5760 last_str = newSVpvn_utf8(s + old, l, UTF);
5761 last_chrs = UTF ? utf8_length((U8*)(s + old),
5762 (U8*)(s + old + l)) : l;
5763 if (deltanext == 0 && pos_before == b) {
5764 /* What was added is a constant string */
5767 SvGROW(last_str, (mincount * l) + 1);
5768 repeatcpy(SvPVX(last_str) + l,
5769 SvPVX_const(last_str), l,
5771 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5772 /* Add additional parts. */
5773 SvCUR_set(data->last_found,
5774 SvCUR(data->last_found) - l);
5775 sv_catsv(data->last_found, last_str);
5777 SV * sv = data->last_found;
5779 SvUTF8(sv) && SvMAGICAL(sv) ?
5780 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5781 if (mg && mg->mg_len >= 0)
5782 mg->mg_len += last_chrs * (mincount-1);
5784 last_chrs *= mincount;
5785 data->last_end += l * (mincount - 1);
5788 /* start offset must point into the last copy */
5789 data->last_start_min += minnext * (mincount - 1);
5790 data->last_start_max =
5793 : data->last_start_max +
5794 (maxcount - 1) * (minnext + data->pos_delta);
5797 /* It is counted once already... */
5798 data->pos_min += minnext * (mincount - counted);
5800 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5801 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5802 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5803 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5805 if (deltanext != OPTIMIZE_INFTY)
5806 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5807 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5808 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5810 if (deltanext == OPTIMIZE_INFTY
5811 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5812 data->pos_delta = OPTIMIZE_INFTY;
5814 data->pos_delta += - counted * deltanext +
5815 (minnext + deltanext) * maxcount - minnext * mincount;
5816 if (mincount != maxcount) {
5817 /* Cannot extend fixed substrings found inside
5819 scan_commit(pRExC_state, data, minlenp, is_inf);
5820 if (mincount && last_str) {
5821 SV * const sv = data->last_found;
5822 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5823 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5827 sv_setsv(sv, last_str);
5828 data->last_end = data->pos_min;
5829 data->last_start_min = data->pos_min - last_chrs;
5830 data->last_start_max = is_inf
5832 : data->pos_min + data->pos_delta - last_chrs;
5834 data->cur_is_floating = 1; /* float */
5836 SvREFCNT_dec(last_str);
5838 if (data && (fl & SF_HAS_EVAL))
5839 data->flags |= SF_HAS_EVAL;
5840 optimize_curly_tail:
5841 if (OP(oscan) != CURLYX) {
5842 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5844 NEXT_OFF(oscan) += NEXT_OFF(next);
5849 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5853 if (flags & SCF_DO_SUBSTR) {
5854 /* Cannot expect anything... */
5855 scan_commit(pRExC_state, data, minlenp, is_inf);
5856 data->cur_is_floating = 1; /* float */
5858 is_inf = is_inf_internal = 1;
5859 if (flags & SCF_DO_STCLASS_OR) {
5860 if (OP(scan) == CLUMP) {
5861 /* Actually is any start char, but very few code points
5862 * aren't start characters */
5863 ssc_match_all_cp(data->start_class);
5866 ssc_anything(data->start_class);
5869 flags &= ~SCF_DO_STCLASS;
5873 else if (OP(scan) == LNBREAK) {
5874 if (flags & SCF_DO_STCLASS) {
5875 if (flags & SCF_DO_STCLASS_AND) {
5876 ssc_intersection(data->start_class,
5877 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5878 ssc_clear_locale(data->start_class);
5879 ANYOF_FLAGS(data->start_class)
5880 &= ~SSC_MATCHES_EMPTY_STRING;
5882 else if (flags & SCF_DO_STCLASS_OR) {
5883 ssc_union(data->start_class,
5884 PL_XPosix_ptrs[_CC_VERTSPACE],
5886 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5888 /* See commit msg for
5889 * 749e076fceedeb708a624933726e7989f2302f6a */
5890 ANYOF_FLAGS(data->start_class)
5891 &= ~SSC_MATCHES_EMPTY_STRING;
5893 flags &= ~SCF_DO_STCLASS;
5896 if (delta != OPTIMIZE_INFTY)
5897 delta++; /* Because of the 2 char string cr-lf */
5898 if (flags & SCF_DO_SUBSTR) {
5899 /* Cannot expect anything... */
5900 scan_commit(pRExC_state, data, minlenp, is_inf);
5902 if (data->pos_delta != OPTIMIZE_INFTY) {
5903 data->pos_delta += 1;
5905 data->cur_is_floating = 1; /* float */
5908 else if (REGNODE_SIMPLE(OP(scan))) {
5910 if (flags & SCF_DO_SUBSTR) {
5911 scan_commit(pRExC_state, data, minlenp, is_inf);
5915 if (flags & SCF_DO_STCLASS) {
5917 SV* my_invlist = NULL;
5920 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5921 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5923 /* Some of the logic below assumes that switching
5924 locale on will only add false positives. */
5929 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5933 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5934 ssc_match_all_cp(data->start_class);
5939 SV* REG_ANY_invlist = _new_invlist(2);
5940 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5942 if (flags & SCF_DO_STCLASS_OR) {
5943 ssc_union(data->start_class,
5945 TRUE /* TRUE => invert, hence all but \n
5949 else if (flags & SCF_DO_STCLASS_AND) {
5950 ssc_intersection(data->start_class,
5952 TRUE /* TRUE => invert */
5954 ssc_clear_locale(data->start_class);
5956 SvREFCNT_dec_NN(REG_ANY_invlist);
5968 if (flags & SCF_DO_STCLASS_AND)
5969 ssc_and(pRExC_state, data->start_class,
5970 (regnode_charclass *) scan);
5972 ssc_or(pRExC_state, data->start_class,
5973 (regnode_charclass *) scan);
5976 case NANYOFM: /* NANYOFM already contains the inversion of the
5977 input ANYOF data, so, unlike things like
5978 NPOSIXA, don't change 'invert' to TRUE */
5982 SV* cp_list = get_ANYOFM_contents(scan);
5984 if (flags & SCF_DO_STCLASS_OR) {
5985 ssc_union(data->start_class, cp_list, invert);
5987 else if (flags & SCF_DO_STCLASS_AND) {
5988 ssc_intersection(data->start_class, cp_list, invert);
5991 SvREFCNT_dec_NN(cp_list);
6000 cp_list = _add_range_to_invlist(cp_list,
6002 ANYOFRbase(scan) + ANYOFRdelta(scan));
6004 if (flags & SCF_DO_STCLASS_OR) {
6005 ssc_union(data->start_class, cp_list, invert);
6007 else if (flags & SCF_DO_STCLASS_AND) {
6008 ssc_intersection(data->start_class, cp_list, invert);
6011 SvREFCNT_dec_NN(cp_list);
6020 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6021 if (flags & SCF_DO_STCLASS_AND) {
6022 bool was_there = cBOOL(
6023 ANYOF_POSIXL_TEST(data->start_class,
6025 ANYOF_POSIXL_ZERO(data->start_class);
6026 if (was_there) { /* Do an AND */
6027 ANYOF_POSIXL_SET(data->start_class, namedclass);
6029 /* No individual code points can now match */
6030 data->start_class->invlist
6031 = sv_2mortal(_new_invlist(0));
6034 int complement = namedclass + ((invert) ? -1 : 1);
6036 assert(flags & SCF_DO_STCLASS_OR);
6038 /* If the complement of this class was already there,
6039 * the result is that they match all code points,
6040 * (\d + \D == everything). Remove the classes from
6041 * future consideration. Locale is not relevant in
6043 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6044 ssc_match_all_cp(data->start_class);
6045 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6046 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6048 else { /* The usual case; just add this class to the
6050 ANYOF_POSIXL_SET(data->start_class, namedclass);
6055 case NPOSIXA: /* For these, we always know the exact set of
6060 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6061 goto join_posix_and_ascii;
6069 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6071 /* NPOSIXD matches all upper Latin1 code points unless the
6072 * target string being matched is UTF-8, which is
6073 * unknowable until match time. Since we are going to
6074 * invert, we want to get rid of all of them so that the
6075 * inversion will match all */
6076 if (OP(scan) == NPOSIXD) {
6077 _invlist_subtract(my_invlist, PL_UpperLatin1,
6081 join_posix_and_ascii:
6083 if (flags & SCF_DO_STCLASS_AND) {
6084 ssc_intersection(data->start_class, my_invlist, invert);
6085 ssc_clear_locale(data->start_class);
6088 assert(flags & SCF_DO_STCLASS_OR);
6089 ssc_union(data->start_class, my_invlist, invert);
6091 SvREFCNT_dec(my_invlist);
6093 if (flags & SCF_DO_STCLASS_OR)
6094 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6095 flags &= ~SCF_DO_STCLASS;
6098 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6099 data->flags |= (OP(scan) == MEOL
6102 scan_commit(pRExC_state, data, minlenp, is_inf);
6105 else if ( PL_regkind[OP(scan)] == BRANCHJ
6106 /* Lookbehind, or need to calculate parens/evals/stclass: */
6107 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6108 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6110 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6111 || OP(scan) == UNLESSM )
6113 /* Negative Lookahead/lookbehind
6114 In this case we can't do fixed string optimisation.
6117 SSize_t deltanext, minnext, fake = 0;
6122 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6124 data_fake.whilem_c = data->whilem_c;
6125 data_fake.last_closep = data->last_closep;
6128 data_fake.last_closep = &fake;
6129 data_fake.pos_delta = delta;
6130 if ( flags & SCF_DO_STCLASS && !scan->flags
6131 && OP(scan) == IFMATCH ) { /* Lookahead */
6132 ssc_init(pRExC_state, &intrnl);
6133 data_fake.start_class = &intrnl;
6134 f |= SCF_DO_STCLASS_AND;
6136 if (flags & SCF_WHILEM_VISITED_POS)
6137 f |= SCF_WHILEM_VISITED_POS;
6138 next = regnext(scan);
6139 nscan = NEXTOPER(NEXTOPER(scan));
6141 /* recurse study_chunk() for lookahead body */
6142 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6143 last, &data_fake, stopparen,
6144 recursed_depth, NULL, f, depth+1);
6147 || deltanext > (I32) U8_MAX
6148 || minnext > (I32)U8_MAX
6149 || minnext + deltanext > (I32)U8_MAX)
6151 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6155 /* The 'next_off' field has been repurposed to count the
6156 * additional starting positions to try beyond the initial
6157 * one. (This leaves it at 0 for non-variable length
6158 * matches to avoid breakage for those not using this
6161 scan->next_off = deltanext;
6162 ckWARNexperimental(RExC_parse,
6163 WARN_EXPERIMENTAL__VLB,
6164 "Variable length lookbehind is experimental");
6166 scan->flags = (U8)minnext + deltanext;
6169 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6171 if (data_fake.flags & SF_HAS_EVAL)
6172 data->flags |= SF_HAS_EVAL;
6173 data->whilem_c = data_fake.whilem_c;
6175 if (f & SCF_DO_STCLASS_AND) {
6176 if (flags & SCF_DO_STCLASS_OR) {
6177 /* OR before, AND after: ideally we would recurse with
6178 * data_fake to get the AND applied by study of the
6179 * remainder of the pattern, and then derecurse;
6180 * *** HACK *** for now just treat as "no information".
6181 * See [perl #56690].
6183 ssc_init(pRExC_state, data->start_class);
6185 /* AND before and after: combine and continue. These
6186 * assertions are zero-length, so can match an EMPTY
6188 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6189 ANYOF_FLAGS(data->start_class)
6190 |= SSC_MATCHES_EMPTY_STRING;
6194 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6196 /* Positive Lookahead/lookbehind
6197 In this case we can do fixed string optimisation,
6198 but we must be careful about it. Note in the case of
6199 lookbehind the positions will be offset by the minimum
6200 length of the pattern, something we won't know about
6201 until after the recurse.
6203 SSize_t deltanext, fake = 0;
6207 /* We use SAVEFREEPV so that when the full compile
6208 is finished perl will clean up the allocated
6209 minlens when it's all done. This way we don't
6210 have to worry about freeing them when we know
6211 they wont be used, which would be a pain.
6214 Newx( minnextp, 1, SSize_t );
6215 SAVEFREEPV(minnextp);
6218 StructCopy(data, &data_fake, scan_data_t);
6219 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6222 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6223 data_fake.last_found=newSVsv(data->last_found);
6227 data_fake.last_closep = &fake;
6228 data_fake.flags = 0;
6229 data_fake.substrs[0].flags = 0;
6230 data_fake.substrs[1].flags = 0;
6231 data_fake.pos_delta = delta;
6233 data_fake.flags |= SF_IS_INF;
6234 if ( flags & SCF_DO_STCLASS && !scan->flags
6235 && OP(scan) == IFMATCH ) { /* Lookahead */
6236 ssc_init(pRExC_state, &intrnl);
6237 data_fake.start_class = &intrnl;
6238 f |= SCF_DO_STCLASS_AND;
6240 if (flags & SCF_WHILEM_VISITED_POS)
6241 f |= SCF_WHILEM_VISITED_POS;
6242 next = regnext(scan);
6243 nscan = NEXTOPER(NEXTOPER(scan));
6245 /* positive lookahead study_chunk() recursion */
6246 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6247 &deltanext, last, &data_fake,
6248 stopparen, recursed_depth, NULL,
6251 assert(0); /* This code has never been tested since this
6252 is normally not compiled */
6254 || deltanext > (I32) U8_MAX
6255 || *minnextp > (I32)U8_MAX
6256 || *minnextp + deltanext > (I32)U8_MAX)
6258 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6263 scan->next_off = deltanext;
6265 scan->flags = (U8)*minnextp + deltanext;
6270 if (f & SCF_DO_STCLASS_AND) {
6271 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6272 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6275 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6277 if (data_fake.flags & SF_HAS_EVAL)
6278 data->flags |= SF_HAS_EVAL;
6279 data->whilem_c = data_fake.whilem_c;
6280 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6282 if (RExC_rx->minlen<*minnextp)
6283 RExC_rx->minlen=*minnextp;
6284 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6285 SvREFCNT_dec_NN(data_fake.last_found);
6287 for (i = 0; i < 2; i++) {
6288 if (data_fake.substrs[i].minlenp != minlenp) {
6289 data->substrs[i].min_offset =
6290 data_fake.substrs[i].min_offset;
6291 data->substrs[i].max_offset =
6292 data_fake.substrs[i].max_offset;
6293 data->substrs[i].minlenp =
6294 data_fake.substrs[i].minlenp;
6295 data->substrs[i].lookbehind += scan->flags;
6303 else if (OP(scan) == OPEN) {
6304 if (stopparen != (I32)ARG(scan))
6307 else if (OP(scan) == CLOSE) {
6308 if (stopparen == (I32)ARG(scan)) {
6311 if ((I32)ARG(scan) == is_par) {
6312 next = regnext(scan);
6314 if ( next && (OP(next) != WHILEM) && next < last)
6315 is_par = 0; /* Disable optimization */
6318 *(data->last_closep) = ARG(scan);
6320 else if (OP(scan) == EVAL) {
6322 data->flags |= SF_HAS_EVAL;
6324 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6325 if (flags & SCF_DO_SUBSTR) {
6326 scan_commit(pRExC_state, data, minlenp, is_inf);
6327 flags &= ~SCF_DO_SUBSTR;
6329 if (data && OP(scan)==ACCEPT) {
6330 data->flags |= SCF_SEEN_ACCEPT;
6335 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6337 if (flags & SCF_DO_SUBSTR) {
6338 scan_commit(pRExC_state, data, minlenp, is_inf);
6339 data->cur_is_floating = 1; /* float */
6341 is_inf = is_inf_internal = 1;
6342 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6343 ssc_anything(data->start_class);
6344 flags &= ~SCF_DO_STCLASS;
6346 else if (OP(scan) == GPOS) {
6347 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6348 !(delta || is_inf || (data && data->pos_delta)))
6350 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6351 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6352 if (RExC_rx->gofs < (STRLEN)min)
6353 RExC_rx->gofs = min;
6355 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6359 #ifdef TRIE_STUDY_OPT
6360 #ifdef FULL_TRIE_STUDY
6361 else if (PL_regkind[OP(scan)] == TRIE) {
6362 /* NOTE - There is similar code to this block above for handling
6363 BRANCH nodes on the initial study. If you change stuff here
6365 regnode *trie_node= scan;
6366 regnode *tail= regnext(scan);
6367 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6368 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6371 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6372 /* Cannot merge strings after this. */
6373 scan_commit(pRExC_state, data, minlenp, is_inf);
6375 if (flags & SCF_DO_STCLASS)
6376 ssc_init_zero(pRExC_state, &accum);
6382 const regnode *nextbranch= NULL;
6385 for ( word=1 ; word <= trie->wordcount ; word++)
6387 SSize_t deltanext=0, minnext=0, f = 0, fake;
6388 regnode_ssc this_class;
6390 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6392 data_fake.whilem_c = data->whilem_c;
6393 data_fake.last_closep = data->last_closep;
6396 data_fake.last_closep = &fake;
6397 data_fake.pos_delta = delta;
6398 if (flags & SCF_DO_STCLASS) {
6399 ssc_init(pRExC_state, &this_class);
6400 data_fake.start_class = &this_class;
6401 f = SCF_DO_STCLASS_AND;
6403 if (flags & SCF_WHILEM_VISITED_POS)
6404 f |= SCF_WHILEM_VISITED_POS;
6406 if (trie->jump[word]) {
6408 nextbranch = trie_node + trie->jump[0];
6409 scan= trie_node + trie->jump[word];
6410 /* We go from the jump point to the branch that follows
6411 it. Note this means we need the vestigal unused
6412 branches even though they arent otherwise used. */
6413 /* optimise study_chunk() for TRIE */
6414 minnext = study_chunk(pRExC_state, &scan, minlenp,
6415 &deltanext, (regnode *)nextbranch, &data_fake,
6416 stopparen, recursed_depth, NULL, f, depth+1);
6418 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6419 nextbranch= regnext((regnode*)nextbranch);
6421 if (min1 > (SSize_t)(minnext + trie->minlen))
6422 min1 = minnext + trie->minlen;
6423 if (deltanext == OPTIMIZE_INFTY) {
6424 is_inf = is_inf_internal = 1;
6425 max1 = OPTIMIZE_INFTY;
6426 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6427 max1 = minnext + deltanext + trie->maxlen;
6429 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6431 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6432 if ( stopmin > min + min1)
6433 stopmin = min + min1;
6434 flags &= ~SCF_DO_SUBSTR;
6436 data->flags |= SCF_SEEN_ACCEPT;
6439 if (data_fake.flags & SF_HAS_EVAL)
6440 data->flags |= SF_HAS_EVAL;
6441 data->whilem_c = data_fake.whilem_c;
6443 if (flags & SCF_DO_STCLASS)
6444 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6447 if (flags & SCF_DO_SUBSTR) {
6448 data->pos_min += min1;
6449 data->pos_delta += max1 - min1;
6450 if (max1 != min1 || is_inf)
6451 data->cur_is_floating = 1; /* float */
6454 if (delta != OPTIMIZE_INFTY) {
6455 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6456 delta += max1 - min1;
6458 delta = OPTIMIZE_INFTY;
6460 if (flags & SCF_DO_STCLASS_OR) {
6461 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6463 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6464 flags &= ~SCF_DO_STCLASS;
6467 else if (flags & SCF_DO_STCLASS_AND) {
6469 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6470 flags &= ~SCF_DO_STCLASS;
6473 /* Switch to OR mode: cache the old value of
6474 * data->start_class */
6476 StructCopy(data->start_class, and_withp, regnode_ssc);
6477 flags &= ~SCF_DO_STCLASS_AND;
6478 StructCopy(&accum, data->start_class, regnode_ssc);
6479 flags |= SCF_DO_STCLASS_OR;
6486 else if (PL_regkind[OP(scan)] == TRIE) {
6487 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6490 min += trie->minlen;
6491 delta += (trie->maxlen - trie->minlen);
6492 flags &= ~SCF_DO_STCLASS; /* xxx */
6493 if (flags & SCF_DO_SUBSTR) {
6494 /* Cannot expect anything... */
6495 scan_commit(pRExC_state, data, minlenp, is_inf);
6496 data->pos_min += trie->minlen;
6497 data->pos_delta += (trie->maxlen - trie->minlen);
6498 if (trie->maxlen != trie->minlen)
6499 data->cur_is_floating = 1; /* float */
6501 if (trie->jump) /* no more substrings -- for now /grr*/
6502 flags &= ~SCF_DO_SUBSTR;
6504 else if (OP(scan) == REGEX_SET) {
6505 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6506 " before optimization", reg_name[REGEX_SET]);
6509 #endif /* old or new */
6510 #endif /* TRIE_STUDY_OPT */
6512 /* Else: zero-length, ignore. */
6513 scan = regnext(scan);
6518 /* we need to unwind recursion. */
6521 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6522 DEBUG_PEEP("fend", scan, depth, flags);
6524 /* restore previous context */
6525 last = frame->last_regnode;
6526 scan = frame->next_regnode;
6527 stopparen = frame->stopparen;
6528 recursed_depth = frame->prev_recursed_depth;
6530 RExC_frame_last = frame->prev_frame;
6531 frame = frame->this_prev_frame;
6532 goto fake_study_recurse;
6536 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6539 *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6541 if (flags & SCF_DO_SUBSTR && is_inf)
6542 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6543 if (is_par > (I32)U8_MAX)
6545 if (is_par && pars==1 && data) {
6546 data->flags |= SF_IN_PAR;
6547 data->flags &= ~SF_HAS_PAR;
6549 else if (pars && data) {
6550 data->flags |= SF_HAS_PAR;
6551 data->flags &= ~SF_IN_PAR;
6553 if (flags & SCF_DO_STCLASS_OR)
6554 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6555 if (flags & SCF_TRIE_RESTUDY)
6556 data->flags |= SCF_TRIE_RESTUDY;
6558 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6560 final_minlen = min < stopmin
6563 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6564 if (final_minlen > OPTIMIZE_INFTY - delta)
6565 RExC_maxlen = OPTIMIZE_INFTY;
6566 else if (RExC_maxlen < final_minlen + delta)
6567 RExC_maxlen = final_minlen + delta;
6569 return final_minlen;
6573 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6575 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6577 PERL_ARGS_ASSERT_ADD_DATA;
6579 Renewc(RExC_rxi->data,
6580 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6581 char, struct reg_data);
6583 Renew(RExC_rxi->data->what, count + n, U8);
6585 Newx(RExC_rxi->data->what, n, U8);
6586 RExC_rxi->data->count = count + n;
6587 Copy(s, RExC_rxi->data->what + count, n, U8);
6591 /*XXX: todo make this not included in a non debugging perl, but appears to be
6592 * used anyway there, in 'use re' */
6593 #ifndef PERL_IN_XSUB_RE
6595 Perl_reginitcolors(pTHX)
6597 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6599 char *t = savepv(s);
6603 t = strchr(t, '\t');
6609 PL_colors[i] = t = (char *)"";
6614 PL_colors[i++] = (char *)"";
6621 #ifdef TRIE_STUDY_OPT
6622 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6625 (data.flags & SCF_TRIE_RESTUDY) \
6633 #define CHECK_RESTUDY_GOTO_butfirst
6637 * pregcomp - compile a regular expression into internal code
6639 * Decides which engine's compiler to call based on the hint currently in
6643 #ifndef PERL_IN_XSUB_RE
6645 /* return the currently in-scope regex engine (or the default if none) */
6647 regexp_engine const *
6648 Perl_current_re_engine(pTHX)
6650 if (IN_PERL_COMPILETIME) {
6651 HV * const table = GvHV(PL_hintgv);
6654 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6655 return &PL_core_reg_engine;
6656 ptr = hv_fetchs(table, "regcomp", FALSE);
6657 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6658 return &PL_core_reg_engine;
6659 return INT2PTR(regexp_engine*, SvIV(*ptr));
6663 if (!PL_curcop->cop_hints_hash)
6664 return &PL_core_reg_engine;
6665 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6666 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6667 return &PL_core_reg_engine;
6668 return INT2PTR(regexp_engine*, SvIV(ptr));
6674 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6676 regexp_engine const *eng = current_re_engine();
6677 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6679 PERL_ARGS_ASSERT_PREGCOMP;
6681 /* Dispatch a request to compile a regexp to correct regexp engine. */
6683 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6686 return CALLREGCOMP_ENG(eng, pattern, flags);
6690 /* public(ish) entry point for the perl core's own regex compiling code.
6691 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6692 * pattern rather than a list of OPs, and uses the internal engine rather
6693 * than the current one */
6696 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6698 SV *pat = pattern; /* defeat constness! */
6700 PERL_ARGS_ASSERT_RE_COMPILE;
6702 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6703 #ifdef PERL_IN_XSUB_RE
6706 &PL_core_reg_engine,
6708 NULL, NULL, rx_flags, 0);
6712 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6716 if (--cbs->refcnt > 0)
6718 for (n = 0; n < cbs->count; n++) {
6719 REGEXP *rx = cbs->cb[n].src_regex;
6721 cbs->cb[n].src_regex = NULL;
6722 SvREFCNT_dec_NN(rx);
6730 static struct reg_code_blocks *
6731 S_alloc_code_blocks(pTHX_ int ncode)
6733 struct reg_code_blocks *cbs;
6734 Newx(cbs, 1, struct reg_code_blocks);
6737 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6739 Newx(cbs->cb, ncode, struct reg_code_block);
6746 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6747 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6748 * point to the realloced string and length.
6750 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6754 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6755 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6757 U8 *const src = (U8*)*pat_p;
6762 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6764 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6765 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6767 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6768 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6771 while (s < *plen_p) {
6772 append_utf8_from_native_byte(src[s], &d);
6774 if (n < num_code_blocks) {
6775 assert(pRExC_state->code_blocks);
6776 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6777 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6778 assert(*(d - 1) == '(');
6781 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6782 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6783 assert(*(d - 1) == ')');
6792 *pat_p = (char*) dst;
6794 RExC_orig_utf8 = RExC_utf8 = 1;
6799 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6800 * while recording any code block indices, and handling overloading,
6801 * nested qr// objects etc. If pat is null, it will allocate a new
6802 * string, or just return the first arg, if there's only one.
6804 * Returns the malloced/updated pat.
6805 * patternp and pat_count is the array of SVs to be concatted;
6806 * oplist is the optional list of ops that generated the SVs;
6807 * recompile_p is a pointer to a boolean that will be set if
6808 * the regex will need to be recompiled.
6809 * delim, if non-null is an SV that will be inserted between each element
6813 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6814 SV *pat, SV ** const patternp, int pat_count,
6815 OP *oplist, bool *recompile_p, SV *delim)
6819 bool use_delim = FALSE;
6820 bool alloced = FALSE;
6822 /* if we know we have at least two args, create an empty string,
6823 * then concatenate args to that. For no args, return an empty string */
6824 if (!pat && pat_count != 1) {
6830 for (svp = patternp; svp < patternp + pat_count; svp++) {
6833 STRLEN orig_patlen = 0;
6835 SV *msv = use_delim ? delim : *svp;
6836 if (!msv) msv = &PL_sv_undef;
6838 /* if we've got a delimiter, we go round the loop twice for each
6839 * svp slot (except the last), using the delimiter the second
6848 if (SvTYPE(msv) == SVt_PVAV) {
6849 /* we've encountered an interpolated array within
6850 * the pattern, e.g. /...@a..../. Expand the list of elements,
6851 * then recursively append elements.
6852 * The code in this block is based on S_pushav() */
6854 AV *const av = (AV*)msv;
6855 const SSize_t maxarg = AvFILL(av) + 1;
6859 assert(oplist->op_type == OP_PADAV
6860 || oplist->op_type == OP_RV2AV);
6861 oplist = OpSIBLING(oplist);
6864 if (SvRMAGICAL(av)) {
6867 Newx(array, maxarg, SV*);
6869 for (i=0; i < maxarg; i++) {
6870 SV ** const svp = av_fetch(av, i, FALSE);
6871 array[i] = svp ? *svp : &PL_sv_undef;
6875 array = AvARRAY(av);
6877 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6878 array, maxarg, NULL, recompile_p,
6880 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6886 /* we make the assumption here that each op in the list of
6887 * op_siblings maps to one SV pushed onto the stack,
6888 * except for code blocks, with have both an OP_NULL and
6890 * This allows us to match up the list of SVs against the
6891 * list of OPs to find the next code block.
6893 * Note that PUSHMARK PADSV PADSV ..
6895 * PADRANGE PADSV PADSV ..
6896 * so the alignment still works. */
6899 if (oplist->op_type == OP_NULL
6900 && (oplist->op_flags & OPf_SPECIAL))
6902 assert(n < pRExC_state->code_blocks->count);
6903 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6904 pRExC_state->code_blocks->cb[n].block = oplist;
6905 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6908 oplist = OpSIBLING(oplist); /* skip CONST */
6911 oplist = OpSIBLING(oplist);;
6914 /* apply magic and QR overloading to arg */
6917 if (SvROK(msv) && SvAMAGIC(msv)) {
6918 SV *sv = AMG_CALLunary(msv, regexp_amg);
6922 if (SvTYPE(sv) != SVt_REGEXP)
6923 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6928 /* try concatenation overload ... */
6929 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6930 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6933 /* overloading involved: all bets are off over literal
6934 * code. Pretend we haven't seen it */
6936 pRExC_state->code_blocks->count -= n;
6940 /* ... or failing that, try "" overload */
6941 while (SvAMAGIC(msv)
6942 && (sv = AMG_CALLunary(msv, string_amg))
6946 && SvRV(msv) == SvRV(sv))
6951 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6955 /* this is a partially unrolled
6956 * sv_catsv_nomg(pat, msv);
6957 * that allows us to adjust code block indices if
6960 char *dst = SvPV_force_nomg(pat, dlen);
6962 if (SvUTF8(msv) && !SvUTF8(pat)) {
6963 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6964 sv_setpvn(pat, dst, dlen);
6967 sv_catsv_nomg(pat, msv);
6971 /* We have only one SV to process, but we need to verify
6972 * it is properly null terminated or we will fail asserts
6973 * later. In theory we probably shouldn't get such SV's,
6974 * but if we do we should handle it gracefully. */
6975 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6976 /* not a string, or a string with a trailing null */
6979 /* a string with no trailing null, we need to copy it
6980 * so it has a trailing null */
6981 pat = sv_2mortal(newSVsv(msv));
6986 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6989 /* extract any code blocks within any embedded qr//'s */
6990 if (rx && SvTYPE(rx) == SVt_REGEXP
6991 && RX_ENGINE((REGEXP*)rx)->op_comp)
6994 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6995 if (ri->code_blocks && ri->code_blocks->count) {
6997 /* the presence of an embedded qr// with code means
6998 * we should always recompile: the text of the
6999 * qr// may not have changed, but it may be a
7000 * different closure than last time */
7002 if (pRExC_state->code_blocks) {
7003 int new_count = pRExC_state->code_blocks->count
7004 + ri->code_blocks->count;
7005 Renew(pRExC_state->code_blocks->cb,
7006 new_count, struct reg_code_block);
7007 pRExC_state->code_blocks->count = new_count;
7010 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7011 ri->code_blocks->count);
7013 for (i=0; i < ri->code_blocks->count; i++) {
7014 struct reg_code_block *src, *dst;
7015 STRLEN offset = orig_patlen
7016 + ReANY((REGEXP *)rx)->pre_prefix;
7017 assert(n < pRExC_state->code_blocks->count);
7018 src = &ri->code_blocks->cb[i];
7019 dst = &pRExC_state->code_blocks->cb[n];
7020 dst->start = src->start + offset;
7021 dst->end = src->end + offset;
7022 dst->block = src->block;
7023 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7032 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7041 /* see if there are any run-time code blocks in the pattern.
7042 * False positives are allowed */
7045 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7046 char *pat, STRLEN plen)
7051 PERL_UNUSED_CONTEXT;
7053 for (s = 0; s < plen; s++) {
7054 if ( pRExC_state->code_blocks
7055 && n < pRExC_state->code_blocks->count
7056 && s == pRExC_state->code_blocks->cb[n].start)
7058 s = pRExC_state->code_blocks->cb[n].end;
7062 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7064 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7066 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7073 /* Handle run-time code blocks. We will already have compiled any direct
7074 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7075 * copy of it, but with any literal code blocks blanked out and
7076 * appropriate chars escaped; then feed it into
7078 * eval "qr'modified_pattern'"
7082 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7086 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7088 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7089 * and merge them with any code blocks of the original regexp.
7091 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7092 * instead, just save the qr and return FALSE; this tells our caller that
7093 * the original pattern needs upgrading to utf8.
7097 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7098 char *pat, STRLEN plen)
7102 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7104 if (pRExC_state->runtime_code_qr) {
7105 /* this is the second time we've been called; this should
7106 * only happen if the main pattern got upgraded to utf8
7107 * during compilation; re-use the qr we compiled first time
7108 * round (which should be utf8 too)
7110 qr = pRExC_state->runtime_code_qr;
7111 pRExC_state->runtime_code_qr = NULL;
7112 assert(RExC_utf8 && SvUTF8(qr));
7118 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7122 /* determine how many extra chars we need for ' and \ escaping */
7123 for (s = 0; s < plen; s++) {
7124 if (pat[s] == '\'' || pat[s] == '\\')
7128 Newx(newpat, newlen, char);
7130 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7132 for (s = 0; s < plen; s++) {
7133 if ( pRExC_state->code_blocks
7134 && n < pRExC_state->code_blocks->count
7135 && s == pRExC_state->code_blocks->cb[n].start)
7137 /* blank out literal code block so that they aren't
7138 * recompiled: eg change from/to:
7148 assert(pat[s] == '(');
7149 assert(pat[s+1] == '?');
7153 while (s < pRExC_state->code_blocks->cb[n].end) {
7161 if (pat[s] == '\'' || pat[s] == '\\')
7166 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7168 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7174 Perl_re_printf( aTHX_
7175 "%sre-parsing pattern for runtime code:%s %s\n",
7176 PL_colors[4], PL_colors[5], newpat);
7179 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7185 PUSHSTACKi(PERLSI_REQUIRE);
7186 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7187 * parsing qr''; normally only q'' does this. It also alters
7189 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7190 SvREFCNT_dec_NN(sv);
7195 SV * const errsv = ERRSV;
7196 if (SvTRUE_NN(errsv))
7197 /* use croak_sv ? */
7198 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7200 assert(SvROK(qr_ref));
7202 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7203 /* the leaving below frees the tmp qr_ref.
7204 * Give qr a life of its own */
7212 if (!RExC_utf8 && SvUTF8(qr)) {
7213 /* first time through; the pattern got upgraded; save the
7214 * qr for the next time through */
7215 assert(!pRExC_state->runtime_code_qr);
7216 pRExC_state->runtime_code_qr = qr;
7221 /* extract any code blocks within the returned qr// */
7224 /* merge the main (r1) and run-time (r2) code blocks into one */
7226 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7227 struct reg_code_block *new_block, *dst;
7228 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7232 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7234 SvREFCNT_dec_NN(qr);
7238 if (!r1->code_blocks)
7239 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7241 r1c = r1->code_blocks->count;
7242 r2c = r2->code_blocks->count;
7244 Newx(new_block, r1c + r2c, struct reg_code_block);
7248 while (i1 < r1c || i2 < r2c) {
7249 struct reg_code_block *src;
7253 src = &r2->code_blocks->cb[i2++];
7257 src = &r1->code_blocks->cb[i1++];
7258 else if ( r1->code_blocks->cb[i1].start
7259 < r2->code_blocks->cb[i2].start)
7261 src = &r1->code_blocks->cb[i1++];
7262 assert(src->end < r2->code_blocks->cb[i2].start);
7265 assert( r1->code_blocks->cb[i1].start
7266 > r2->code_blocks->cb[i2].start);
7267 src = &r2->code_blocks->cb[i2++];
7269 assert(src->end < r1->code_blocks->cb[i1].start);
7272 assert(pat[src->start] == '(');
7273 assert(pat[src->end] == ')');
7274 dst->start = src->start;
7275 dst->end = src->end;
7276 dst->block = src->block;
7277 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7281 r1->code_blocks->count += r2c;
7282 Safefree(r1->code_blocks->cb);
7283 r1->code_blocks->cb = new_block;
7286 SvREFCNT_dec_NN(qr);
7292 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7293 struct reg_substr_datum *rsd,
7294 struct scan_data_substrs *sub,
7295 STRLEN longest_length)
7297 /* This is the common code for setting up the floating and fixed length
7298 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7299 * as to whether succeeded or not */
7303 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7304 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7306 if (! (longest_length
7307 || (eol /* Can't have SEOL and MULTI */
7308 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7310 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7311 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7316 /* copy the information about the longest from the reg_scan_data
7317 over to the program. */
7318 if (SvUTF8(sub->str)) {
7320 rsd->utf8_substr = sub->str;
7322 rsd->substr = sub->str;
7323 rsd->utf8_substr = NULL;
7325 /* end_shift is how many chars that must be matched that
7326 follow this item. We calculate it ahead of time as once the
7327 lookbehind offset is added in we lose the ability to correctly
7329 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7330 rsd->end_shift = ml - sub->min_offset
7332 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7334 + (SvTAIL(sub->str) != 0)
7338 t = (eol/* Can't have SEOL and MULTI */
7339 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7340 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7346 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7348 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7349 * properly wrapped with the right modifiers */
7351 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7352 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7353 != REGEX_DEPENDS_CHARSET);
7355 /* The caret is output if there are any defaults: if not all the STD
7356 * flags are set, or if no character set specifier is needed */
7358 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7360 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7361 == REG_RUN_ON_COMMENT_SEEN);
7362 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7363 >> RXf_PMf_STD_PMMOD_SHIFT);
7364 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7366 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7368 /* We output all the necessary flags; we never output a minus, as all
7369 * those are defaults, so are
7370 * covered by the caret */
7371 const STRLEN wraplen = pat_len + has_p + has_runon
7372 + has_default /* If needs a caret */
7373 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7375 /* If needs a character set specifier */
7376 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7377 + (sizeof("(?:)") - 1);
7379 PERL_ARGS_ASSERT_SET_REGEX_PV;
7381 /* make sure PL_bitcount bounds not exceeded */
7382 assert(sizeof(STD_PAT_MODS) <= 8);
7384 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7387 SvFLAGS(Rx) |= SVf_UTF8;
7390 /* If a default, cover it using the caret */
7392 *p++= DEFAULT_PAT_MOD;
7398 name = get_regex_charset_name(RExC_rx->extflags, &len);
7399 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7401 name = UNICODE_PAT_MODS;
7402 len = sizeof(UNICODE_PAT_MODS) - 1;
7404 Copy(name, p, len, char);
7408 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7411 while((ch = *fptr++)) {
7419 Copy(RExC_precomp, p, pat_len, char);
7420 assert ((RX_WRAPPED(Rx) - p) < 16);
7421 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7424 /* Adding a trailing \n causes this to compile properly:
7425 my $R = qr / A B C # D E/x; /($R)/
7426 Otherwise the parens are considered part of the comment */
7431 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7435 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7436 * regular expression into internal code.
7437 * The pattern may be passed either as:
7438 * a list of SVs (patternp plus pat_count)
7439 * a list of OPs (expr)
7440 * If both are passed, the SV list is used, but the OP list indicates
7441 * which SVs are actually pre-compiled code blocks
7443 * The SVs in the list have magic and qr overloading applied to them (and
7444 * the list may be modified in-place with replacement SVs in the latter
7447 * If the pattern hasn't changed from old_re, then old_re will be
7450 * eng is the current engine. If that engine has an op_comp method, then
7451 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7452 * do the initial concatenation of arguments and pass on to the external
7455 * If is_bare_re is not null, set it to a boolean indicating whether the
7456 * arg list reduced (after overloading) to a single bare regex which has
7457 * been returned (i.e. /$qr/).
7459 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7461 * pm_flags contains the PMf_* flags, typically based on those from the
7462 * pm_flags field of the related PMOP. Currently we're only interested in
7463 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7465 * For many years this code had an initial sizing pass that calculated
7466 * (sometimes incorrectly, leading to security holes) the size needed for the
7467 * compiled pattern. That was changed by commit
7468 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7469 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7470 * references to this sizing pass.
7472 * Now, an initial crude guess as to the size needed is made, based on the
7473 * length of the pattern. Patches welcome to improve that guess. That amount
7474 * of space is malloc'd and then immediately freed, and then clawed back node
7475 * by node. This design is to minimze, to the extent possible, memory churn
7476 * when doing the the reallocs.
7478 * A separate parentheses counting pass may be needed in some cases.
7479 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7482 * The existence of a sizing pass necessitated design decisions that are no
7483 * longer needed. There are potential areas of simplification.
7485 * Beware that the optimization-preparation code in here knows about some
7486 * of the structure of the compiled regexp. [I'll say.]
7490 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7491 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7492 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7495 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7503 SV** new_patternp = patternp;
7505 /* these are all flags - maybe they should be turned
7506 * into a single int with different bit masks */
7507 I32 sawlookahead = 0;
7512 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7514 bool runtime_code = 0;
7516 RExC_state_t RExC_state;
7517 RExC_state_t * const pRExC_state = &RExC_state;
7518 #ifdef TRIE_STUDY_OPT
7520 RExC_state_t copyRExC_state;
7522 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7524 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7526 DEBUG_r(if (!PL_colorset) reginitcolors());
7529 pRExC_state->warn_text = NULL;
7530 pRExC_state->unlexed_names = NULL;
7531 pRExC_state->code_blocks = NULL;
7534 *is_bare_re = FALSE;
7536 if (expr && (expr->op_type == OP_LIST ||
7537 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7538 /* allocate code_blocks if needed */
7542 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7543 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7544 ncode++; /* count of DO blocks */
7547 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7551 /* compile-time pattern with just OP_CONSTs and DO blocks */
7556 /* find how many CONSTs there are */
7559 if (expr->op_type == OP_CONST)
7562 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7563 if (o->op_type == OP_CONST)
7567 /* fake up an SV array */
7569 assert(!new_patternp);
7570 Newx(new_patternp, n, SV*);
7571 SAVEFREEPV(new_patternp);
7575 if (expr->op_type == OP_CONST)
7576 new_patternp[n] = cSVOPx_sv(expr);
7578 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7579 if (o->op_type == OP_CONST)
7580 new_patternp[n++] = cSVOPo_sv;
7585 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7586 "Assembling pattern from %d elements%s\n", pat_count,
7587 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7589 /* set expr to the first arg op */
7591 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7592 && expr->op_type != OP_CONST)
7594 expr = cLISTOPx(expr)->op_first;
7595 assert( expr->op_type == OP_PUSHMARK
7596 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7597 || expr->op_type == OP_PADRANGE);
7598 expr = OpSIBLING(expr);
7601 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7602 expr, &recompile, NULL);
7604 /* handle bare (possibly after overloading) regex: foo =~ $re */
7609 if (SvTYPE(re) == SVt_REGEXP) {
7613 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7614 "Precompiled pattern%s\n",
7615 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7621 exp = SvPV_nomg(pat, plen);
7623 if (!eng->op_comp) {
7624 if ((SvUTF8(pat) && IN_BYTES)
7625 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7627 /* make a temporary copy; either to convert to bytes,
7628 * or to avoid repeating get-magic / overloaded stringify */
7629 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7630 (IN_BYTES ? 0 : SvUTF8(pat)));
7632 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7635 /* ignore the utf8ness if the pattern is 0 length */
7636 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7637 RExC_uni_semantics = 0;
7638 RExC_contains_locale = 0;
7639 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7640 RExC_in_script_run = 0;
7641 RExC_study_started = 0;
7642 pRExC_state->runtime_code_qr = NULL;
7643 RExC_frame_head= NULL;
7644 RExC_frame_last= NULL;
7645 RExC_frame_count= 0;
7646 RExC_latest_warn_offset = 0;
7647 RExC_use_BRANCHJ = 0;
7648 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7649 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7650 RExC_total_parens = 0;
7651 RExC_open_parens = NULL;
7652 RExC_close_parens = NULL;
7653 RExC_paren_names = NULL;
7655 RExC_seen_d_op = FALSE;
7657 RExC_paren_name_list = NULL;
7661 RExC_mysv1= sv_newmortal();
7662 RExC_mysv2= sv_newmortal();
7666 SV *dsv= sv_newmortal();
7667 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7668 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7669 PL_colors[4], PL_colors[5], s);
7672 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7675 if ((pm_flags & PMf_USE_RE_EVAL)
7676 /* this second condition covers the non-regex literal case,
7677 * i.e. $foo =~ '(?{})'. */
7678 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7680 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7683 /* return old regex if pattern hasn't changed */
7684 /* XXX: note in the below we have to check the flags as well as the
7687 * Things get a touch tricky as we have to compare the utf8 flag
7688 * independently from the compile flags. */
7692 && !!RX_UTF8(old_re) == !!RExC_utf8
7693 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7694 && RX_PRECOMP(old_re)
7695 && RX_PRELEN(old_re) == plen
7696 && memEQ(RX_PRECOMP(old_re), exp, plen)
7697 && !runtime_code /* with runtime code, always recompile */ )
7700 SV *dsv= sv_newmortal();
7701 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7702 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7703 PL_colors[4], PL_colors[5], s);
7708 /* Allocate the pattern's SV */
7709 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7710 RExC_rx = ReANY(Rx);
7711 if ( RExC_rx == NULL )
7712 FAIL("Regexp out of space");
7714 rx_flags = orig_rx_flags;
7716 if ( (UTF || RExC_uni_semantics)
7717 && initial_charset == REGEX_DEPENDS_CHARSET)
7720 /* Set to use unicode semantics if the pattern is in utf8 and has the
7721 * 'depends' charset specified, as it means unicode when utf8 */
7722 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7723 RExC_uni_semantics = 1;
7726 RExC_pm_flags = pm_flags;
7729 assert(TAINTING_get || !TAINT_get);
7731 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7733 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7734 /* whoops, we have a non-utf8 pattern, whilst run-time code
7735 * got compiled as utf8. Try again with a utf8 pattern */
7736 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7737 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7741 assert(!pRExC_state->runtime_code_qr);
7747 RExC_in_lookbehind = 0;
7748 RExC_in_lookahead = 0;
7749 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7750 RExC_recode_x_to_native = 0;
7751 RExC_in_multi_char_class = 0;
7753 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7754 RExC_precomp_end = RExC_end = exp + plen;
7756 RExC_whilem_seen = 0;
7758 RExC_recurse = NULL;
7759 RExC_study_chunk_recursed = NULL;
7760 RExC_study_chunk_recursed_bytes= 0;
7761 RExC_recurse_count = 0;
7762 RExC_sets_depth = 0;
7763 pRExC_state->code_index = 0;
7765 /* Initialize the string in the compiled pattern. This is so that there is
7766 * something to output if necessary */
7767 set_regex_pv(pRExC_state, Rx);
7770 Perl_re_printf( aTHX_
7771 "Starting parse and generation\n");
7773 RExC_lastparse=NULL;
7776 /* Allocate space and zero-initialize. Note, the two step process
7777 of zeroing when in debug mode, thus anything assigned has to
7778 happen after that */
7781 /* On the first pass of the parse, we guess how big this will be. Then
7782 * we grow in one operation to that amount and then give it back. As
7783 * we go along, we re-allocate what we need.
7785 * XXX Currently the guess is essentially that the pattern will be an
7786 * EXACT node with one byte input, one byte output. This is crude, and
7787 * better heuristics are welcome.
7789 * On any subsequent passes, we guess what we actually computed in the
7790 * latest earlier pass. Such a pass probably didn't complete so is
7791 * missing stuff. We could improve those guesses by knowing where the
7792 * parse stopped, and use the length so far plus apply the above
7793 * assumption to what's left. */
7794 RExC_size = STR_SZ(RExC_end - RExC_start);
7797 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7798 if ( RExC_rxi == NULL )
7799 FAIL("Regexp out of space");
7801 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7802 RXi_SET( RExC_rx, RExC_rxi );
7804 /* We start from 0 (over from 0 in the case this is a reparse. The first
7805 * node parsed will give back any excess memory we have allocated so far).
7809 /* non-zero initialization begins here */
7810 RExC_rx->engine= eng;
7811 RExC_rx->extflags = rx_flags;
7812 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7814 if (pm_flags & PMf_IS_QR) {
7815 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7816 if (RExC_rxi->code_blocks) {
7817 RExC_rxi->code_blocks->refcnt++;
7821 RExC_rx->intflags = 0;
7823 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7826 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7827 * code makes sure the final byte is an uncounted NUL. But should this
7828 * ever not be the case, lots of things could read beyond the end of the
7829 * buffer: loops like
7830 * while(isFOO(*RExC_parse)) RExC_parse++;
7831 * strchr(RExC_parse, "foo");
7832 * etc. So it is worth noting. */
7833 assert(*RExC_end == '\0');
7837 RExC_parens_buf_size = 0;
7838 RExC_emit_start = RExC_rxi->program;
7839 pRExC_state->code_index = 0;
7841 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7845 if (reg(pRExC_state, 0, &flags, 1)) {
7847 /* Success!, But we may need to redo the parse knowing how many parens
7848 * there actually are */
7849 if (IN_PARENS_PASS) {
7850 flags |= RESTART_PARSE;
7853 /* We have that number in RExC_npar */
7854 RExC_total_parens = RExC_npar;
7856 else if (! MUST_RESTART(flags)) {
7858 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7861 /* Here, we either have success, or we have to redo the parse for some reason */
7862 if (MUST_RESTART(flags)) {
7864 /* It's possible to write a regexp in ascii that represents Unicode
7865 codepoints outside of the byte range, such as via \x{100}. If we
7866 detect such a sequence we have to convert the entire pattern to utf8
7867 and then recompile, as our sizing calculation will have been based
7868 on 1 byte == 1 character, but we will need to use utf8 to encode
7869 at least some part of the pattern, and therefore must convert the whole
7872 if (flags & NEED_UTF8) {
7874 /* We have stored the offset of the final warning output so far.
7875 * That must be adjusted. Any variant characters between the start
7876 * of the pattern and this warning count for 2 bytes in the final,
7877 * so just add them again */
7878 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7879 RExC_latest_warn_offset +=
7880 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7881 + RExC_latest_warn_offset);
7883 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7884 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7885 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7888 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7891 if (ALL_PARENS_COUNTED) {
7892 /* Make enough room for all the known parens, and zero it */
7893 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7894 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7895 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7897 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7898 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7900 else { /* Parse did not complete. Reinitialize the parentheses
7902 RExC_total_parens = 0;
7903 if (RExC_open_parens) {
7904 Safefree(RExC_open_parens);
7905 RExC_open_parens = NULL;
7907 if (RExC_close_parens) {
7908 Safefree(RExC_close_parens);
7909 RExC_close_parens = NULL;
7913 /* Clean up what we did in this parse */
7914 SvREFCNT_dec_NN(RExC_rx_sv);
7919 /* Here, we have successfully parsed and generated the pattern's program
7920 * for the regex engine. We are ready to finish things up and look for
7923 /* Update the string to compile, with correct modifiers, etc */
7924 set_regex_pv(pRExC_state, Rx);
7926 RExC_rx->nparens = RExC_total_parens - 1;
7928 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7929 if (RExC_whilem_seen > 15)
7930 RExC_whilem_seen = 15;
7933 Perl_re_printf( aTHX_
7934 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7936 RExC_lastparse=NULL;
7939 #ifdef RE_TRACK_PATTERN_OFFSETS
7940 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7941 "%s %" UVuf " bytes for offset annotations.\n",
7942 RExC_offsets ? "Got" : "Couldn't get",
7943 (UV)((RExC_offsets[0] * 2 + 1))));
7944 DEBUG_OFFSETS_r(if (RExC_offsets) {
7945 const STRLEN len = RExC_offsets[0];
7947 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7948 Perl_re_printf( aTHX_
7949 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7950 for (i = 1; i <= len; i++) {
7951 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7952 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7953 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7955 Perl_re_printf( aTHX_ "\n");
7959 SetProgLen(RExC_rxi,RExC_size);
7962 DEBUG_DUMP_PRE_OPTIMIZE_r({
7963 SV * const sv = sv_newmortal();
7964 RXi_GET_DECL(RExC_rx, ri);
7966 Perl_re_printf( aTHX_ "Program before optimization:\n");
7968 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7973 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7976 /* XXXX To minimize changes to RE engine we always allocate
7977 3-units-long substrs field. */
7978 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7979 if (RExC_recurse_count) {
7980 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7981 SAVEFREEPV(RExC_recurse);
7984 if (RExC_seen & REG_RECURSE_SEEN) {
7985 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7986 * So its 1 if there are no parens. */
7987 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7988 ((RExC_total_parens & 0x07) != 0);
7989 Newx(RExC_study_chunk_recursed,
7990 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7991 SAVEFREEPV(RExC_study_chunk_recursed);
7995 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7997 RExC_study_chunk_recursed_count= 0;
7999 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8000 if (RExC_study_chunk_recursed) {
8001 Zero(RExC_study_chunk_recursed,
8002 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8006 #ifdef TRIE_STUDY_OPT
8008 StructCopy(&zero_scan_data, &data, scan_data_t);
8009 copyRExC_state = RExC_state;
8012 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8014 RExC_state = copyRExC_state;
8015 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8016 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8018 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8019 StructCopy(&zero_scan_data, &data, scan_data_t);
8022 StructCopy(&zero_scan_data, &data, scan_data_t);
8025 /* Dig out information for optimizations. */
8026 RExC_rx->extflags = RExC_flags; /* was pm_op */
8027 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8030 SvUTF8_on(Rx); /* Unicode in it? */
8031 RExC_rxi->regstclass = NULL;
8032 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8033 RExC_rx->intflags |= PREGf_NAUGHTY;
8034 scan = RExC_rxi->program + 1; /* First BRANCH. */
8036 /* testing for BRANCH here tells us whether there is "must appear"
8037 data in the pattern. If there is then we can use it for optimisations */
8038 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8041 STRLEN longest_length[2];
8042 regnode_ssc ch_class; /* pointed to by data */
8044 SSize_t last_close = 0; /* pointed to by data */
8045 regnode *first= scan;
8046 regnode *first_next= regnext(first);
8050 * Skip introductions and multiplicators >= 1
8051 * so that we can extract the 'meat' of the pattern that must
8052 * match in the large if() sequence following.
8053 * NOTE that EXACT is NOT covered here, as it is normally
8054 * picked up by the optimiser separately.
8056 * This is unfortunate as the optimiser isnt handling lookahead
8057 * properly currently.
8060 while ((OP(first) == OPEN && (sawopen = 1)) ||
8061 /* An OR of *one* alternative - should not happen now. */
8062 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8063 /* for now we can't handle lookbehind IFMATCH*/
8064 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8065 (OP(first) == PLUS) ||
8066 (OP(first) == MINMOD) ||
8067 /* An {n,m} with n>0 */
8068 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8069 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8072 * the only op that could be a regnode is PLUS, all the rest
8073 * will be regnode_1 or regnode_2.
8075 * (yves doesn't think this is true)
8077 if (OP(first) == PLUS)
8080 if (OP(first) == MINMOD)
8082 first += regarglen[OP(first)];
8084 first = NEXTOPER(first);
8085 first_next= regnext(first);
8088 /* Starting-point info. */
8090 DEBUG_PEEP("first:", first, 0, 0);
8091 /* Ignore EXACT as we deal with it later. */
8092 if (PL_regkind[OP(first)] == EXACT) {
8093 if ( OP(first) == EXACT
8094 || OP(first) == LEXACT
8095 || OP(first) == EXACT_REQ8
8096 || OP(first) == LEXACT_REQ8
8097 || OP(first) == EXACTL)
8099 NOOP; /* Empty, get anchored substr later. */
8102 RExC_rxi->regstclass = first;
8105 else if (PL_regkind[OP(first)] == TRIE &&
8106 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8108 /* this can happen only on restudy */
8109 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8112 else if (REGNODE_SIMPLE(OP(first)))
8113 RExC_rxi->regstclass = first;
8114 else if (PL_regkind[OP(first)] == BOUND ||
8115 PL_regkind[OP(first)] == NBOUND)
8116 RExC_rxi->regstclass = first;
8117 else if (PL_regkind[OP(first)] == BOL) {
8118 RExC_rx->intflags |= (OP(first) == MBOL
8121 first = NEXTOPER(first);
8124 else if (OP(first) == GPOS) {
8125 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8126 first = NEXTOPER(first);
8129 else if ((!sawopen || !RExC_sawback) &&
8131 (OP(first) == STAR &&
8132 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8133 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8135 /* turn .* into ^.* with an implied $*=1 */
8137 (OP(NEXTOPER(first)) == REG_ANY)
8140 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8141 first = NEXTOPER(first);
8144 if (sawplus && !sawminmod && !sawlookahead
8145 && (!sawopen || !RExC_sawback)
8146 && !pRExC_state->code_blocks) /* May examine pos and $& */
8147 /* x+ must match at the 1st pos of run of x's */
8148 RExC_rx->intflags |= PREGf_SKIP;
8150 /* Scan is after the zeroth branch, first is atomic matcher. */
8151 #ifdef TRIE_STUDY_OPT
8154 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8155 (IV)(first - scan + 1))
8159 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8160 (IV)(first - scan + 1))
8166 * If there's something expensive in the r.e., find the
8167 * longest literal string that must appear and make it the
8168 * regmust. Resolve ties in favor of later strings, since
8169 * the regstart check works with the beginning of the r.e.
8170 * and avoiding duplication strengthens checking. Not a
8171 * strong reason, but sufficient in the absence of others.
8172 * [Now we resolve ties in favor of the earlier string if
8173 * it happens that c_offset_min has been invalidated, since the
8174 * earlier string may buy us something the later one won't.]
8177 data.substrs[0].str = newSVpvs("");
8178 data.substrs[1].str = newSVpvs("");
8179 data.last_found = newSVpvs("");
8180 data.cur_is_floating = 0; /* initially any found substring is fixed */
8181 ENTER_with_name("study_chunk");
8182 SAVEFREESV(data.substrs[0].str);
8183 SAVEFREESV(data.substrs[1].str);
8184 SAVEFREESV(data.last_found);
8186 if (!RExC_rxi->regstclass) {
8187 ssc_init(pRExC_state, &ch_class);
8188 data.start_class = &ch_class;
8189 stclass_flag = SCF_DO_STCLASS_AND;
8190 } else /* XXXX Check for BOUND? */
8192 data.last_closep = &last_close;
8196 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8197 * (NO top level branches)
8199 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8200 scan + RExC_size, /* Up to end */
8202 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8203 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8207 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8210 if ( RExC_total_parens == 1 && !data.cur_is_floating
8211 && data.last_start_min == 0 && data.last_end > 0
8212 && !RExC_seen_zerolen
8213 && !(RExC_seen & REG_VERBARG_SEEN)
8214 && !(RExC_seen & REG_GPOS_SEEN)
8216 RExC_rx->extflags |= RXf_CHECK_ALL;
8218 scan_commit(pRExC_state, &data,&minlen, 0);
8221 /* XXX this is done in reverse order because that's the way the
8222 * code was before it was parameterised. Don't know whether it
8223 * actually needs doing in reverse order. DAPM */
8224 for (i = 1; i >= 0; i--) {
8225 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8228 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8229 && data.substrs[0].min_offset
8230 == data.substrs[1].min_offset
8231 && SvCUR(data.substrs[0].str)
8232 == SvCUR(data.substrs[1].str)
8234 && S_setup_longest (aTHX_ pRExC_state,
8235 &(RExC_rx->substrs->data[i]),
8239 RExC_rx->substrs->data[i].min_offset =
8240 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8242 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8243 /* Don't offset infinity */
8244 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8245 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8246 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8249 RExC_rx->substrs->data[i].substr = NULL;
8250 RExC_rx->substrs->data[i].utf8_substr = NULL;
8251 longest_length[i] = 0;
8255 LEAVE_with_name("study_chunk");
8257 if (RExC_rxi->regstclass
8258 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8259 RExC_rxi->regstclass = NULL;
8261 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8262 || RExC_rx->substrs->data[0].min_offset)
8264 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8265 && is_ssc_worth_it(pRExC_state, data.start_class))
8267 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8269 ssc_finalize(pRExC_state, data.start_class);
8271 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8272 StructCopy(data.start_class,
8273 (regnode_ssc*)RExC_rxi->data->data[n],
8275 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8276 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8277 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8278 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8279 Perl_re_printf( aTHX_
8280 "synthetic stclass \"%s\".\n",
8281 SvPVX_const(sv));});
8282 data.start_class = NULL;
8285 /* A temporary algorithm prefers floated substr to fixed one of
8286 * same length to dig more info. */
8287 i = (longest_length[0] <= longest_length[1]);
8288 RExC_rx->substrs->check_ix = i;
8289 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8290 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8291 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8292 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8293 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8294 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8295 RExC_rx->intflags |= PREGf_NOSCAN;
8297 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8298 RExC_rx->extflags |= RXf_USE_INTUIT;
8299 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8300 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8303 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8304 if ( (STRLEN)minlen < longest_length[1] )
8305 minlen= longest_length[1];
8306 if ( (STRLEN)minlen < longest_length[0] )
8307 minlen= longest_length[0];
8311 /* Several toplevels. Best we can is to set minlen. */
8313 regnode_ssc ch_class;
8314 SSize_t last_close = 0;
8316 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8318 scan = RExC_rxi->program + 1;
8319 ssc_init(pRExC_state, &ch_class);
8320 data.start_class = &ch_class;
8321 data.last_closep = &last_close;
8325 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8326 * (patterns WITH top level branches)
8328 minlen = study_chunk(pRExC_state,
8329 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8330 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8331 ? SCF_TRIE_DOING_RESTUDY
8335 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8337 RExC_rx->check_substr = NULL;
8338 RExC_rx->check_utf8 = NULL;
8339 RExC_rx->substrs->data[0].substr = NULL;
8340 RExC_rx->substrs->data[0].utf8_substr = NULL;
8341 RExC_rx->substrs->data[1].substr = NULL;
8342 RExC_rx->substrs->data[1].utf8_substr = NULL;
8344 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8345 && is_ssc_worth_it(pRExC_state, data.start_class))
8347 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8349 ssc_finalize(pRExC_state, data.start_class);
8351 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8352 StructCopy(data.start_class,
8353 (regnode_ssc*)RExC_rxi->data->data[n],
8355 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8356 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8357 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8358 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8359 Perl_re_printf( aTHX_
8360 "synthetic stclass \"%s\".\n",
8361 SvPVX_const(sv));});
8362 data.start_class = NULL;
8366 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8367 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8368 RExC_rx->maxlen = REG_INFTY;
8371 RExC_rx->maxlen = RExC_maxlen;
8374 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8375 the "real" pattern. */
8377 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8378 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8380 RExC_rx->minlenret = minlen;
8381 if (RExC_rx->minlen < minlen)
8382 RExC_rx->minlen = minlen;
8384 if (RExC_seen & REG_RECURSE_SEEN ) {
8385 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8386 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8388 if (RExC_seen & REG_GPOS_SEEN)
8389 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8390 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8391 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8393 if (pRExC_state->code_blocks)
8394 RExC_rx->extflags |= RXf_EVAL_SEEN;
8395 if (RExC_seen & REG_VERBARG_SEEN)
8397 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8398 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8400 if (RExC_seen & REG_CUTGROUP_SEEN)
8401 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8402 if (pm_flags & PMf_USE_RE_EVAL)
8403 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8404 if (RExC_paren_names)
8405 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8407 RXp_PAREN_NAMES(RExC_rx) = NULL;
8409 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8410 * so it can be used in pp.c */
8411 if (RExC_rx->intflags & PREGf_ANCH)
8412 RExC_rx->extflags |= RXf_IS_ANCHORED;
8416 /* this is used to identify "special" patterns that might result
8417 * in Perl NOT calling the regex engine and instead doing the match "itself",
8418 * particularly special cases in split//. By having the regex compiler
8419 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8420 * we avoid weird issues with equivalent patterns resulting in different behavior,
8421 * AND we allow non Perl engines to get the same optimizations by the setting the
8422 * flags appropriately - Yves */
8423 regnode *first = RExC_rxi->program + 1;
8425 regnode *next = regnext(first);
8428 if (PL_regkind[fop] == NOTHING && nop == END)
8429 RExC_rx->extflags |= RXf_NULL;
8430 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8431 /* when fop is SBOL first->flags will be true only when it was
8432 * produced by parsing /\A/, and not when parsing /^/. This is
8433 * very important for the split code as there we want to
8434 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8435 * See rt #122761 for more details. -- Yves */
8436 RExC_rx->extflags |= RXf_START_ONLY;
8437 else if (fop == PLUS
8438 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8440 RExC_rx->extflags |= RXf_WHITE;
8441 else if ( RExC_rx->extflags & RXf_SPLIT
8442 && ( fop == EXACT || fop == LEXACT
8443 || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8445 && STR_LEN(first) == 1
8446 && *(STRING(first)) == ' '
8448 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8452 if (RExC_contains_locale) {
8453 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8457 if (RExC_paren_names) {
8458 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8459 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8460 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8463 RExC_rxi->name_list_idx = 0;
8465 while ( RExC_recurse_count > 0 ) {
8466 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8468 * This data structure is set up in study_chunk() and is used
8469 * to calculate the distance between a GOSUB regopcode and
8470 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8473 * If for some reason someone writes code that optimises
8474 * away a GOSUB opcode then the assert should be changed to
8475 * an if(scan) to guard the ARG2L_SET() - Yves
8478 assert(scan && OP(scan) == GOSUB);
8479 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8482 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8483 /* assume we don't need to swap parens around before we match */
8485 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8486 (unsigned long)RExC_study_chunk_recursed_count);
8490 Perl_re_printf( aTHX_ "Final program:\n");
8494 if (RExC_open_parens) {
8495 Safefree(RExC_open_parens);
8496 RExC_open_parens = NULL;
8498 if (RExC_close_parens) {
8499 Safefree(RExC_close_parens);
8500 RExC_close_parens = NULL;
8504 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8505 * by setting the regexp SV to readonly-only instead. If the
8506 * pattern's been recompiled, the USEDness should remain. */
8507 if (old_re && SvREADONLY(old_re))
8515 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8518 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8520 PERL_UNUSED_ARG(value);
8522 if (flags & RXapif_FETCH) {
8523 return reg_named_buff_fetch(rx, key, flags);
8524 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8525 Perl_croak_no_modify();
8527 } else if (flags & RXapif_EXISTS) {
8528 return reg_named_buff_exists(rx, key, flags)
8531 } else if (flags & RXapif_REGNAMES) {
8532 return reg_named_buff_all(rx, flags);
8533 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8534 return reg_named_buff_scalar(rx, flags);
8536 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8542 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8545 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8546 PERL_UNUSED_ARG(lastkey);
8548 if (flags & RXapif_FIRSTKEY)
8549 return reg_named_buff_firstkey(rx, flags);
8550 else if (flags & RXapif_NEXTKEY)
8551 return reg_named_buff_nextkey(rx, flags);
8553 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8560 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8564 struct regexp *const rx = ReANY(r);
8566 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8568 if (rx && RXp_PAREN_NAMES(rx)) {
8569 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8572 SV* sv_dat=HeVAL(he_str);
8573 I32 *nums=(I32*)SvPVX(sv_dat);
8574 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8575 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8576 if ((I32)(rx->nparens) >= nums[i]
8577 && rx->offs[nums[i]].start != -1
8578 && rx->offs[nums[i]].end != -1)
8581 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8586 ret = newSVsv(&PL_sv_undef);
8589 av_push(retarray, ret);
8592 return newRV_noinc(MUTABLE_SV(retarray));
8599 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8602 struct regexp *const rx = ReANY(r);
8604 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8606 if (rx && RXp_PAREN_NAMES(rx)) {
8607 if (flags & RXapif_ALL) {
8608 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8610 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8612 SvREFCNT_dec_NN(sv);
8624 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8626 struct regexp *const rx = ReANY(r);
8628 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8630 if ( rx && RXp_PAREN_NAMES(rx) ) {
8631 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8633 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8640 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8642 struct regexp *const rx = ReANY(r);
8643 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8645 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8647 if (rx && RXp_PAREN_NAMES(rx)) {
8648 HV *hv = RXp_PAREN_NAMES(rx);
8650 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8653 SV* sv_dat = HeVAL(temphe);
8654 I32 *nums = (I32*)SvPVX(sv_dat);
8655 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8656 if ((I32)(rx->lastparen) >= nums[i] &&
8657 rx->offs[nums[i]].start != -1 &&
8658 rx->offs[nums[i]].end != -1)
8664 if (parno || flags & RXapif_ALL) {
8665 return newSVhek(HeKEY_hek(temphe));
8673 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8678 struct regexp *const rx = ReANY(r);
8680 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8682 if (rx && RXp_PAREN_NAMES(rx)) {
8683 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8684 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8685 } else if (flags & RXapif_ONE) {
8686 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8687 av = MUTABLE_AV(SvRV(ret));
8688 length = av_tindex(av);
8689 SvREFCNT_dec_NN(ret);
8690 return newSViv(length + 1);
8692 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8697 return &PL_sv_undef;
8701 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8703 struct regexp *const rx = ReANY(r);
8706 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8708 if (rx && RXp_PAREN_NAMES(rx)) {
8709 HV *hv= RXp_PAREN_NAMES(rx);
8711 (void)hv_iterinit(hv);
8712 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8715 SV* sv_dat = HeVAL(temphe);
8716 I32 *nums = (I32*)SvPVX(sv_dat);
8717 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8718 if ((I32)(rx->lastparen) >= nums[i] &&
8719 rx->offs[nums[i]].start != -1 &&
8720 rx->offs[nums[i]].end != -1)
8726 if (parno || flags & RXapif_ALL) {
8727 av_push(av, newSVhek(HeKEY_hek(temphe)));
8732 return newRV_noinc(MUTABLE_SV(av));
8736 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8739 struct regexp *const rx = ReANY(r);
8745 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8747 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8748 || n == RX_BUFF_IDX_CARET_FULLMATCH
8749 || n == RX_BUFF_IDX_CARET_POSTMATCH
8752 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8754 /* on something like
8757 * the KEEPCOPY is set on the PMOP rather than the regex */
8758 if (PL_curpm && r == PM_GETRE(PL_curpm))
8759 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8768 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8769 /* no need to distinguish between them any more */
8770 n = RX_BUFF_IDX_FULLMATCH;
8772 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8773 && rx->offs[0].start != -1)
8775 /* $`, ${^PREMATCH} */
8776 i = rx->offs[0].start;
8780 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8781 && rx->offs[0].end != -1)
8783 /* $', ${^POSTMATCH} */
8784 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8785 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8788 if (inRANGE(n, 0, (I32)rx->nparens) &&
8789 (s1 = rx->offs[n].start) != -1 &&
8790 (t1 = rx->offs[n].end) != -1)
8792 /* $&, ${^MATCH}, $1 ... */
8794 s = rx->subbeg + s1 - rx->suboffset;
8799 assert(s >= rx->subbeg);
8800 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8802 #ifdef NO_TAINT_SUPPORT
8803 sv_setpvn(sv, s, i);
8805 const int oldtainted = TAINT_get;
8807 sv_setpvn(sv, s, i);
8808 TAINT_set(oldtainted);
8810 if (RXp_MATCH_UTF8(rx))
8815 if (RXp_MATCH_TAINTED(rx)) {
8816 if (SvTYPE(sv) >= SVt_PVMG) {
8817 MAGIC* const mg = SvMAGIC(sv);
8820 SvMAGIC_set(sv, mg->mg_moremagic);
8822 if ((mgt = SvMAGIC(sv))) {
8823 mg->mg_moremagic = mgt;
8824 SvMAGIC_set(sv, mg);
8841 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8842 SV const * const value)
8844 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8846 PERL_UNUSED_ARG(rx);
8847 PERL_UNUSED_ARG(paren);
8848 PERL_UNUSED_ARG(value);
8851 Perl_croak_no_modify();
8855 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8858 struct regexp *const rx = ReANY(r);
8862 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8864 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8865 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8866 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8869 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8871 /* on something like
8874 * the KEEPCOPY is set on the PMOP rather than the regex */
8875 if (PL_curpm && r == PM_GETRE(PL_curpm))
8876 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8882 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8884 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8885 case RX_BUFF_IDX_PREMATCH: /* $` */
8886 if (rx->offs[0].start != -1) {
8887 i = rx->offs[0].start;
8896 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8897 case RX_BUFF_IDX_POSTMATCH: /* $' */
8898 if (rx->offs[0].end != -1) {
8899 i = rx->sublen - rx->offs[0].end;
8901 s1 = rx->offs[0].end;
8908 default: /* $& / ${^MATCH}, $1, $2, ... */
8909 if (paren <= (I32)rx->nparens &&
8910 (s1 = rx->offs[paren].start) != -1 &&
8911 (t1 = rx->offs[paren].end) != -1)
8917 if (ckWARN(WARN_UNINITIALIZED))
8918 report_uninit((const SV *)sv);
8923 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8924 const char * const s = rx->subbeg - rx->suboffset + s1;
8929 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8936 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8938 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8939 PERL_UNUSED_ARG(rx);
8943 return newSVpvs("Regexp");
8946 /* Scans the name of a named buffer from the pattern.
8947 * If flags is REG_RSN_RETURN_NULL returns null.
8948 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8949 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8950 * to the parsed name as looked up in the RExC_paren_names hash.
8951 * If there is an error throws a vFAIL().. type exception.
8954 #define REG_RSN_RETURN_NULL 0
8955 #define REG_RSN_RETURN_NAME 1
8956 #define REG_RSN_RETURN_DATA 2
8959 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8961 char *name_start = RExC_parse;
8964 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8966 assert (RExC_parse <= RExC_end);
8967 if (RExC_parse == RExC_end) NOOP;
8968 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8969 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8970 * using do...while */
8973 RExC_parse += UTF8SKIP(RExC_parse);
8974 } while ( RExC_parse < RExC_end
8975 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8979 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8981 RExC_parse++; /* so the <- from the vFAIL is after the offending
8983 vFAIL("Group name must start with a non-digit word character");
8985 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8986 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8987 if ( flags == REG_RSN_RETURN_NAME)
8989 else if (flags==REG_RSN_RETURN_DATA) {
8992 if ( ! sv_name ) /* should not happen*/
8993 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8994 if (RExC_paren_names)
8995 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8997 sv_dat = HeVAL(he_str);
8998 if ( ! sv_dat ) { /* Didn't find group */
9000 /* It might be a forward reference; we can't fail until we
9001 * know, by completing the parse to get all the groups, and
9003 if (ALL_PARENS_COUNTED) {
9004 vFAIL("Reference to nonexistent named group");
9007 REQUIRE_PARENS_PASS;
9013 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9014 (unsigned long) flags);
9017 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9018 if (RExC_lastparse!=RExC_parse) { \
9019 Perl_re_printf( aTHX_ "%s", \
9020 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9021 RExC_end - RExC_parse, 16, \
9023 PERL_PV_ESCAPE_UNI_DETECT | \
9024 PERL_PV_PRETTY_ELLIPSES | \
9025 PERL_PV_PRETTY_LTGT | \
9026 PERL_PV_ESCAPE_RE | \
9027 PERL_PV_PRETTY_EXACTSIZE \
9031 Perl_re_printf( aTHX_ "%16s",""); \
9033 if (RExC_lastnum!=RExC_emit) \
9034 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9036 Perl_re_printf( aTHX_ "|%4s",""); \
9037 Perl_re_printf( aTHX_ "|%*s%-4s", \
9038 (int)((depth*2)), "", \
9041 RExC_lastnum=RExC_emit; \
9042 RExC_lastparse=RExC_parse; \
9047 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9048 DEBUG_PARSE_MSG((funcname)); \
9049 Perl_re_printf( aTHX_ "%4s","\n"); \
9051 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9052 DEBUG_PARSE_MSG((funcname)); \
9053 Perl_re_printf( aTHX_ fmt "\n",args); \
9056 /* This section of code defines the inversion list object and its methods. The
9057 * interfaces are highly subject to change, so as much as possible is static to
9058 * this file. An inversion list is here implemented as a malloc'd C UV array
9059 * as an SVt_INVLIST scalar.
9061 * An inversion list for Unicode is an array of code points, sorted by ordinal
9062 * number. Each element gives the code point that begins a range that extends
9063 * up-to but not including the code point given by the next element. The final
9064 * element gives the first code point of a range that extends to the platform's
9065 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9066 * ...) give ranges whose code points are all in the inversion list. We say
9067 * that those ranges are in the set. The odd-numbered elements give ranges
9068 * whose code points are not in the inversion list, and hence not in the set.
9069 * Thus, element [0] is the first code point in the list. Element [1]
9070 * is the first code point beyond that not in the list; and element [2] is the
9071 * first code point beyond that that is in the list. In other words, the first
9072 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9073 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9074 * all code points in that range are not in the inversion list. The third
9075 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9076 * list, and so forth. Thus every element whose index is divisible by two
9077 * gives the beginning of a range that is in the list, and every element whose
9078 * index is not divisible by two gives the beginning of a range not in the
9079 * list. If the final element's index is divisible by two, the inversion list
9080 * extends to the platform's infinity; otherwise the highest code point in the
9081 * inversion list is the contents of that element minus 1.
9083 * A range that contains just a single code point N will look like
9085 * invlist[i+1] == N+1
9087 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9088 * impossible to represent, so element [i+1] is omitted. The single element
9090 * invlist[0] == UV_MAX
9091 * contains just UV_MAX, but is interpreted as matching to infinity.
9093 * Taking the complement (inverting) an inversion list is quite simple, if the
9094 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9095 * This implementation reserves an element at the beginning of each inversion
9096 * list to always contain 0; there is an additional flag in the header which
9097 * indicates if the list begins at the 0, or is offset to begin at the next
9098 * element. This means that the inversion list can be inverted without any
9099 * copying; just flip the flag.
9101 * More about inversion lists can be found in "Unicode Demystified"
9102 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9104 * The inversion list data structure is currently implemented as an SV pointing
9105 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9106 * array of UV whose memory management is automatically handled by the existing
9107 * facilities for SV's.
9109 * Some of the methods should always be private to the implementation, and some
9110 * should eventually be made public */
9112 /* The header definitions are in F<invlist_inline.h> */
9114 #ifndef PERL_IN_XSUB_RE
9116 PERL_STATIC_INLINE UV*
9117 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9119 /* Returns a pointer to the first element in the inversion list's array.
9120 * This is called upon initialization of an inversion list. Where the
9121 * array begins depends on whether the list has the code point U+0000 in it
9122 * or not. The other parameter tells it whether the code that follows this
9123 * call is about to put a 0 in the inversion list or not. The first
9124 * element is either the element reserved for 0, if TRUE, or the element
9125 * after it, if FALSE */
9127 bool* offset = get_invlist_offset_addr(invlist);
9128 UV* zero_addr = (UV *) SvPVX(invlist);
9130 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9133 assert(! _invlist_len(invlist));
9137 /* 1^1 = 0; 1^0 = 1 */
9138 *offset = 1 ^ will_have_0;
9139 return zero_addr + *offset;
9143 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9145 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9146 * steals the list from 'src', so 'src' is made to have a NULL list. This
9147 * is similar to what SvSetMagicSV() would do, if it were implemented on
9148 * inversion lists, though this routine avoids a copy */
9150 const UV src_len = _invlist_len(src);
9151 const bool src_offset = *get_invlist_offset_addr(src);
9152 const STRLEN src_byte_len = SvLEN(src);
9153 char * array = SvPVX(src);
9155 const int oldtainted = TAINT_get;
9157 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9159 assert(is_invlist(src));
9160 assert(is_invlist(dest));
9161 assert(! invlist_is_iterating(src));
9162 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9164 /* Make sure it ends in the right place with a NUL, as our inversion list
9165 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9167 array[src_byte_len - 1] = '\0';
9169 TAINT_NOT; /* Otherwise it breaks */
9170 sv_usepvn_flags(dest,
9174 /* This flag is documented to cause a copy to be avoided */
9175 SV_HAS_TRAILING_NUL);
9176 TAINT_set(oldtainted);
9181 /* Finish up copying over the other fields in an inversion list */
9182 *get_invlist_offset_addr(dest) = src_offset;
9183 invlist_set_len(dest, src_len, src_offset);
9184 *get_invlist_previous_index_addr(dest) = 0;
9185 invlist_iterfinish(dest);
9188 PERL_STATIC_INLINE IV*
9189 S_get_invlist_previous_index_addr(SV* invlist)
9191 /* Return the address of the IV that is reserved to hold the cached index
9193 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9195 assert(is_invlist(invlist));
9197 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9200 PERL_STATIC_INLINE IV
9201 S_invlist_previous_index(SV* const invlist)
9203 /* Returns cached index of previous search */
9205 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9207 return *get_invlist_previous_index_addr(invlist);
9210 PERL_STATIC_INLINE void
9211 S_invlist_set_previous_index(SV* const invlist, const IV index)
9213 /* Caches <index> for later retrieval */
9215 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9217 assert(index == 0 || index < (int) _invlist_len(invlist));
9219 *get_invlist_previous_index_addr(invlist) = index;
9222 PERL_STATIC_INLINE void
9223 S_invlist_trim(SV* invlist)
9225 /* Free the not currently-being-used space in an inversion list */
9227 /* But don't free up the space needed for the 0 UV that is always at the
9228 * beginning of the list, nor the trailing NUL */
9229 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9231 PERL_ARGS_ASSERT_INVLIST_TRIM;
9233 assert(is_invlist(invlist));
9235 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9238 PERL_STATIC_INLINE void
9239 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9241 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9243 assert(is_invlist(invlist));
9245 invlist_set_len(invlist, 0, 0);
9246 invlist_trim(invlist);
9249 #endif /* ifndef PERL_IN_XSUB_RE */
9251 PERL_STATIC_INLINE bool
9252 S_invlist_is_iterating(SV* const invlist)
9254 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9256 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9259 #ifndef PERL_IN_XSUB_RE
9261 PERL_STATIC_INLINE UV
9262 S_invlist_max(SV* const invlist)
9264 /* Returns the maximum number of elements storable in the inversion list's
9265 * array, without having to realloc() */
9267 PERL_ARGS_ASSERT_INVLIST_MAX;
9269 assert(is_invlist(invlist));
9271 /* Assumes worst case, in which the 0 element is not counted in the
9272 * inversion list, so subtracts 1 for that */
9273 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9274 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9275 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9279 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9281 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9283 /* First 1 is in case the zero element isn't in the list; second 1 is for
9285 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9286 invlist_set_len(invlist, 0, 0);
9288 /* Force iterinit() to be used to get iteration to work */
9289 invlist_iterfinish(invlist);
9291 *get_invlist_previous_index_addr(invlist) = 0;
9292 SvPOK_on(invlist); /* This allows B to extract the PV */
9296 Perl__new_invlist(pTHX_ IV initial_size)
9299 /* Return a pointer to a newly constructed inversion list, with enough
9300 * space to store 'initial_size' elements. If that number is negative, a
9301 * system default is used instead */
9305 if (initial_size < 0) {
9309 new_list = newSV_type(SVt_INVLIST);
9310 initialize_invlist_guts(new_list, initial_size);
9316 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9318 /* Return a pointer to a newly constructed inversion list, initialized to
9319 * point to <list>, which has to be in the exact correct inversion list
9320 * form, including internal fields. Thus this is a dangerous routine that
9321 * should not be used in the wrong hands. The passed in 'list' contains
9322 * several header fields at the beginning that are not part of the
9323 * inversion list body proper */
9325 const STRLEN length = (STRLEN) list[0];
9326 const UV version_id = list[1];
9327 const bool offset = cBOOL(list[2]);
9328 #define HEADER_LENGTH 3
9329 /* If any of the above changes in any way, you must change HEADER_LENGTH
9330 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9331 * perl -E 'say int(rand 2**31-1)'
9333 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9334 data structure type, so that one being
9335 passed in can be validated to be an
9336 inversion list of the correct vintage.
9339 SV* invlist = newSV_type(SVt_INVLIST);
9341 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9343 if (version_id != INVLIST_VERSION_ID) {
9344 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9347 /* The generated array passed in includes header elements that aren't part
9348 * of the list proper, so start it just after them */
9349 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9351 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9352 shouldn't touch it */
9354 *(get_invlist_offset_addr(invlist)) = offset;
9356 /* The 'length' passed to us is the physical number of elements in the
9357 * inversion list. But if there is an offset the logical number is one
9359 invlist_set_len(invlist, length - offset, offset);
9361 invlist_set_previous_index(invlist, 0);
9363 /* Initialize the iteration pointer. */
9364 invlist_iterfinish(invlist);
9366 SvREADONLY_on(invlist);
9373 S__append_range_to_invlist(pTHX_ SV* const invlist,
9374 const UV start, const UV end)
9376 /* Subject to change or removal. Append the range from 'start' to 'end' at
9377 * the end of the inversion list. The range must be above any existing
9381 UV max = invlist_max(invlist);
9382 UV len = _invlist_len(invlist);
9385 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9387 if (len == 0) { /* Empty lists must be initialized */
9388 offset = start != 0;
9389 array = _invlist_array_init(invlist, ! offset);
9392 /* Here, the existing list is non-empty. The current max entry in the
9393 * list is generally the first value not in the set, except when the
9394 * set extends to the end of permissible values, in which case it is
9395 * the first entry in that final set, and so this call is an attempt to
9396 * append out-of-order */
9398 UV final_element = len - 1;
9399 array = invlist_array(invlist);
9400 if ( array[final_element] > start
9401 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9403 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9404 array[final_element], start,
9405 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9408 /* Here, it is a legal append. If the new range begins 1 above the end
9409 * of the range below it, it is extending the range below it, so the
9410 * new first value not in the set is one greater than the newly
9411 * extended range. */
9412 offset = *get_invlist_offset_addr(invlist);
9413 if (array[final_element] == start) {
9414 if (end != UV_MAX) {
9415 array[final_element] = end + 1;
9418 /* But if the end is the maximum representable on the machine,
9419 * assume that infinity was actually what was meant. Just let
9420 * the range that this would extend to have no end */
9421 invlist_set_len(invlist, len - 1, offset);
9427 /* Here the new range doesn't extend any existing set. Add it */
9429 len += 2; /* Includes an element each for the start and end of range */
9431 /* If wll overflow the existing space, extend, which may cause the array to
9434 invlist_extend(invlist, len);
9436 /* Have to set len here to avoid assert failure in invlist_array() */
9437 invlist_set_len(invlist, len, offset);
9439 array = invlist_array(invlist);
9442 invlist_set_len(invlist, len, offset);
9445 /* The next item on the list starts the range, the one after that is
9446 * one past the new range. */
9447 array[len - 2] = start;
9448 if (end != UV_MAX) {
9449 array[len - 1] = end + 1;
9452 /* But if the end is the maximum representable on the machine, just let
9453 * the range have no end */
9454 invlist_set_len(invlist, len - 1, offset);
9459 Perl__invlist_search(SV* const invlist, const UV cp)
9461 /* Searches the inversion list for the entry that contains the input code
9462 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9463 * return value is the index into the list's array of the range that
9464 * contains <cp>, that is, 'i' such that
9465 * array[i] <= cp < array[i+1]
9470 IV high = _invlist_len(invlist);
9471 const IV highest_element = high - 1;
9474 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9476 /* If list is empty, return failure. */
9481 /* (We can't get the array unless we know the list is non-empty) */
9482 array = invlist_array(invlist);
9484 mid = invlist_previous_index(invlist);
9486 if (mid > highest_element) {
9487 mid = highest_element;
9490 /* <mid> contains the cache of the result of the previous call to this
9491 * function (0 the first time). See if this call is for the same result,
9492 * or if it is for mid-1. This is under the theory that calls to this
9493 * function will often be for related code points that are near each other.
9494 * And benchmarks show that caching gives better results. We also test
9495 * here if the code point is within the bounds of the list. These tests
9496 * replace others that would have had to be made anyway to make sure that
9497 * the array bounds were not exceeded, and these give us extra information
9498 * at the same time */
9499 if (cp >= array[mid]) {
9500 if (cp >= array[highest_element]) {
9501 return highest_element;
9504 /* Here, array[mid] <= cp < array[highest_element]. This means that
9505 * the final element is not the answer, so can exclude it; it also
9506 * means that <mid> is not the final element, so can refer to 'mid + 1'
9508 if (cp < array[mid + 1]) {
9514 else { /* cp < aray[mid] */
9515 if (cp < array[0]) { /* Fail if outside the array */
9519 if (cp >= array[mid - 1]) {
9524 /* Binary search. What we are looking for is <i> such that
9525 * array[i] <= cp < array[i+1]
9526 * The loop below converges on the i+1. Note that there may not be an
9527 * (i+1)th element in the array, and things work nonetheless */
9528 while (low < high) {
9529 mid = (low + high) / 2;
9530 assert(mid <= highest_element);
9531 if (array[mid] <= cp) { /* cp >= array[mid] */
9534 /* We could do this extra test to exit the loop early.
9535 if (cp < array[low]) {
9540 else { /* cp < array[mid] */
9547 invlist_set_previous_index(invlist, high);
9552 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9553 const bool complement_b, SV** output)
9555 /* Take the union of two inversion lists and point '*output' to it. On
9556 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9557 * even 'a' or 'b'). If to an inversion list, the contents of the original
9558 * list will be replaced by the union. The first list, 'a', may be
9559 * NULL, in which case a copy of the second list is placed in '*output'.
9560 * If 'complement_b' is TRUE, the union is taken of the complement
9561 * (inversion) of 'b' instead of b itself.
9563 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9564 * Richard Gillam, published by Addison-Wesley, and explained at some
9565 * length there. The preface says to incorporate its examples into your
9566 * code at your own risk.
9568 * The algorithm is like a merge sort. */
9570 const UV* array_a; /* a's array */
9572 UV len_a; /* length of a's array */
9575 SV* u; /* the resulting union */
9579 UV i_a = 0; /* current index into a's array */
9583 /* running count, as explained in the algorithm source book; items are
9584 * stopped accumulating and are output when the count changes to/from 0.
9585 * The count is incremented when we start a range that's in an input's set,
9586 * and decremented when we start a range that's not in a set. So this
9587 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9588 * and hence nothing goes into the union; 1, just one of the inputs is in
9589 * its set (and its current range gets added to the union); and 2 when both
9590 * inputs are in their sets. */
9593 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9595 assert(*output == NULL || is_invlist(*output));
9597 len_b = _invlist_len(b);
9600 /* Here, 'b' is empty, hence it's complement is all possible code
9601 * points. So if the union includes the complement of 'b', it includes
9602 * everything, and we need not even look at 'a'. It's easiest to
9603 * create a new inversion list that matches everything. */
9605 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9607 if (*output == NULL) { /* If the output didn't exist, just point it
9609 *output = everything;
9611 else { /* Otherwise, replace its contents with the new list */
9612 invlist_replace_list_destroys_src(*output, everything);
9613 SvREFCNT_dec_NN(everything);
9619 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9620 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9621 * output will be empty */
9623 if (a == NULL || _invlist_len(a) == 0) {
9624 if (*output == NULL) {
9625 *output = _new_invlist(0);
9628 invlist_clear(*output);
9633 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9634 * union. We can just return a copy of 'a' if '*output' doesn't point
9635 * to an existing list */
9636 if (*output == NULL) {
9637 *output = invlist_clone(a, NULL);
9641 /* If the output is to overwrite 'a', we have a no-op, as it's
9647 /* Here, '*output' is to be overwritten by 'a' */
9648 u = invlist_clone(a, NULL);
9649 invlist_replace_list_destroys_src(*output, u);
9655 /* Here 'b' is not empty. See about 'a' */
9657 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9659 /* Here, 'a' is empty (and b is not). That means the union will come
9660 * entirely from 'b'. If '*output' is NULL, we can directly return a
9661 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9664 SV ** dest = (*output == NULL) ? output : &u;
9665 *dest = invlist_clone(b, NULL);
9667 _invlist_invert(*dest);
9671 invlist_replace_list_destroys_src(*output, u);
9678 /* Here both lists exist and are non-empty */
9679 array_a = invlist_array(a);
9680 array_b = invlist_array(b);
9682 /* If are to take the union of 'a' with the complement of b, set it
9683 * up so are looking at b's complement. */
9686 /* To complement, we invert: if the first element is 0, remove it. To
9687 * do this, we just pretend the array starts one later */
9688 if (array_b[0] == 0) {
9694 /* But if the first element is not zero, we pretend the list starts
9695 * at the 0 that is always stored immediately before the array. */
9701 /* Size the union for the worst case: that the sets are completely
9703 u = _new_invlist(len_a + len_b);
9705 /* Will contain U+0000 if either component does */
9706 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9707 || (len_b > 0 && array_b[0] == 0));
9709 /* Go through each input list item by item, stopping when have exhausted
9711 while (i_a < len_a && i_b < len_b) {
9712 UV cp; /* The element to potentially add to the union's array */
9713 bool cp_in_set; /* is it in the the input list's set or not */
9715 /* We need to take one or the other of the two inputs for the union.
9716 * Since we are merging two sorted lists, we take the smaller of the
9717 * next items. In case of a tie, we take first the one that is in its
9718 * set. If we first took the one not in its set, it would decrement
9719 * the count, possibly to 0 which would cause it to be output as ending
9720 * the range, and the next time through we would take the same number,
9721 * and output it again as beginning the next range. By doing it the
9722 * opposite way, there is no possibility that the count will be
9723 * momentarily decremented to 0, and thus the two adjoining ranges will
9724 * be seamlessly merged. (In a tie and both are in the set or both not
9725 * in the set, it doesn't matter which we take first.) */
9726 if ( array_a[i_a] < array_b[i_b]
9727 || ( array_a[i_a] == array_b[i_b]
9728 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9730 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9731 cp = array_a[i_a++];
9734 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9735 cp = array_b[i_b++];
9738 /* Here, have chosen which of the two inputs to look at. Only output
9739 * if the running count changes to/from 0, which marks the
9740 * beginning/end of a range that's in the set */
9743 array_u[i_u++] = cp;
9750 array_u[i_u++] = cp;
9756 /* The loop above increments the index into exactly one of the input lists
9757 * each iteration, and ends when either index gets to its list end. That
9758 * means the other index is lower than its end, and so something is
9759 * remaining in that one. We decrement 'count', as explained below, if
9760 * that list is in its set. (i_a and i_b each currently index the element
9761 * beyond the one we care about.) */
9762 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9763 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9768 /* Above we decremented 'count' if the list that had unexamined elements in
9769 * it was in its set. This has made it so that 'count' being non-zero
9770 * means there isn't anything left to output; and 'count' equal to 0 means
9771 * that what is left to output is precisely that which is left in the
9772 * non-exhausted input list.
9774 * To see why, note first that the exhausted input obviously has nothing
9775 * left to add to the union. If it was in its set at its end, that means
9776 * the set extends from here to the platform's infinity, and hence so does
9777 * the union and the non-exhausted set is irrelevant. The exhausted set
9778 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9779 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9780 * 'count' remains at 1. This is consistent with the decremented 'count'
9781 * != 0 meaning there's nothing left to add to the union.
9783 * But if the exhausted input wasn't in its set, it contributed 0 to
9784 * 'count', and the rest of the union will be whatever the other input is.
9785 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9786 * otherwise it gets decremented to 0. This is consistent with 'count'
9787 * == 0 meaning the remainder of the union is whatever is left in the
9788 * non-exhausted list. */
9793 IV copy_count = len_a - i_a;
9794 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9795 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9797 else { /* The non-exhausted input is b */
9798 copy_count = len_b - i_b;
9799 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9801 len_u = i_u + copy_count;
9804 /* Set the result to the final length, which can change the pointer to
9805 * array_u, so re-find it. (Note that it is unlikely that this will
9806 * change, as we are shrinking the space, not enlarging it) */
9807 if (len_u != _invlist_len(u)) {
9808 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9810 array_u = invlist_array(u);
9813 if (*output == NULL) { /* Simply return the new inversion list */
9817 /* Otherwise, overwrite the inversion list that was in '*output'. We
9818 * could instead free '*output', and then set it to 'u', but experience
9819 * has shown [perl #127392] that if the input is a mortal, we can get a
9820 * huge build-up of these during regex compilation before they get
9822 invlist_replace_list_destroys_src(*output, u);
9830 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9831 const bool complement_b, SV** i)
9833 /* Take the intersection of two inversion lists and point '*i' to it. On
9834 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9835 * even 'a' or 'b'). If to an inversion list, the contents of the original
9836 * list will be replaced by the intersection. The first list, 'a', may be
9837 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9838 * TRUE, the result will be the intersection of 'a' and the complement (or
9839 * inversion) of 'b' instead of 'b' directly.
9841 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9842 * Richard Gillam, published by Addison-Wesley, and explained at some
9843 * length there. The preface says to incorporate its examples into your
9844 * code at your own risk. In fact, it had bugs
9846 * The algorithm is like a merge sort, and is essentially the same as the
9850 const UV* array_a; /* a's array */
9852 UV len_a; /* length of a's array */
9855 SV* r; /* the resulting intersection */
9859 UV i_a = 0; /* current index into a's array */
9863 /* running count of how many of the two inputs are postitioned at ranges
9864 * that are in their sets. As explained in the algorithm source book,
9865 * items are stopped accumulating and are output when the count changes
9866 * to/from 2. The count is incremented when we start a range that's in an
9867 * input's set, and decremented when we start a range that's not in a set.
9868 * Only when it is 2 are we in the intersection. */
9871 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9873 assert(*i == NULL || is_invlist(*i));
9875 /* Special case if either one is empty */
9876 len_a = (a == NULL) ? 0 : _invlist_len(a);
9877 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9878 if (len_a != 0 && complement_b) {
9880 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9881 * must be empty. Here, also we are using 'b's complement, which
9882 * hence must be every possible code point. Thus the intersection
9885 if (*i == a) { /* No-op */
9890 *i = invlist_clone(a, NULL);
9894 r = invlist_clone(a, NULL);
9895 invlist_replace_list_destroys_src(*i, r);
9900 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9901 * intersection must be empty */
9903 *i = _new_invlist(0);
9911 /* Here both lists exist and are non-empty */
9912 array_a = invlist_array(a);
9913 array_b = invlist_array(b);
9915 /* If are to take the intersection of 'a' with the complement of b, set it
9916 * up so are looking at b's complement. */
9919 /* To complement, we invert: if the first element is 0, remove it. To
9920 * do this, we just pretend the array starts one later */
9921 if (array_b[0] == 0) {
9927 /* But if the first element is not zero, we pretend the list starts
9928 * at the 0 that is always stored immediately before the array. */
9934 /* Size the intersection for the worst case: that the intersection ends up
9935 * fragmenting everything to be completely disjoint */
9936 r= _new_invlist(len_a + len_b);
9938 /* Will contain U+0000 iff both components do */
9939 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9940 && len_b > 0 && array_b[0] == 0);
9942 /* Go through each list item by item, stopping when have exhausted one of
9944 while (i_a < len_a && i_b < len_b) {
9945 UV cp; /* The element to potentially add to the intersection's
9947 bool cp_in_set; /* Is it in the input list's set or not */
9949 /* We need to take one or the other of the two inputs for the
9950 * intersection. Since we are merging two sorted lists, we take the
9951 * smaller of the next items. In case of a tie, we take first the one
9952 * that is not in its set (a difference from the union algorithm). If
9953 * we first took the one in its set, it would increment the count,
9954 * possibly to 2 which would cause it to be output as starting a range
9955 * in the intersection, and the next time through we would take that
9956 * same number, and output it again as ending the set. By doing the
9957 * opposite of this, there is no possibility that the count will be
9958 * momentarily incremented to 2. (In a tie and both are in the set or
9959 * both not in the set, it doesn't matter which we take first.) */
9960 if ( array_a[i_a] < array_b[i_b]
9961 || ( array_a[i_a] == array_b[i_b]
9962 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9964 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9965 cp = array_a[i_a++];
9968 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9972 /* Here, have chosen which of the two inputs to look at. Only output
9973 * if the running count changes to/from 2, which marks the
9974 * beginning/end of a range that's in the intersection */
9978 array_r[i_r++] = cp;
9983 array_r[i_r++] = cp;
9990 /* The loop above increments the index into exactly one of the input lists
9991 * each iteration, and ends when either index gets to its list end. That
9992 * means the other index is lower than its end, and so something is
9993 * remaining in that one. We increment 'count', as explained below, if the
9994 * exhausted list was in its set. (i_a and i_b each currently index the
9995 * element beyond the one we care about.) */
9996 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9997 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10002 /* Above we incremented 'count' if the exhausted list was in its set. This
10003 * has made it so that 'count' being below 2 means there is nothing left to
10004 * output; otheriwse what's left to add to the intersection is precisely
10005 * that which is left in the non-exhausted input list.
10007 * To see why, note first that the exhausted input obviously has nothing
10008 * left to affect the intersection. If it was in its set at its end, that
10009 * means the set extends from here to the platform's infinity, and hence
10010 * anything in the non-exhausted's list will be in the intersection, and
10011 * anything not in it won't be. Hence, the rest of the intersection is
10012 * precisely what's in the non-exhausted list The exhausted set also
10013 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10014 * it means 'count' is now at least 2. This is consistent with the
10015 * incremented 'count' being >= 2 means to add the non-exhausted list to
10016 * the intersection.
10018 * But if the exhausted input wasn't in its set, it contributed 0 to
10019 * 'count', and the intersection can't include anything further; the
10020 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10021 * incremented. This is consistent with 'count' being < 2 meaning nothing
10022 * further to add to the intersection. */
10023 if (count < 2) { /* Nothing left to put in the intersection. */
10026 else { /* copy the non-exhausted list, unchanged. */
10027 IV copy_count = len_a - i_a;
10028 if (copy_count > 0) { /* a is the one with stuff left */
10029 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10031 else { /* b is the one with stuff left */
10032 copy_count = len_b - i_b;
10033 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10035 len_r = i_r + copy_count;
10038 /* Set the result to the final length, which can change the pointer to
10039 * array_r, so re-find it. (Note that it is unlikely that this will
10040 * change, as we are shrinking the space, not enlarging it) */
10041 if (len_r != _invlist_len(r)) {
10042 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10044 array_r = invlist_array(r);
10047 if (*i == NULL) { /* Simply return the calculated intersection */
10050 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10051 instead free '*i', and then set it to 'r', but experience has
10052 shown [perl #127392] that if the input is a mortal, we can get a
10053 huge build-up of these during regex compilation before they get
10056 invlist_replace_list_destroys_src(*i, r);
10061 SvREFCNT_dec_NN(r);
10068 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10070 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10071 * set. A pointer to the inversion list is returned. This may actually be
10072 * a new list, in which case the passed in one has been destroyed. The
10073 * passed-in inversion list can be NULL, in which case a new one is created
10074 * with just the one range in it. The new list is not necessarily
10075 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10076 * result of this function. The gain would not be large, and in many
10077 * cases, this is called multiple times on a single inversion list, so
10078 * anything freed may almost immediately be needed again.
10080 * This used to mostly call the 'union' routine, but that is much more
10081 * heavyweight than really needed for a single range addition */
10083 UV* array; /* The array implementing the inversion list */
10084 UV len; /* How many elements in 'array' */
10085 SSize_t i_s; /* index into the invlist array where 'start'
10087 SSize_t i_e = 0; /* And the index where 'end' should go */
10088 UV cur_highest; /* The highest code point in the inversion list
10089 upon entry to this function */
10091 /* This range becomes the whole inversion list if none already existed */
10092 if (invlist == NULL) {
10093 invlist = _new_invlist(2);
10094 _append_range_to_invlist(invlist, start, end);
10098 /* Likewise, if the inversion list is currently empty */
10099 len = _invlist_len(invlist);
10101 _append_range_to_invlist(invlist, start, end);
10105 /* Starting here, we have to know the internals of the list */
10106 array = invlist_array(invlist);
10108 /* If the new range ends higher than the current highest ... */
10109 cur_highest = invlist_highest(invlist);
10110 if (end > cur_highest) {
10112 /* If the whole range is higher, we can just append it */
10113 if (start > cur_highest) {
10114 _append_range_to_invlist(invlist, start, end);
10118 /* Otherwise, add the portion that is higher ... */
10119 _append_range_to_invlist(invlist, cur_highest + 1, end);
10121 /* ... and continue on below to handle the rest. As a result of the
10122 * above append, we know that the index of the end of the range is the
10123 * final even numbered one of the array. Recall that the final element
10124 * always starts a range that extends to infinity. If that range is in
10125 * the set (meaning the set goes from here to infinity), it will be an
10126 * even index, but if it isn't in the set, it's odd, and the final
10127 * range in the set is one less, which is even. */
10128 if (end == UV_MAX) {
10136 /* We have dealt with appending, now see about prepending. If the new
10137 * range starts lower than the current lowest ... */
10138 if (start < array[0]) {
10140 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10141 * Let the union code handle it, rather than having to know the
10142 * trickiness in two code places. */
10143 if (UNLIKELY(start == 0)) {
10146 range_invlist = _new_invlist(2);
10147 _append_range_to_invlist(range_invlist, start, end);
10149 _invlist_union(invlist, range_invlist, &invlist);
10151 SvREFCNT_dec_NN(range_invlist);
10156 /* If the whole new range comes before the first entry, and doesn't
10157 * extend it, we have to insert it as an additional range */
10158 if (end < array[0] - 1) {
10160 goto splice_in_new_range;
10163 /* Here the new range adjoins the existing first range, extending it
10167 /* And continue on below to handle the rest. We know that the index of
10168 * the beginning of the range is the first one of the array */
10171 else { /* Not prepending any part of the new range to the existing list.
10172 * Find where in the list it should go. This finds i_s, such that:
10173 * invlist[i_s] <= start < array[i_s+1]
10175 i_s = _invlist_search(invlist, start);
10178 /* At this point, any extending before the beginning of the inversion list
10179 * and/or after the end has been done. This has made it so that, in the
10180 * code below, each endpoint of the new range is either in a range that is
10181 * in the set, or is in a gap between two ranges that are. This means we
10182 * don't have to worry about exceeding the array bounds.
10184 * Find where in the list the new range ends (but we can skip this if we
10185 * have already determined what it is, or if it will be the same as i_s,
10186 * which we already have computed) */
10188 i_e = (start == end)
10190 : _invlist_search(invlist, end);
10193 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10194 * is a range that goes to infinity there is no element at invlist[i_e+1],
10195 * so only the first relation holds. */
10197 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10199 /* Here, the ranges on either side of the beginning of the new range
10200 * are in the set, and this range starts in the gap between them.
10202 * The new range extends the range above it downwards if the new range
10203 * ends at or above that range's start */
10204 const bool extends_the_range_above = ( end == UV_MAX
10205 || end + 1 >= array[i_s+1]);
10207 /* The new range extends the range below it upwards if it begins just
10208 * after where that range ends */
10209 if (start == array[i_s]) {
10211 /* If the new range fills the entire gap between the other ranges,
10212 * they will get merged together. Other ranges may also get
10213 * merged, depending on how many of them the new range spans. In
10214 * the general case, we do the merge later, just once, after we
10215 * figure out how many to merge. But in the case where the new
10216 * range exactly spans just this one gap (possibly extending into
10217 * the one above), we do the merge here, and an early exit. This
10218 * is done here to avoid having to special case later. */
10219 if (i_e - i_s <= 1) {
10221 /* If i_e - i_s == 1, it means that the new range terminates
10222 * within the range above, and hence 'extends_the_range_above'
10223 * must be true. (If the range above it extends to infinity,
10224 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10225 * will be 0, so no harm done.) */
10226 if (extends_the_range_above) {
10227 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10228 invlist_set_len(invlist,
10230 *(get_invlist_offset_addr(invlist)));
10234 /* Here, i_e must == i_s. We keep them in sync, as they apply
10235 * to the same range, and below we are about to decrement i_s
10240 /* Here, the new range is adjacent to the one below. (It may also
10241 * span beyond the range above, but that will get resolved later.)
10242 * Extend the range below to include this one. */
10243 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10245 start = array[i_s];
10247 else if (extends_the_range_above) {
10249 /* Here the new range only extends the range above it, but not the
10250 * one below. It merges with the one above. Again, we keep i_e
10251 * and i_s in sync if they point to the same range */
10256 array[i_s] = start;
10260 /* Here, we've dealt with the new range start extending any adjoining
10263 * If the new range extends to infinity, it is now the final one,
10264 * regardless of what was there before */
10265 if (UNLIKELY(end == UV_MAX)) {
10266 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10270 /* If i_e started as == i_s, it has also been dealt with,
10271 * and been updated to the new i_s, which will fail the following if */
10272 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10274 /* Here, the ranges on either side of the end of the new range are in
10275 * the set, and this range ends in the gap between them.
10277 * If this range is adjacent to (hence extends) the range above it, it
10278 * becomes part of that range; likewise if it extends the range below,
10279 * it becomes part of that range */
10280 if (end + 1 == array[i_e+1]) {
10282 array[i_e] = start;
10284 else if (start <= array[i_e]) {
10285 array[i_e] = end + 1;
10292 /* If the range fits entirely in an existing range (as possibly already
10293 * extended above), it doesn't add anything new */
10294 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10298 /* Here, no part of the range is in the list. Must add it. It will
10299 * occupy 2 more slots */
10300 splice_in_new_range:
10302 invlist_extend(invlist, len + 2);
10303 array = invlist_array(invlist);
10304 /* Move the rest of the array down two slots. Don't include any
10306 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10308 /* Do the actual splice */
10309 array[i_e+1] = start;
10310 array[i_e+2] = end + 1;
10311 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10315 /* Here the new range crossed the boundaries of a pre-existing range. The
10316 * code above has adjusted things so that both ends are in ranges that are
10317 * in the set. This means everything in between must also be in the set.
10318 * Just squash things together */
10319 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10320 invlist_set_len(invlist,
10322 *(get_invlist_offset_addr(invlist)));
10328 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10329 UV** other_elements_ptr)
10331 /* Create and return an inversion list whose contents are to be populated
10332 * by the caller. The caller gives the number of elements (in 'size') and
10333 * the very first element ('element0'). This function will set
10334 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10335 * are to be placed.
10337 * Obviously there is some trust involved that the caller will properly
10338 * fill in the other elements of the array.
10340 * (The first element needs to be passed in, as the underlying code does
10341 * things differently depending on whether it is zero or non-zero) */
10343 SV* invlist = _new_invlist(size);
10346 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10348 invlist = add_cp_to_invlist(invlist, element0);
10349 offset = *get_invlist_offset_addr(invlist);
10351 invlist_set_len(invlist, size, offset);
10352 *other_elements_ptr = invlist_array(invlist) + 1;
10358 #ifndef PERL_IN_XSUB_RE
10360 Perl__invlist_invert(pTHX_ SV* const invlist)
10362 /* Complement the input inversion list. This adds a 0 if the list didn't
10363 * have a zero; removes it otherwise. As described above, the data
10364 * structure is set up so that this is very efficient */
10366 PERL_ARGS_ASSERT__INVLIST_INVERT;
10368 assert(! invlist_is_iterating(invlist));
10370 /* The inverse of matching nothing is matching everything */
10371 if (_invlist_len(invlist) == 0) {
10372 _append_range_to_invlist(invlist, 0, UV_MAX);
10376 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10380 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10382 /* Return a new inversion list that is a copy of the input one, which is
10383 * unchanged. The new list will not be mortal even if the old one was. */
10385 const STRLEN nominal_length = _invlist_len(invlist);
10386 const STRLEN physical_length = SvCUR(invlist);
10387 const bool offset = *(get_invlist_offset_addr(invlist));
10389 PERL_ARGS_ASSERT_INVLIST_CLONE;
10391 if (new_invlist == NULL) {
10392 new_invlist = _new_invlist(nominal_length);
10395 sv_upgrade(new_invlist, SVt_INVLIST);
10396 initialize_invlist_guts(new_invlist, nominal_length);
10399 *(get_invlist_offset_addr(new_invlist)) = offset;
10400 invlist_set_len(new_invlist, nominal_length, offset);
10401 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10403 return new_invlist;
10408 PERL_STATIC_INLINE UV
10409 S_invlist_lowest(SV* const invlist)
10411 /* Returns the lowest code point that matches an inversion list. This API
10412 * has an ambiguity, as it returns 0 under either the lowest is actually
10413 * 0, or if the list is empty. If this distinction matters to you, check
10414 * for emptiness before calling this function */
10416 UV len = _invlist_len(invlist);
10419 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10425 array = invlist_array(invlist);
10431 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10433 /* Get the contents of an inversion list into a string SV so that they can
10434 * be printed out. If 'traditional_style' is TRUE, it uses the format
10435 * traditionally done for debug tracing; otherwise it uses a format
10436 * suitable for just copying to the output, with blanks between ranges and
10437 * a dash between range components */
10441 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10442 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10444 if (traditional_style) {
10445 output = newSVpvs("\n");
10448 output = newSVpvs("");
10451 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10453 assert(! invlist_is_iterating(invlist));
10455 invlist_iterinit(invlist);
10456 while (invlist_iternext(invlist, &start, &end)) {
10457 if (end == UV_MAX) {
10458 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10459 start, intra_range_delimiter,
10460 inter_range_delimiter);
10462 else if (end != start) {
10463 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10465 intra_range_delimiter,
10466 end, inter_range_delimiter);
10469 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10470 start, inter_range_delimiter);
10474 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10475 SvCUR_set(output, SvCUR(output) - 1);
10481 #ifndef PERL_IN_XSUB_RE
10483 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10484 const char * const indent, SV* const invlist)
10486 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10487 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10488 * the string 'indent'. The output looks like this:
10489 [0] 0x000A .. 0x000D
10491 [4] 0x2028 .. 0x2029
10492 [6] 0x3104 .. INFTY
10493 * This means that the first range of code points matched by the list are
10494 * 0xA through 0xD; the second range contains only the single code point
10495 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10496 * are used to define each range (except if the final range extends to
10497 * infinity, only a single element is needed). The array index of the
10498 * first element for the corresponding range is given in brackets. */
10503 PERL_ARGS_ASSERT__INVLIST_DUMP;
10505 if (invlist_is_iterating(invlist)) {
10506 Perl_dump_indent(aTHX_ level, file,
10507 "%sCan't dump inversion list because is in middle of iterating\n",
10512 invlist_iterinit(invlist);
10513 while (invlist_iternext(invlist, &start, &end)) {
10514 if (end == UV_MAX) {
10515 Perl_dump_indent(aTHX_ level, file,
10516 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10517 indent, (UV)count, start);
10519 else if (end != start) {
10520 Perl_dump_indent(aTHX_ level, file,
10521 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10522 indent, (UV)count, start, end);
10525 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10526 indent, (UV)count, start);
10534 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10536 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10538 /* Return a boolean as to if the two passed in inversion lists are
10539 * identical. The final argument, if TRUE, says to take the complement of
10540 * the second inversion list before doing the comparison */
10542 const UV len_a = _invlist_len(a);
10543 UV len_b = _invlist_len(b);
10545 const UV* array_a = NULL;
10546 const UV* array_b = NULL;
10548 PERL_ARGS_ASSERT__INVLISTEQ;
10550 /* This code avoids accessing the arrays unless it knows the length is
10555 return ! complement_b;
10559 array_a = invlist_array(a);
10563 array_b = invlist_array(b);
10566 /* If are to compare 'a' with the complement of b, set it
10567 * up so are looking at b's complement. */
10568 if (complement_b) {
10570 /* The complement of nothing is everything, so <a> would have to have
10571 * just one element, starting at zero (ending at infinity) */
10573 return (len_a == 1 && array_a[0] == 0);
10575 if (array_b[0] == 0) {
10577 /* Otherwise, to complement, we invert. Here, the first element is
10578 * 0, just remove it. To do this, we just pretend the array starts
10586 /* But if the first element is not zero, we pretend the list starts
10587 * at the 0 that is always stored immediately before the array. */
10593 return len_a == len_b
10594 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10600 * As best we can, determine the characters that can match the start of
10601 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10602 * can be false positive matches
10604 * Returns the invlist as a new SV*; it is the caller's responsibility to
10605 * call SvREFCNT_dec() when done with it.
10608 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10611 const U8 * s = (U8*)STRING(node);
10612 SSize_t bytelen = STR_LEN(node);
10614 /* Start out big enough for 2 separate code points */
10615 SV* invlist = _new_invlist(4);
10617 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10622 /* We punt and assume can match anything if the node begins
10623 * with a multi-character fold. Things are complicated. For
10624 * example, /ffi/i could match any of:
10625 * "\N{LATIN SMALL LIGATURE FFI}"
10626 * "\N{LATIN SMALL LIGATURE FF}I"
10627 * "F\N{LATIN SMALL LIGATURE FI}"
10628 * plus several other things; and making sure we have all the
10629 * possibilities is hard. */
10630 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10631 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10634 /* Any Latin1 range character can potentially match any
10635 * other depending on the locale, and in Turkic locales, U+130 and
10637 if (OP(node) == EXACTFL) {
10638 _invlist_union(invlist, PL_Latin1, &invlist);
10639 invlist = add_cp_to_invlist(invlist,
10640 LATIN_SMALL_LETTER_DOTLESS_I);
10641 invlist = add_cp_to_invlist(invlist,
10642 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10645 /* But otherwise, it matches at least itself. We can
10646 * quickly tell if it has a distinct fold, and if so,
10647 * it matches that as well */
10648 invlist = add_cp_to_invlist(invlist, uc);
10649 if (IS_IN_SOME_FOLD_L1(uc))
10650 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10653 /* Some characters match above-Latin1 ones under /i. This
10654 * is true of EXACTFL ones when the locale is UTF-8 */
10655 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10656 && (! isASCII(uc) || (OP(node) != EXACTFAA
10657 && OP(node) != EXACTFAA_NO_TRIE)))
10659 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10663 else { /* Pattern is UTF-8 */
10664 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10665 const U8* e = s + bytelen;
10668 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10670 /* The only code points that aren't folded in a UTF EXACTFish
10671 * node are are the problematic ones in EXACTFL nodes */
10672 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10673 /* We need to check for the possibility that this EXACTFL
10674 * node begins with a multi-char fold. Therefore we fold
10675 * the first few characters of it so that we can make that
10681 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10683 *(d++) = (U8) toFOLD(*s);
10684 if (fc < 0) { /* Save the first fold */
10691 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10692 if (fc < 0) { /* Save the first fold */
10700 /* And set up so the code below that looks in this folded
10701 * buffer instead of the node's string */
10706 /* When we reach here 's' points to the fold of the first
10707 * character(s) of the node; and 'e' points to far enough along
10708 * the folded string to be just past any possible multi-char
10711 * Unlike the non-UTF-8 case, the macro for determining if a
10712 * string is a multi-char fold requires all the characters to
10713 * already be folded. This is because of all the complications
10714 * if not. Note that they are folded anyway, except in EXACTFL
10715 * nodes. Like the non-UTF case above, we punt if the node
10716 * begins with a multi-char fold */
10718 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10719 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10721 else { /* Single char fold */
10724 const U32 * remaining_folds;
10725 Size_t folds_count;
10727 /* It matches itself */
10728 invlist = add_cp_to_invlist(invlist, fc);
10730 /* ... plus all the things that fold to it, which are found in
10731 * PL_utf8_foldclosures */
10732 folds_count = _inverse_folds(fc, &first_fold,
10734 for (k = 0; k < folds_count; k++) {
10735 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10737 /* /aa doesn't allow folds between ASCII and non- */
10738 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10739 && isASCII(c) != isASCII(fc))
10744 invlist = add_cp_to_invlist(invlist, c);
10747 if (OP(node) == EXACTFL) {
10749 /* If either [iI] are present in an EXACTFL node the above code
10750 * should have added its normal case pair, but under a Turkish
10751 * locale they could match instead the case pairs from it. Add
10752 * those as potential matches as well */
10753 if (isALPHA_FOLD_EQ(fc, 'I')) {
10754 invlist = add_cp_to_invlist(invlist,
10755 LATIN_SMALL_LETTER_DOTLESS_I);
10756 invlist = add_cp_to_invlist(invlist,
10757 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10759 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10760 invlist = add_cp_to_invlist(invlist, 'I');
10762 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10763 invlist = add_cp_to_invlist(invlist, 'i');
10772 #undef HEADER_LENGTH
10773 #undef TO_INTERNAL_SIZE
10774 #undef FROM_INTERNAL_SIZE
10775 #undef INVLIST_VERSION_ID
10777 /* End of inversion list object */
10780 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10782 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10783 * constructs, and updates RExC_flags with them. On input, RExC_parse
10784 * should point to the first flag; it is updated on output to point to the
10785 * final ')' or ':'. There needs to be at least one flag, or this will
10788 /* for (?g), (?gc), and (?o) warnings; warning
10789 about (?c) will warn about (?g) -- japhy */
10791 #define WASTED_O 0x01
10792 #define WASTED_G 0x02
10793 #define WASTED_C 0x04
10794 #define WASTED_GC (WASTED_G|WASTED_C)
10795 I32 wastedflags = 0x00;
10796 U32 posflags = 0, negflags = 0;
10797 U32 *flagsp = &posflags;
10798 char has_charset_modifier = '\0';
10800 bool has_use_defaults = FALSE;
10801 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10802 int x_mod_count = 0;
10804 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10806 /* '^' as an initial flag sets certain defaults */
10807 if (UCHARAT(RExC_parse) == '^') {
10809 has_use_defaults = TRUE;
10810 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10811 cs = (RExC_uni_semantics)
10812 ? REGEX_UNICODE_CHARSET
10813 : REGEX_DEPENDS_CHARSET;
10814 set_regex_charset(&RExC_flags, cs);
10817 cs = get_regex_charset(RExC_flags);
10818 if ( cs == REGEX_DEPENDS_CHARSET
10819 && RExC_uni_semantics)
10821 cs = REGEX_UNICODE_CHARSET;
10825 while (RExC_parse < RExC_end) {
10826 /* && memCHRs("iogcmsx", *RExC_parse) */
10827 /* (?g), (?gc) and (?o) are useless here
10828 and must be globally applied -- japhy */
10829 if ((RExC_pm_flags & PMf_WILDCARD)) {
10830 if (flagsp == & negflags) {
10831 if (*RExC_parse == 'm') {
10833 /* diag_listed_as: Use of %s is not allowed in Unicode
10834 property wildcard subpatterns in regex; marked by <--
10836 vFAIL("Use of modifier '-m' is not allowed in Unicode"
10837 " property wildcard subpatterns");
10841 if (*RExC_parse == 's') {
10842 goto modifier_illegal_in_wildcard;
10847 switch (*RExC_parse) {
10849 /* Code for the imsxn flags */
10850 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10852 case LOCALE_PAT_MOD:
10853 if (has_charset_modifier) {
10854 goto excess_modifier;
10856 else if (flagsp == &negflags) {
10859 cs = REGEX_LOCALE_CHARSET;
10860 has_charset_modifier = LOCALE_PAT_MOD;
10862 case UNICODE_PAT_MOD:
10863 if (has_charset_modifier) {
10864 goto excess_modifier;
10866 else if (flagsp == &negflags) {
10869 cs = REGEX_UNICODE_CHARSET;
10870 has_charset_modifier = UNICODE_PAT_MOD;
10872 case ASCII_RESTRICT_PAT_MOD:
10873 if (flagsp == &negflags) {
10876 if (has_charset_modifier) {
10877 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10878 goto excess_modifier;
10880 /* Doubled modifier implies more restricted */
10881 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10884 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10886 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10888 case DEPENDS_PAT_MOD:
10889 if (has_use_defaults) {
10890 goto fail_modifiers;
10892 else if (flagsp == &negflags) {
10895 else if (has_charset_modifier) {
10896 goto excess_modifier;
10899 /* The dual charset means unicode semantics if the
10900 * pattern (or target, not known until runtime) are
10901 * utf8, or something in the pattern indicates unicode
10903 cs = (RExC_uni_semantics)
10904 ? REGEX_UNICODE_CHARSET
10905 : REGEX_DEPENDS_CHARSET;
10906 has_charset_modifier = DEPENDS_PAT_MOD;
10910 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10911 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10913 else if (has_charset_modifier == *(RExC_parse - 1)) {
10914 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10915 *(RExC_parse - 1));
10918 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10920 NOT_REACHED; /*NOTREACHED*/
10923 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10924 *(RExC_parse - 1));
10925 NOT_REACHED; /*NOTREACHED*/
10926 case GLOBAL_PAT_MOD: /* 'g' */
10927 if (RExC_pm_flags & PMf_WILDCARD) {
10928 goto modifier_illegal_in_wildcard;
10931 case ONCE_PAT_MOD: /* 'o' */
10932 if (ckWARN(WARN_REGEXP)) {
10933 const I32 wflagbit = *RExC_parse == 'o'
10936 if (! (wastedflags & wflagbit) ) {
10937 wastedflags |= wflagbit;
10938 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10941 "Useless (%s%c) - %suse /%c modifier",
10942 flagsp == &negflags ? "?-" : "?",
10944 flagsp == &negflags ? "don't " : "",
10951 case CONTINUE_PAT_MOD: /* 'c' */
10952 if (RExC_pm_flags & PMf_WILDCARD) {
10953 goto modifier_illegal_in_wildcard;
10955 if (ckWARN(WARN_REGEXP)) {
10956 if (! (wastedflags & WASTED_C) ) {
10957 wastedflags |= WASTED_GC;
10958 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10961 "Useless (%sc) - %suse /gc modifier",
10962 flagsp == &negflags ? "?-" : "?",
10963 flagsp == &negflags ? "don't " : ""
10968 case KEEPCOPY_PAT_MOD: /* 'p' */
10969 if (RExC_pm_flags & PMf_WILDCARD) {
10970 goto modifier_illegal_in_wildcard;
10972 if (flagsp == &negflags) {
10973 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10975 *flagsp |= RXf_PMf_KEEPCOPY;
10979 /* A flag is a default iff it is following a minus, so
10980 * if there is a minus, it means will be trying to
10981 * re-specify a default which is an error */
10982 if (has_use_defaults || flagsp == &negflags) {
10983 goto fail_modifiers;
10985 flagsp = &negflags;
10986 wastedflags = 0; /* reset so (?g-c) warns twice */
10992 if ( (RExC_pm_flags & PMf_WILDCARD)
10993 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
10996 /* diag_listed_as: Use of %s is not allowed in Unicode
10997 property wildcard subpatterns in regex; marked by <--
10999 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11000 " property wildcard subpatterns",
11001 has_charset_modifier);
11004 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11005 negflags |= RXf_PMf_EXTENDED_MORE;
11007 RExC_flags |= posflags;
11009 if (negflags & RXf_PMf_EXTENDED) {
11010 negflags |= RXf_PMf_EXTENDED_MORE;
11012 RExC_flags &= ~negflags;
11013 set_regex_charset(&RExC_flags, cs);
11018 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11019 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11020 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11021 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11022 NOT_REACHED; /*NOTREACHED*/
11025 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11028 vFAIL("Sequence (?... not terminated");
11030 modifier_illegal_in_wildcard:
11032 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11033 subpatterns in regex; marked by <-- HERE in m/%s/ */
11034 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11035 " subpatterns", *(RExC_parse - 1));
11039 - reg - regular expression, i.e. main body or parenthesized thing
11041 * Caller must absorb opening parenthesis.
11043 * Combining parenthesis handling with the base level of regular expression
11044 * is a trifle forced, but the need to tie the tails of the branches to what
11045 * follows makes it hard to avoid.
11047 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11049 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11051 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11054 STATIC regnode_offset
11055 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11057 char * parse_start,
11061 regnode_offset ret;
11062 char* name_start = RExC_parse;
11064 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11065 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11067 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11069 if (RExC_parse == name_start || *RExC_parse != ch) {
11070 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11071 vFAIL2("Sequence %.3s... not terminated", parse_start);
11075 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11076 RExC_rxi->data->data[num]=(void*)sv_dat;
11077 SvREFCNT_inc_simple_void_NN(sv_dat);
11080 ret = reganode(pRExC_state,
11083 : (ASCII_FOLD_RESTRICTED)
11085 : (AT_LEAST_UNI_SEMANTICS)
11091 *flagp |= HASWIDTH;
11093 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11094 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11096 nextchar(pRExC_state);
11100 /* On success, returns the offset at which any next node should be placed into
11101 * the regex engine program being compiled.
11103 * Returns 0 otherwise, with *flagp set to indicate why:
11104 * TRYAGAIN at the end of (?) that only sets flags.
11105 * RESTART_PARSE if the parse needs to be restarted, or'd with
11106 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11107 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11109 STATIC regnode_offset
11110 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11111 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11112 * 2 is like 1, but indicates that nextchar() has been called to advance
11113 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11114 * this flag alerts us to the need to check for that */
11116 regnode_offset ret = 0; /* Will be the head of the group. */
11118 regnode_offset lastbr;
11119 regnode_offset ender = 0;
11122 U32 oregflags = RExC_flags;
11123 bool have_branch = 0;
11125 I32 freeze_paren = 0;
11126 I32 after_freeze = 0;
11127 I32 num; /* numeric backreferences */
11128 SV * max_open; /* Max number of unclosed parens */
11130 char * parse_start = RExC_parse; /* MJD */
11131 char * const oregcomp_parse = RExC_parse;
11133 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11135 PERL_ARGS_ASSERT_REG;
11136 DEBUG_PARSE("reg ");
11138 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11140 if (!SvIOK(max_open)) {
11141 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11143 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11145 vFAIL("Too many nested open parens");
11148 *flagp = 0; /* Tentatively. */
11150 if (RExC_in_lookbehind) {
11151 RExC_in_lookbehind++;
11153 if (RExC_in_lookahead) {
11154 RExC_in_lookahead++;
11157 /* Having this true makes it feasible to have a lot fewer tests for the
11158 * parse pointer being in scope. For example, we can write
11159 * while(isFOO(*RExC_parse)) RExC_parse++;
11161 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11163 assert(*RExC_end == '\0');
11165 /* Make an OPEN node, if parenthesized. */
11168 /* Under /x, space and comments can be gobbled up between the '(' and
11169 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11170 * intervening space, as the sequence is a token, and a token should be
11172 bool has_intervening_patws = (paren == 2)
11173 && *(RExC_parse - 1) != '(';
11175 if (RExC_parse >= RExC_end) {
11176 vFAIL("Unmatched (");
11179 if (paren == 'r') { /* Atomic script run */
11183 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11184 char *start_verb = RExC_parse + 1;
11186 char *start_arg = NULL;
11187 unsigned char op = 0;
11188 int arg_required = 0;
11189 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11190 bool has_upper = FALSE;
11192 if (has_intervening_patws) {
11193 RExC_parse++; /* past the '*' */
11195 /* For strict backwards compatibility, don't change the message
11196 * now that we also have lowercase operands */
11197 if (isUPPER(*RExC_parse)) {
11198 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11201 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11204 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11205 if ( *RExC_parse == ':' ) {
11206 start_arg = RExC_parse + 1;
11210 if (isUPPER(*RExC_parse)) {
11216 RExC_parse += UTF8SKIP(RExC_parse);
11219 verb_len = RExC_parse - start_verb;
11221 if (RExC_parse >= RExC_end) {
11222 goto unterminated_verb_pattern;
11225 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11226 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11227 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11229 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11230 unterminated_verb_pattern:
11232 vFAIL("Unterminated verb pattern argument");
11235 vFAIL("Unterminated '(*...' argument");
11239 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11241 vFAIL("Unterminated verb pattern");
11244 vFAIL("Unterminated '(*...' construct");
11249 /* Here, we know that RExC_parse < RExC_end */
11251 switch ( *start_verb ) {
11252 case 'A': /* (*ACCEPT) */
11253 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11255 internal_argval = RExC_nestroot;
11258 case 'C': /* (*COMMIT) */
11259 if ( memEQs(start_verb, verb_len,"COMMIT") )
11262 case 'F': /* (*FAIL) */
11263 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11267 case ':': /* (*:NAME) */
11268 case 'M': /* (*MARK:NAME) */
11269 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11274 case 'P': /* (*PRUNE) */
11275 if ( memEQs(start_verb, verb_len,"PRUNE") )
11278 case 'S': /* (*SKIP) */
11279 if ( memEQs(start_verb, verb_len,"SKIP") )
11282 case 'T': /* (*THEN) */
11283 /* [19:06] <TimToady> :: is then */
11284 if ( memEQs(start_verb, verb_len,"THEN") ) {
11286 RExC_seen |= REG_CUTGROUP_SEEN;
11290 if ( memEQs(start_verb, verb_len, "asr")
11291 || memEQs(start_verb, verb_len, "atomic_script_run"))
11293 paren = 'r'; /* Mnemonic: recursed run */
11296 else if (memEQs(start_verb, verb_len, "atomic")) {
11297 paren = 't'; /* AtOMIC */
11298 goto alpha_assertions;
11302 if ( memEQs(start_verb, verb_len, "plb")
11303 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11306 goto lookbehind_alpha_assertions;
11308 else if ( memEQs(start_verb, verb_len, "pla")
11309 || memEQs(start_verb, verb_len, "positive_lookahead"))
11312 goto alpha_assertions;
11316 if ( memEQs(start_verb, verb_len, "nlb")
11317 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11320 goto lookbehind_alpha_assertions;
11322 else if ( memEQs(start_verb, verb_len, "nla")
11323 || memEQs(start_verb, verb_len, "negative_lookahead"))
11326 goto alpha_assertions;
11330 if ( memEQs(start_verb, verb_len, "sr")
11331 || memEQs(start_verb, verb_len, "script_run"))
11333 regnode_offset atomic;
11339 /* This indicates Unicode rules. */
11340 REQUIRE_UNI_RULES(flagp, 0);
11346 RExC_parse = start_arg;
11348 if (RExC_in_script_run) {
11350 /* Nested script runs are treated as no-ops, because
11351 * if the nested one fails, the outer one must as
11352 * well. It could fail sooner, and avoid (??{} with
11353 * side effects, but that is explicitly documented as
11354 * undefined behavior. */
11358 if (paren == 's') {
11363 /* But, the atomic part of a nested atomic script run
11364 * isn't a no-op, but can be treated just like a '(?>'
11370 if (paren == 's') {
11371 /* Here, we're starting a new regular script run */
11372 ret = reg_node(pRExC_state, SROPEN);
11373 RExC_in_script_run = 1;
11378 /* Here, we are starting an atomic script run. This is
11379 * handled by recursing to deal with the atomic portion
11380 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11382 ret = reg_node(pRExC_state, SROPEN);
11384 RExC_in_script_run = 1;
11386 atomic = reg(pRExC_state, 'r', &flags, depth);
11387 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11388 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11392 if (! REGTAIL(pRExC_state, ret, atomic)) {
11393 REQUIRE_BRANCHJ(flagp, 0);
11396 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11399 REQUIRE_BRANCHJ(flagp, 0);
11402 RExC_in_script_run = 0;
11408 lookbehind_alpha_assertions:
11409 RExC_seen |= REG_LOOKBEHIND_SEEN;
11410 RExC_in_lookbehind++;
11415 RExC_seen_zerolen++;
11421 /* An empty negative lookahead assertion simply is failure */
11422 if (paren == 'A' && RExC_parse == start_arg) {
11423 ret=reganode(pRExC_state, OPFAIL, 0);
11424 nextchar(pRExC_state);
11428 RExC_parse = start_arg;
11433 "'(*%" UTF8f "' requires a terminating ':'",
11434 UTF8fARG(UTF, verb_len, start_verb));
11435 NOT_REACHED; /*NOTREACHED*/
11437 } /* End of switch */
11440 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11442 if (has_upper || verb_len == 0) {
11444 "Unknown verb pattern '%" UTF8f "'",
11445 UTF8fARG(UTF, verb_len, start_verb));
11449 "Unknown '(*...)' construct '%" UTF8f "'",
11450 UTF8fARG(UTF, verb_len, start_verb));
11453 if ( RExC_parse == start_arg ) {
11456 if ( arg_required && !start_arg ) {
11457 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11458 (int) verb_len, start_verb);
11460 if (internal_argval == -1) {
11461 ret = reganode(pRExC_state, op, 0);
11463 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11465 RExC_seen |= REG_VERBARG_SEEN;
11467 SV *sv = newSVpvn( start_arg,
11468 RExC_parse - start_arg);
11469 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11470 STR_WITH_LEN("S"));
11471 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11472 FLAGS(REGNODE_p(ret)) = 1;
11474 FLAGS(REGNODE_p(ret)) = 0;
11476 if ( internal_argval != -1 )
11477 ARG2L_SET(REGNODE_p(ret), internal_argval);
11478 nextchar(pRExC_state);
11481 else if (*RExC_parse == '?') { /* (?...) */
11482 bool is_logical = 0;
11483 const char * const seqstart = RExC_parse;
11484 const char * endptr;
11485 const char non_existent_group_msg[]
11486 = "Reference to nonexistent group";
11487 const char impossible_group[] = "Invalid reference to group";
11489 if (has_intervening_patws) {
11491 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11494 RExC_parse++; /* past the '?' */
11495 paren = *RExC_parse; /* might be a trailing NUL, if not
11497 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11498 if (RExC_parse > RExC_end) {
11501 ret = 0; /* For look-ahead/behind. */
11504 case 'P': /* (?P...) variants for those used to PCRE/Python */
11505 paren = *RExC_parse;
11506 if ( paren == '<') { /* (?P<...>) named capture */
11508 if (RExC_parse >= RExC_end) {
11509 vFAIL("Sequence (?P<... not terminated");
11511 goto named_capture;
11513 else if (paren == '>') { /* (?P>name) named recursion */
11515 if (RExC_parse >= RExC_end) {
11516 vFAIL("Sequence (?P>... not terminated");
11518 goto named_recursion;
11520 else if (paren == '=') { /* (?P=...) named backref */
11522 return handle_named_backref(pRExC_state, flagp,
11525 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11526 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11527 vFAIL3("Sequence (%.*s...) not recognized",
11528 (int) (RExC_parse - seqstart), seqstart);
11529 NOT_REACHED; /*NOTREACHED*/
11530 case '<': /* (?<...) */
11531 /* If you want to support (?<*...), first reconcile with GH #17363 */
11532 if (*RExC_parse == '!')
11534 else if (*RExC_parse != '=')
11541 case '\'': /* (?'...') */
11542 name_start = RExC_parse;
11543 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11544 if ( RExC_parse == name_start
11545 || RExC_parse >= RExC_end
11546 || *RExC_parse != paren)
11548 vFAIL2("Sequence (?%c... not terminated",
11549 paren=='>' ? '<' : (char) paren);
11554 if (!svname) /* shouldn't happen */
11556 "panic: reg_scan_name returned NULL");
11557 if (!RExC_paren_names) {
11558 RExC_paren_names= newHV();
11559 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11561 RExC_paren_name_list= newAV();
11562 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11565 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11567 sv_dat = HeVAL(he_str);
11569 /* croak baby croak */
11571 "panic: paren_name hash element allocation failed");
11572 } else if ( SvPOK(sv_dat) ) {
11573 /* (?|...) can mean we have dupes so scan to check
11574 its already been stored. Maybe a flag indicating
11575 we are inside such a construct would be useful,
11576 but the arrays are likely to be quite small, so
11577 for now we punt -- dmq */
11578 IV count = SvIV(sv_dat);
11579 I32 *pv = (I32*)SvPVX(sv_dat);
11581 for ( i = 0 ; i < count ; i++ ) {
11582 if ( pv[i] == RExC_npar ) {
11588 pv = (I32*)SvGROW(sv_dat,
11589 SvCUR(sv_dat) + sizeof(I32)+1);
11590 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11591 pv[count] = RExC_npar;
11592 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11595 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11596 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11599 SvIV_set(sv_dat, 1);
11602 /* Yes this does cause a memory leak in debugging Perls
11604 if (!av_store(RExC_paren_name_list,
11605 RExC_npar, SvREFCNT_inc_NN(svname)))
11606 SvREFCNT_dec_NN(svname);
11609 /*sv_dump(sv_dat);*/
11611 nextchar(pRExC_state);
11613 goto capturing_parens;
11616 RExC_seen |= REG_LOOKBEHIND_SEEN;
11617 RExC_in_lookbehind++;
11619 if (RExC_parse >= RExC_end) {
11620 vFAIL("Sequence (?... not terminated");
11622 RExC_seen_zerolen++;
11624 case '=': /* (?=...) */
11625 RExC_seen_zerolen++;
11626 RExC_in_lookahead++;
11628 case '!': /* (?!...) */
11629 RExC_seen_zerolen++;
11630 /* check if we're really just a "FAIL" assertion */
11631 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11632 FALSE /* Don't force to /x */ );
11633 if (*RExC_parse == ')') {
11634 ret=reganode(pRExC_state, OPFAIL, 0);
11635 nextchar(pRExC_state);
11639 case '|': /* (?|...) */
11640 /* branch reset, behave like a (?:...) except that
11641 buffers in alternations share the same numbers */
11643 after_freeze = freeze_paren = RExC_npar;
11645 /* XXX This construct currently requires an extra pass.
11646 * Investigation would be required to see if that could be
11648 REQUIRE_PARENS_PASS;
11650 case ':': /* (?:...) */
11651 case '>': /* (?>...) */
11653 case '$': /* (?$...) */
11654 case '@': /* (?@...) */
11655 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11657 case '0' : /* (?0) */
11658 case 'R' : /* (?R) */
11659 if (RExC_parse == RExC_end || *RExC_parse != ')')
11660 FAIL("Sequence (?R) not terminated");
11662 RExC_seen |= REG_RECURSE_SEEN;
11664 /* XXX These constructs currently require an extra pass.
11665 * It probably could be changed */
11666 REQUIRE_PARENS_PASS;
11668 *flagp |= POSTPONED;
11669 goto gen_recurse_regop;
11671 /* named and numeric backreferences */
11672 case '&': /* (?&NAME) */
11673 parse_start = RExC_parse - 1;
11676 SV *sv_dat = reg_scan_name(pRExC_state,
11677 REG_RSN_RETURN_DATA);
11678 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11680 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11681 vFAIL("Sequence (?&... not terminated");
11682 goto gen_recurse_regop;
11685 if (! inRANGE(RExC_parse[0], '1', '9')) {
11687 vFAIL("Illegal pattern");
11689 goto parse_recursion;
11691 case '-': /* (?-1) */
11692 if (! inRANGE(RExC_parse[0], '1', '9')) {
11693 RExC_parse--; /* rewind to let it be handled later */
11697 case '1': case '2': case '3': case '4': /* (?1) */
11698 case '5': case '6': case '7': case '8': case '9':
11699 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11702 bool is_neg = FALSE;
11704 parse_start = RExC_parse - 1; /* MJD */
11705 if (*RExC_parse == '-') {
11710 if (grok_atoUV(RExC_parse, &unum, &endptr)
11714 RExC_parse = (char*)endptr;
11716 else { /* Overflow, or something like that. Position
11717 beyond all digits for the message */
11718 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
11721 vFAIL(impossible_group);
11724 /* -num is always representable on 1 and 2's complement
11729 if (*RExC_parse!=')')
11730 vFAIL("Expecting close bracket");
11733 if (paren == '-' || paren == '+') {
11735 /* Don't overflow */
11736 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11738 vFAIL(impossible_group);
11742 Diagram of capture buffer numbering.
11743 Top line is the normal capture buffer numbers
11744 Bottom line is the negative indexing as from
11748 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11749 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11750 - 5 4 3 2 1 X Y x x
11752 Resolve to absolute group. Recall that RExC_npar is +1 of
11753 the actual parenthesis group number. For lookahead, we
11754 have to compensate for that. Using the above example, when
11755 we get to Y in the parse, num is 2 and RExC_npar is 6. We
11756 want 7 for +2, and 4 for -2.
11758 if ( paren == '+' ) {
11764 if (paren == '-' && num < 1) {
11766 vFAIL(non_existent_group_msg);
11770 if (num >= RExC_npar) {
11772 /* It might be a forward reference; we can't fail until we
11773 * know, by completing the parse to get all the groups, and
11774 * then reparsing */
11775 if (ALL_PARENS_COUNTED) {
11776 if (num >= RExC_total_parens) {
11778 vFAIL(non_existent_group_msg);
11782 REQUIRE_PARENS_PASS;
11786 /* We keep track how many GOSUB items we have produced.
11787 To start off the ARG2L() of the GOSUB holds its "id",
11788 which is used later in conjunction with RExC_recurse
11789 to calculate the offset we need to jump for the GOSUB,
11790 which it will store in the final representation.
11791 We have to defer the actual calculation until much later
11792 as the regop may move.
11794 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11795 RExC_recurse_count++;
11796 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11797 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11798 22, "| |", (int)(depth * 2 + 1), "",
11799 (UV)ARG(REGNODE_p(ret)),
11800 (IV)ARG2L(REGNODE_p(ret))));
11801 RExC_seen |= REG_RECURSE_SEEN;
11803 Set_Node_Length(REGNODE_p(ret),
11804 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11805 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11807 *flagp |= POSTPONED;
11808 assert(*RExC_parse == ')');
11809 nextchar(pRExC_state);
11814 case '?': /* (??...) */
11816 if (*RExC_parse != '{') {
11817 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11818 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11820 "Sequence (%" UTF8f "...) not recognized",
11821 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11822 NOT_REACHED; /*NOTREACHED*/
11824 *flagp |= POSTPONED;
11828 case '{': /* (?{...}) */
11831 struct reg_code_block *cb;
11834 RExC_seen_zerolen++;
11836 if ( !pRExC_state->code_blocks
11837 || pRExC_state->code_index
11838 >= pRExC_state->code_blocks->count
11839 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11840 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11843 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11844 FAIL("panic: Sequence (?{...}): no code block found\n");
11845 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11847 /* this is a pre-compiled code block (?{...}) */
11848 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11849 RExC_parse = RExC_start + cb->end;
11851 if (cb->src_regex) {
11852 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11853 RExC_rxi->data->data[n] =
11854 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11855 RExC_rxi->data->data[n+1] = (void*)o;
11858 n = add_data(pRExC_state,
11859 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11860 RExC_rxi->data->data[n] = (void*)o;
11862 pRExC_state->code_index++;
11863 nextchar(pRExC_state);
11866 regnode_offset eval;
11867 ret = reg_node(pRExC_state, LOGICAL);
11869 eval = reg2Lanode(pRExC_state, EVAL,
11872 /* for later propagation into (??{})
11874 RExC_flags & RXf_PMf_COMPILETIME
11876 FLAGS(REGNODE_p(ret)) = 2;
11877 if (! REGTAIL(pRExC_state, ret, eval)) {
11878 REQUIRE_BRANCHJ(flagp, 0);
11880 /* deal with the length of this later - MJD */
11883 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11884 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11885 Set_Node_Offset(REGNODE_p(ret), parse_start);
11888 case '(': /* (?(?{...})...) and (?(?=...)...) */
11891 const int DEFINE_len = sizeof("DEFINE") - 1;
11892 if ( RExC_parse < RExC_end - 1
11893 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11894 && ( RExC_parse[1] == '='
11895 || RExC_parse[1] == '!'
11896 || RExC_parse[1] == '<'
11897 || RExC_parse[1] == '{'))
11898 || ( RExC_parse[0] == '*' /* (?(*...)) */
11899 && ( memBEGINs(RExC_parse + 1,
11900 (Size_t) (RExC_end - (RExC_parse + 1)),
11902 || memBEGINs(RExC_parse + 1,
11903 (Size_t) (RExC_end - (RExC_parse + 1)),
11905 || memBEGINs(RExC_parse + 1,
11906 (Size_t) (RExC_end - (RExC_parse + 1)),
11908 || memBEGINs(RExC_parse + 1,
11909 (Size_t) (RExC_end - (RExC_parse + 1)),
11911 || memBEGINs(RExC_parse + 1,
11912 (Size_t) (RExC_end - (RExC_parse + 1)),
11913 "positive_lookahead:")
11914 || memBEGINs(RExC_parse + 1,
11915 (Size_t) (RExC_end - (RExC_parse + 1)),
11916 "positive_lookbehind:")
11917 || memBEGINs(RExC_parse + 1,
11918 (Size_t) (RExC_end - (RExC_parse + 1)),
11919 "negative_lookahead:")
11920 || memBEGINs(RExC_parse + 1,
11921 (Size_t) (RExC_end - (RExC_parse + 1)),
11922 "negative_lookbehind:"))))
11923 ) { /* Lookahead or eval. */
11925 regnode_offset tail;
11927 ret = reg_node(pRExC_state, LOGICAL);
11928 FLAGS(REGNODE_p(ret)) = 1;
11930 tail = reg(pRExC_state, 1, &flag, depth+1);
11931 RETURN_FAIL_ON_RESTART(flag, flagp);
11932 if (! REGTAIL(pRExC_state, ret, tail)) {
11933 REQUIRE_BRANCHJ(flagp, 0);
11937 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11938 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11940 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11941 char *name_start= RExC_parse++;
11943 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11944 if ( RExC_parse == name_start
11945 || RExC_parse >= RExC_end
11946 || *RExC_parse != ch)
11948 vFAIL2("Sequence (?(%c... not terminated",
11949 (ch == '>' ? '<' : ch));
11953 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11954 RExC_rxi->data->data[num]=(void*)sv_dat;
11955 SvREFCNT_inc_simple_void_NN(sv_dat);
11957 ret = reganode(pRExC_state, GROUPPN, num);
11958 goto insert_if_check_paren;
11960 else if (memBEGINs(RExC_parse,
11961 (STRLEN) (RExC_end - RExC_parse),
11964 ret = reganode(pRExC_state, DEFINEP, 0);
11965 RExC_parse += DEFINE_len;
11967 goto insert_if_check_paren;
11969 else if (RExC_parse[0] == 'R') {
11971 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11972 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11973 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11976 if (RExC_parse[0] == '0') {
11980 else if (inRANGE(RExC_parse[0], '1', '9')) {
11983 if (grok_atoUV(RExC_parse, &uv, &endptr)
11986 parno = (I32)uv + 1;
11987 RExC_parse = (char*)endptr;
11989 /* else "Switch condition not recognized" below */
11990 } else if (RExC_parse[0] == '&') {
11993 sv_dat = reg_scan_name(pRExC_state,
11994 REG_RSN_RETURN_DATA);
11996 parno = 1 + *((I32 *)SvPVX(sv_dat));
11998 ret = reganode(pRExC_state, INSUBP, parno);
11999 goto insert_if_check_paren;
12001 else if (inRANGE(RExC_parse[0], '1', '9')) {
12006 if (grok_atoUV(RExC_parse, &uv, &endptr)
12010 RExC_parse = (char*)endptr;
12013 vFAIL("panic: grok_atoUV returned FALSE");
12015 ret = reganode(pRExC_state, GROUPP, parno);
12017 insert_if_check_paren:
12018 if (UCHARAT(RExC_parse) != ')') {
12020 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12022 vFAIL("Switch condition not recognized");
12024 nextchar(pRExC_state);
12026 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12029 REQUIRE_BRANCHJ(flagp, 0);
12031 br = regbranch(pRExC_state, &flags, 1, depth+1);
12033 RETURN_FAIL_ON_RESTART(flags,flagp);
12034 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12037 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12040 REQUIRE_BRANCHJ(flagp, 0);
12042 c = UCHARAT(RExC_parse);
12043 nextchar(pRExC_state);
12044 if (flags&HASWIDTH)
12045 *flagp |= HASWIDTH;
12048 vFAIL("(?(DEFINE)....) does not allow branches");
12050 /* Fake one for optimizer. */
12051 lastbr = reganode(pRExC_state, IFTHEN, 0);
12053 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12054 RETURN_FAIL_ON_RESTART(flags, flagp);
12055 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12058 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12059 REQUIRE_BRANCHJ(flagp, 0);
12061 if (flags&HASWIDTH)
12062 *flagp |= HASWIDTH;
12063 c = UCHARAT(RExC_parse);
12064 nextchar(pRExC_state);
12069 if (RExC_parse >= RExC_end)
12070 vFAIL("Switch (?(condition)... not terminated");
12072 vFAIL("Switch (?(condition)... contains too many branches");
12074 ender = reg_node(pRExC_state, TAIL);
12075 if (! REGTAIL(pRExC_state, br, ender)) {
12076 REQUIRE_BRANCHJ(flagp, 0);
12079 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12080 REQUIRE_BRANCHJ(flagp, 0);
12082 if (! REGTAIL(pRExC_state,
12085 NEXTOPER(REGNODE_p(lastbr)))),
12088 REQUIRE_BRANCHJ(flagp, 0);
12092 if (! REGTAIL(pRExC_state, ret, ender)) {
12093 REQUIRE_BRANCHJ(flagp, 0);
12095 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12096 RExC_size++; /* XXX WHY do we need this?!!
12097 For large programs it seems to be required
12098 but I can't figure out why. -- dmq*/
12103 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12105 vFAIL("Unknown switch condition (?(...))");
12107 case '[': /* (?[ ... ]) */
12108 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12110 case 0: /* A NUL */
12111 RExC_parse--; /* for vFAIL to print correctly */
12112 vFAIL("Sequence (? incomplete");
12116 if (RExC_strict) { /* [perl #132851] */
12117 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12120 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12122 default: /* e.g., (?i) */
12123 RExC_parse = (char *) seqstart + 1;
12125 parse_lparen_question_flags(pRExC_state);
12126 if (UCHARAT(RExC_parse) != ':') {
12127 if (RExC_parse < RExC_end)
12128 nextchar(pRExC_state);
12133 nextchar(pRExC_state);
12138 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12142 if (! ALL_PARENS_COUNTED) {
12143 /* If we are in our first pass through (and maybe only pass),
12144 * we need to allocate memory for the capturing parentheses
12148 if (!RExC_parens_buf_size) {
12149 /* first guess at number of parens we might encounter */
12150 RExC_parens_buf_size = 10;
12152 /* setup RExC_open_parens, which holds the address of each
12153 * OPEN tag, and to make things simpler for the 0 index the
12154 * start of the program - this is used later for offsets */
12155 Newxz(RExC_open_parens, RExC_parens_buf_size,
12157 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12159 /* setup RExC_close_parens, which holds the address of each
12160 * CLOSE tag, and to make things simpler for the 0 index
12161 * the end of the program - this is used later for offsets
12163 Newxz(RExC_close_parens, RExC_parens_buf_size,
12165 /* we dont know where end op starts yet, so we dont need to
12166 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12169 else if (RExC_npar > RExC_parens_buf_size) {
12170 I32 old_size = RExC_parens_buf_size;
12172 RExC_parens_buf_size *= 2;
12174 Renew(RExC_open_parens, RExC_parens_buf_size,
12176 Zero(RExC_open_parens + old_size,
12177 RExC_parens_buf_size - old_size, regnode_offset);
12179 Renew(RExC_close_parens, RExC_parens_buf_size,
12181 Zero(RExC_close_parens + old_size,
12182 RExC_parens_buf_size - old_size, regnode_offset);
12186 ret = reganode(pRExC_state, OPEN, parno);
12187 if (!RExC_nestroot)
12188 RExC_nestroot = parno;
12189 if (RExC_open_parens && !RExC_open_parens[parno])
12191 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12192 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12193 22, "| |", (int)(depth * 2 + 1), "",
12195 RExC_open_parens[parno]= ret;
12198 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12199 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12202 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12211 /* Pick up the branches, linking them together. */
12212 parse_start = RExC_parse; /* MJD */
12213 br = regbranch(pRExC_state, &flags, 1, depth+1);
12215 /* branch_len = (paren != 0); */
12218 RETURN_FAIL_ON_RESTART(flags, flagp);
12219 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12221 if (*RExC_parse == '|') {
12222 if (RExC_use_BRANCHJ) {
12223 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12226 reginsert(pRExC_state, BRANCH, br, depth+1);
12227 Set_Node_Length(REGNODE_p(br), paren != 0);
12228 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12232 else if (paren == ':') {
12233 *flagp |= flags&SIMPLE;
12235 if (is_open) { /* Starts with OPEN. */
12236 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12237 REQUIRE_BRANCHJ(flagp, 0);
12240 else if (paren != '?') /* Not Conditional */
12242 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12244 while (*RExC_parse == '|') {
12245 if (RExC_use_BRANCHJ) {
12248 ender = reganode(pRExC_state, LONGJMP, 0);
12250 /* Append to the previous. */
12251 shut_gcc_up = REGTAIL(pRExC_state,
12252 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12254 PERL_UNUSED_VAR(shut_gcc_up);
12256 nextchar(pRExC_state);
12257 if (freeze_paren) {
12258 if (RExC_npar > after_freeze)
12259 after_freeze = RExC_npar;
12260 RExC_npar = freeze_paren;
12262 br = regbranch(pRExC_state, &flags, 0, depth+1);
12265 RETURN_FAIL_ON_RESTART(flags, flagp);
12266 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12268 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12269 REQUIRE_BRANCHJ(flagp, 0);
12272 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12275 if (have_branch || paren != ':') {
12278 /* Make a closing node, and hook it on the end. */
12281 ender = reg_node(pRExC_state, TAIL);
12284 ender = reganode(pRExC_state, CLOSE, parno);
12285 if ( RExC_close_parens ) {
12286 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12287 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12288 22, "| |", (int)(depth * 2 + 1), "",
12289 (IV)parno, ender));
12290 RExC_close_parens[parno]= ender;
12291 if (RExC_nestroot == parno)
12294 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12295 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12298 ender = reg_node(pRExC_state, SRCLOSE);
12299 RExC_in_script_run = 0;
12309 *flagp &= ~HASWIDTH;
12311 case 't': /* aTomic */
12313 ender = reg_node(pRExC_state, SUCCEED);
12316 ender = reg_node(pRExC_state, END);
12317 assert(!RExC_end_op); /* there can only be one! */
12318 RExC_end_op = REGNODE_p(ender);
12319 if (RExC_close_parens) {
12320 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12321 "%*s%*s Setting close paren #0 (END) to %zu\n",
12322 22, "| |", (int)(depth * 2 + 1), "",
12325 RExC_close_parens[0]= ender;
12330 DEBUG_PARSE_MSG("lsbr");
12331 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12332 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12333 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12334 SvPV_nolen_const(RExC_mysv1),
12336 SvPV_nolen_const(RExC_mysv2),
12338 (IV)(ender - lastbr)
12341 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12342 REQUIRE_BRANCHJ(flagp, 0);
12346 char is_nothing= 1;
12348 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12350 /* Hook the tails of the branches to the closing node. */
12351 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12352 const U8 op = PL_regkind[OP(br)];
12353 if (op == BRANCH) {
12354 if (! REGTAIL_STUDY(pRExC_state,
12355 REGNODE_OFFSET(NEXTOPER(br)),
12358 REQUIRE_BRANCHJ(flagp, 0);
12360 if ( OP(NEXTOPER(br)) != NOTHING
12361 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12364 else if (op == BRANCHJ) {
12365 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12366 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12368 PERL_UNUSED_VAR(shut_gcc_up);
12369 /* for now we always disable this optimisation * /
12370 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12371 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12377 regnode * ret_as_regnode = REGNODE_p(ret);
12378 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12379 ? regnext(ret_as_regnode)
12382 DEBUG_PARSE_MSG("NADA");
12383 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12384 NULL, pRExC_state);
12385 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12386 NULL, pRExC_state);
12387 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12388 SvPV_nolen_const(RExC_mysv1),
12389 (IV)REG_NODE_NUM(ret_as_regnode),
12390 SvPV_nolen_const(RExC_mysv2),
12396 if (OP(REGNODE_p(ender)) == TAIL) {
12398 RExC_emit= REGNODE_OFFSET(br) + 1;
12401 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12402 OP(opt)= OPTIMIZED;
12403 NEXT_OFF(br)= REGNODE_p(ender) - br;
12411 /* Even/odd or x=don't care: 010101x10x */
12412 static const char parens[] = "=!aA<,>Bbt";
12413 /* flag below is set to 0 up through 'A'; 1 for larger */
12415 if (paren && (p = strchr(parens, paren))) {
12416 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12417 int flag = (p - parens) > 3;
12419 if (paren == '>' || paren == 't') {
12420 node = SUSPEND, flag = 0;
12423 reginsert(pRExC_state, node, ret, depth+1);
12424 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12425 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12426 FLAGS(REGNODE_p(ret)) = flag;
12427 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12429 REQUIRE_BRANCHJ(flagp, 0);
12434 /* Check for proper termination. */
12436 /* restore original flags, but keep (?p) and, if we've encountered
12437 * something in the parse that changes /d rules into /u, keep the /u */
12438 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12439 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12440 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12442 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12443 RExC_parse = oregcomp_parse;
12444 vFAIL("Unmatched (");
12446 nextchar(pRExC_state);
12448 else if (!paren && RExC_parse < RExC_end) {
12449 if (*RExC_parse == ')') {
12451 vFAIL("Unmatched )");
12454 FAIL("Junk on end of regexp"); /* "Can't happen". */
12455 NOT_REACHED; /* NOTREACHED */
12458 if (RExC_in_lookbehind) {
12459 RExC_in_lookbehind--;
12461 if (RExC_in_lookahead) {
12462 RExC_in_lookahead--;
12464 if (after_freeze > RExC_npar)
12465 RExC_npar = after_freeze;
12470 - regbranch - one alternative of an | operator
12472 * Implements the concatenation operator.
12474 * On success, returns the offset at which any next node should be placed into
12475 * the regex engine program being compiled.
12477 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12478 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12481 STATIC regnode_offset
12482 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12484 regnode_offset ret;
12485 regnode_offset chain = 0;
12486 regnode_offset latest;
12487 I32 flags = 0, c = 0;
12488 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12490 PERL_ARGS_ASSERT_REGBRANCH;
12492 DEBUG_PARSE("brnc");
12497 if (RExC_use_BRANCHJ)
12498 ret = reganode(pRExC_state, BRANCHJ, 0);
12500 ret = reg_node(pRExC_state, BRANCH);
12501 Set_Node_Length(REGNODE_p(ret), 1);
12505 *flagp = WORST; /* Tentatively. */
12507 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12508 FALSE /* Don't force to /x */ );
12509 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12510 flags &= ~TRYAGAIN;
12511 latest = regpiece(pRExC_state, &flags, depth+1);
12513 if (flags & TRYAGAIN)
12515 RETURN_FAIL_ON_RESTART(flags, flagp);
12516 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12520 *flagp |= flags&(HASWIDTH|POSTPONED);
12521 if (chain == 0) /* First piece. */
12522 *flagp |= flags&SPSTART;
12524 /* FIXME adding one for every branch after the first is probably
12525 * excessive now we have TRIE support. (hv) */
12527 if (! REGTAIL(pRExC_state, chain, latest)) {
12528 /* XXX We could just redo this branch, but figuring out what
12529 * bookkeeping needs to be reset is a pain, and it's likely
12530 * that other branches that goto END will also be too large */
12531 REQUIRE_BRANCHJ(flagp, 0);
12537 if (chain == 0) { /* Loop ran zero times. */
12538 chain = reg_node(pRExC_state, NOTHING);
12543 *flagp |= flags&SIMPLE;
12550 - regpiece - something followed by possible quantifier * + ? {n,m}
12552 * Note that the branching code sequences used for ? and the general cases
12553 * of * and + are somewhat optimized: they use the same NOTHING node as
12554 * both the endmarker for their branch list and the body of the last branch.
12555 * It might seem that this node could be dispensed with entirely, but the
12556 * endmarker role is not redundant.
12558 * On success, returns the offset at which any next node should be placed into
12559 * the regex engine program being compiled.
12561 * Returns 0 otherwise, with *flagp set to indicate why:
12562 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12563 * RESTART_PARSE if the parse needs to be restarted, or'd with
12564 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12566 STATIC regnode_offset
12567 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12569 regnode_offset ret;
12573 const char * const origparse = RExC_parse;
12575 I32 max = REG_INFTY;
12576 #ifdef RE_TRACK_PATTERN_OFFSETS
12579 const char *maxpos = NULL;
12582 /* Save the original in case we change the emitted regop to a FAIL. */
12583 const regnode_offset orig_emit = RExC_emit;
12585 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12587 PERL_ARGS_ASSERT_REGPIECE;
12589 DEBUG_PARSE("piec");
12591 ret = regatom(pRExC_state, &flags, depth+1);
12593 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12594 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12599 if (op == '{' && regcurly(RExC_parse)) {
12601 #ifdef RE_TRACK_PATTERN_OFFSETS
12602 parse_start = RExC_parse; /* MJD */
12604 next = RExC_parse + 1;
12605 while (isDIGIT(*next) || *next == ',') {
12606 if (*next == ',') {
12614 if (*next == '}') { /* got one */
12615 const char* endptr;
12619 if (isDIGIT(*RExC_parse)) {
12621 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12622 vFAIL("Invalid quantifier in {,}");
12623 if (uv >= REG_INFTY)
12624 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12629 if (*maxpos == ',')
12632 maxpos = RExC_parse;
12633 if (isDIGIT(*maxpos)) {
12635 if (!grok_atoUV(maxpos, &uv, &endptr))
12636 vFAIL("Invalid quantifier in {,}");
12637 if (uv >= REG_INFTY)
12638 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12641 max = REG_INFTY; /* meaning "infinity" */
12644 nextchar(pRExC_state);
12645 if (max < min) { /* If can't match, warn and optimize to fail
12647 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12648 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12649 NEXT_OFF(REGNODE_p(orig_emit)) =
12650 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12653 else if (min == max && *RExC_parse == '?')
12655 ckWARN2reg(RExC_parse + 1,
12656 "Useless use of greediness modifier '%c'",
12661 if ((flags&SIMPLE)) {
12662 if (min == 0 && max == REG_INFTY) {
12664 /* Going from 0..inf is currently forbidden in wildcard
12665 * subpatterns. The only reason is to make it harder to
12666 * write patterns that take a long long time to halt, and
12667 * because the use of this construct isn't necessary in
12668 * matching Unicode property values */
12669 if (RExC_pm_flags & PMf_WILDCARD) {
12671 /* diag_listed_as: Use of %s is not allowed in Unicode
12672 property wildcard subpatterns in regex; marked by
12673 <-- HERE in m/%s/ */
12674 vFAIL("Use of quantifier '*' is not allowed in"
12675 " Unicode property wildcard subpatterns");
12676 /* Note, don't need to worry about {0,}, as a '}' isn't
12677 * legal at all in wildcards, so wouldn't get this far
12680 reginsert(pRExC_state, STAR, ret, depth+1);
12682 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12685 if (min == 1 && max == REG_INFTY) {
12686 reginsert(pRExC_state, PLUS, ret, depth+1);
12688 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12691 MARK_NAUGHTY_EXP(2, 2);
12692 reginsert(pRExC_state, CURLY, ret, depth+1);
12693 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12694 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12697 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12699 FLAGS(REGNODE_p(w)) = 0;
12700 if (! REGTAIL(pRExC_state, ret, w)) {
12701 REQUIRE_BRANCHJ(flagp, 0);
12703 if (RExC_use_BRANCHJ) {
12704 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12705 reginsert(pRExC_state, NOTHING, ret, depth+1);
12706 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12708 reginsert(pRExC_state, CURLYX, ret, depth+1);
12710 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12711 Set_Node_Length(REGNODE_p(ret),
12712 op == '{' ? (RExC_parse - parse_start) : 1);
12714 if (RExC_use_BRANCHJ)
12715 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12717 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12720 REQUIRE_BRANCHJ(flagp, 0);
12722 RExC_whilem_seen++;
12723 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12725 FLAGS(REGNODE_p(ret)) = 0;
12730 *flagp |= HASWIDTH;
12731 ARG1_SET(REGNODE_p(ret), (U16)min);
12732 ARG2_SET(REGNODE_p(ret), (U16)max);
12733 if (max == REG_INFTY)
12734 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12740 if (!ISMULT1(op)) {
12745 #if 0 /* Now runtime fix should be reliable. */
12747 /* if this is reinstated, don't forget to put this back into perldiag:
12749 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12751 (F) The part of the regexp subject to either the * or + quantifier
12752 could match an empty string. The {#} shows in the regular
12753 expression about where the problem was discovered.
12757 if (!(flags&HASWIDTH) && op != '?')
12758 vFAIL("Regexp *+ operand could be empty");
12761 #ifdef RE_TRACK_PATTERN_OFFSETS
12762 parse_start = RExC_parse;
12764 nextchar(pRExC_state);
12766 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12772 else if (op == '+') {
12776 else if (op == '?') {
12781 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12782 if (origparse[0] == '\\' && origparse[1] == 'K') {
12784 "%" UTF8f " is forbidden - matches null string many times",
12785 UTF8fARG(UTF, (RExC_parse >= origparse
12786 ? RExC_parse - origparse
12791 ckWARN2reg(RExC_parse,
12792 "%" UTF8f " matches null string many times",
12793 UTF8fARG(UTF, (RExC_parse >= origparse
12794 ? RExC_parse - origparse
12800 if (*RExC_parse == '?') {
12801 nextchar(pRExC_state);
12802 reginsert(pRExC_state, MINMOD, ret, depth+1);
12803 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12804 REQUIRE_BRANCHJ(flagp, 0);
12807 else if (*RExC_parse == '+') {
12808 regnode_offset ender;
12809 nextchar(pRExC_state);
12810 ender = reg_node(pRExC_state, SUCCEED);
12811 if (! REGTAIL(pRExC_state, ret, ender)) {
12812 REQUIRE_BRANCHJ(flagp, 0);
12814 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12815 ender = reg_node(pRExC_state, TAIL);
12816 if (! REGTAIL(pRExC_state, ret, ender)) {
12817 REQUIRE_BRANCHJ(flagp, 0);
12821 if (ISMULT2(RExC_parse)) {
12823 vFAIL("Nested quantifiers");
12830 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12831 regnode_offset * node_p,
12839 /* This routine teases apart the various meanings of \N and returns
12840 * accordingly. The input parameters constrain which meaning(s) is/are valid
12841 * in the current context.
12843 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12845 * If <code_point_p> is not NULL, the context is expecting the result to be a
12846 * single code point. If this \N instance turns out to a single code point,
12847 * the function returns TRUE and sets *code_point_p to that code point.
12849 * If <node_p> is not NULL, the context is expecting the result to be one of
12850 * the things representable by a regnode. If this \N instance turns out to be
12851 * one such, the function generates the regnode, returns TRUE and sets *node_p
12852 * to point to the offset of that regnode into the regex engine program being
12855 * If this instance of \N isn't legal in any context, this function will
12856 * generate a fatal error and not return.
12858 * On input, RExC_parse should point to the first char following the \N at the
12859 * time of the call. On successful return, RExC_parse will have been updated
12860 * to point to just after the sequence identified by this routine. Also
12861 * *flagp has been updated as needed.
12863 * When there is some problem with the current context and this \N instance,
12864 * the function returns FALSE, without advancing RExC_parse, nor setting
12865 * *node_p, nor *code_point_p, nor *flagp.
12867 * If <cp_count> is not NULL, the caller wants to know the length (in code
12868 * points) that this \N sequence matches. This is set, and the input is
12869 * parsed for errors, even if the function returns FALSE, as detailed below.
12871 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12873 * Probably the most common case is for the \N to specify a single code point.
12874 * *cp_count will be set to 1, and *code_point_p will be set to that code
12877 * Another possibility is for the input to be an empty \N{}. This is no
12878 * longer accepted, and will generate a fatal error.
12880 * Another possibility is for a custom charnames handler to be in effect which
12881 * translates the input name to an empty string. *cp_count will be set to 0.
12882 * *node_p will be set to a generated NOTHING node.
12884 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12885 * set to 0. *node_p will be set to a generated REG_ANY node.
12887 * The fifth possibility is that \N resolves to a sequence of more than one
12888 * code points. *cp_count will be set to the number of code points in the
12889 * sequence. *node_p will be set to a generated node returned by this
12890 * function calling S_reg().
12892 * The final possibility is that it is premature to be calling this function;
12893 * the parse needs to be restarted. This can happen when this changes from
12894 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12895 * latter occurs only when the fifth possibility would otherwise be in
12896 * effect, and is because one of those code points requires the pattern to be
12897 * recompiled as UTF-8. The function returns FALSE, and sets the
12898 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12899 * happens, the caller needs to desist from continuing parsing, and return
12900 * this information to its caller. This is not set for when there is only one
12901 * code point, as this can be called as part of an ANYOF node, and they can
12902 * store above-Latin1 code points without the pattern having to be in UTF-8.
12904 * For non-single-quoted regexes, the tokenizer has resolved character and
12905 * sequence names inside \N{...} into their Unicode values, normalizing the
12906 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12907 * hex-represented code points in the sequence. This is done there because
12908 * the names can vary based on what charnames pragma is in scope at the time,
12909 * so we need a way to take a snapshot of what they resolve to at the time of
12910 * the original parse. [perl #56444].
12912 * That parsing is skipped for single-quoted regexes, so here we may get
12913 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12914 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12915 * the native character set for non-ASCII platforms. The other possibilities
12916 * are already native, so no translation is done. */
12918 char * endbrace; /* points to '}' following the name */
12919 char* p = RExC_parse; /* Temporary */
12921 SV * substitute_parse = NULL;
12926 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12928 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12930 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12931 assert(! (node_p && cp_count)); /* At most 1 should be set */
12933 if (cp_count) { /* Initialize return for the most common case */
12937 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12938 * modifier. The other meanings do not, so use a temporary until we find
12939 * out which we are being called with */
12940 skip_to_be_ignored_text(pRExC_state, &p,
12941 FALSE /* Don't force to /x */ );
12943 /* Disambiguate between \N meaning a named character versus \N meaning
12944 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12945 * quantifier, or if there is no '{' at all */
12946 if (*p != '{' || regcurly(p)) {
12956 *node_p = reg_node(pRExC_state, REG_ANY);
12957 *flagp |= HASWIDTH|SIMPLE;
12959 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12963 /* The test above made sure that the next real character is a '{', but
12964 * under the /x modifier, it could be separated by space (or a comment and
12965 * \n) and this is not allowed (for consistency with \x{...} and the
12966 * tokenizer handling of \N{NAME}). */
12967 if (*RExC_parse != '{') {
12968 vFAIL("Missing braces on \\N{}");
12971 RExC_parse++; /* Skip past the '{' */
12973 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12974 if (! endbrace) { /* no trailing brace */
12975 vFAIL2("Missing right brace on \\%c{}", 'N');
12978 /* Here, we have decided it should be a named character or sequence. These
12979 * imply Unicode semantics */
12980 REQUIRE_UNI_RULES(flagp, FALSE);
12982 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12983 * nothing at all (not allowed under strict) */
12984 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12985 RExC_parse = endbrace;
12987 RExC_parse++; /* Position after the "}" */
12988 vFAIL("Zero length \\N{}");
12994 nextchar(pRExC_state);
12999 *node_p = reg_node(pRExC_state, NOTHING);
13003 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13005 /* Here, the name isn't of the form U+.... This can happen if the
13006 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13007 * is the time to find out what the name means */
13009 const STRLEN name_len = endbrace - RExC_parse;
13010 SV * value_sv; /* What does this name evaluate to */
13012 const U8 * value; /* string of name's value */
13013 STRLEN value_len; /* and its length */
13015 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13016 * toke.c, and their values. Make sure is initialized */
13017 if (! RExC_unlexed_names) {
13018 RExC_unlexed_names = newHV();
13021 /* If we have already seen this name in this pattern, use that. This
13022 * allows us to only call the charnames handler once per name per
13023 * pattern. A broken or malicious handler could return something
13024 * different each time, which could cause the results to vary depending
13025 * on if something gets added or subtracted from the pattern that
13026 * causes the number of passes to change, for example */
13027 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13030 value_sv = *value_svp;
13032 else { /* Otherwise we have to go out and get the name */
13033 const char * error_msg = NULL;
13034 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13038 RExC_parse = endbrace;
13042 /* If no error message, should have gotten a valid return */
13045 /* Save the name's meaning for later use */
13046 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13049 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13053 /* Here, we have the value the name evaluates to in 'value_sv' */
13054 value = (U8 *) SvPV(value_sv, value_len);
13056 /* See if the result is one code point vs 0 or multiple */
13057 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13061 /* Here, exactly one code point. If that isn't what is wanted,
13063 if (! code_point_p) {
13068 /* Convert from string to numeric code point */
13069 *code_point_p = (SvUTF8(value_sv))
13070 ? valid_utf8_to_uvchr(value, NULL)
13073 /* Have parsed this entire single code point \N{...}. *cp_count
13074 * has already been set to 1, so don't do it again. */
13075 RExC_parse = endbrace;
13076 nextchar(pRExC_state);
13078 } /* End of is a single code point */
13080 /* Count the code points, if caller desires. The API says to do this
13081 * even if we will later return FALSE */
13085 *cp_count = (SvUTF8(value_sv))
13086 ? utf8_length(value, value + value_len)
13090 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13091 * But don't back the pointer up if the caller wants to know how many
13092 * code points there are (they need to handle it themselves in this
13101 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13102 * reg recursively to parse it. That way, it retains its atomicness,
13103 * while not having to worry about any special handling that some code
13104 * points may have. */
13106 substitute_parse = newSVpvs("?:");
13107 sv_catsv(substitute_parse, value_sv);
13108 sv_catpv(substitute_parse, ")");
13110 /* The value should already be native, so no need to convert on EBCDIC
13112 assert(! RExC_recode_x_to_native);
13115 else { /* \N{U+...} */
13116 Size_t count = 0; /* code point count kept internally */
13118 /* We can get to here when the input is \N{U+...} or when toke.c has
13119 * converted a name to the \N{U+...} form. This include changing a
13120 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13122 RExC_parse += 2; /* Skip past the 'U+' */
13124 /* Code points are separated by dots. The '}' terminates the whole
13127 do { /* Loop until the ending brace */
13128 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13129 | PERL_SCAN_SILENT_ILLDIGIT
13130 | PERL_SCAN_NOTIFY_ILLDIGIT
13131 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13132 | PERL_SCAN_DISALLOW_PREFIX;
13133 STRLEN len = endbrace - RExC_parse;
13135 char * start_digit = RExC_parse;
13136 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13141 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13146 if (cp > MAX_LEGAL_CP) {
13147 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13150 if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13155 /* Here, is a single code point; fail if doesn't want that */
13156 if (! code_point_p) {
13161 /* A single code point is easy to handle; just return it */
13162 *code_point_p = UNI_TO_NATIVE(cp);
13163 RExC_parse = endbrace;
13164 nextchar(pRExC_state);
13168 /* Here, the parse stopped bfore the ending brace. This is legal
13169 * only if that character is a dot separating code points, like a
13170 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13171 * So the next character must be a dot (and the one after that
13172 * can't be the endbrace, or we'd have something like \N{U+100.} )
13174 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13175 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13176 ? UTF8SKIP(RExC_parse)
13178 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13183 /* Here, looks like its really a multiple character sequence. Fail
13184 * if that's not what the caller wants. But continue with counting
13185 * and error checking if they still want a count */
13186 if (! node_p && ! cp_count) {
13190 /* What is done here is to convert this to a sub-pattern of the
13191 * form \x{char1}\x{char2}... and then call reg recursively to
13192 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13193 * atomicness, while not having to worry about special handling
13194 * that some code points may have. We don't create a subpattern,
13195 * but go through the motions of code point counting and error
13196 * checking, if the caller doesn't want a node returned. */
13198 if (node_p && ! substitute_parse) {
13199 substitute_parse = newSVpvs("?:");
13205 /* Convert to notation the rest of the code understands */
13206 sv_catpvs(substitute_parse, "\\x{");
13207 sv_catpvn(substitute_parse, start_digit,
13208 RExC_parse - start_digit);
13209 sv_catpvs(substitute_parse, "}");
13212 /* Move to after the dot (or ending brace the final time through.)
13217 } while (RExC_parse < endbrace);
13219 if (! node_p) { /* Doesn't want the node */
13226 sv_catpvs(substitute_parse, ")");
13228 /* The values are Unicode, and therefore have to be converted to native
13229 * on a non-Unicode (meaning non-ASCII) platform. */
13230 SET_recode_x_to_native(1);
13233 /* Here, we have the string the name evaluates to, ready to be parsed,
13234 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13235 * constructs. This can be called from within a substitute parse already.
13236 * The error reporting mechanism doesn't work for 2 levels of this, but the
13237 * code above has validated this new construct, so there should be no
13238 * errors generated by the below. And this isn' an exact copy, so the
13239 * mechanism to seamlessly deal with this won't work, so turn off warnings
13241 save_start = RExC_start;
13242 orig_end = RExC_end;
13244 RExC_parse = RExC_start = SvPVX(substitute_parse);
13245 RExC_end = RExC_parse + SvCUR(substitute_parse);
13246 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13248 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13250 /* Restore the saved values */
13252 RExC_start = save_start;
13253 RExC_parse = endbrace;
13254 RExC_end = orig_end;
13255 SET_recode_x_to_native(0);
13257 SvREFCNT_dec_NN(substitute_parse);
13260 RETURN_FAIL_ON_RESTART(flags, flagp);
13261 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13264 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13266 nextchar(pRExC_state);
13273 S_compute_EXACTish(RExC_state_t *pRExC_state)
13277 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13285 op = get_regex_charset(RExC_flags);
13286 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13287 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13288 been, so there is no hole */
13291 return op + EXACTF;
13295 S_new_regcurly(const char *s, const char *e)
13297 /* This is a temporary function designed to match the most lenient form of
13298 * a {m,n} quantifier we ever envision, with either number omitted, and
13299 * spaces anywhere between/before/after them.
13301 * If this function fails, then the string it matches is very unlikely to
13302 * ever be considered a valid quantifier, so we can allow the '{' that
13303 * begins it to be considered as a literal */
13305 bool has_min = FALSE;
13306 bool has_max = FALSE;
13308 PERL_ARGS_ASSERT_NEW_REGCURLY;
13310 if (s >= e || *s++ != '{')
13313 while (s < e && isSPACE(*s)) {
13316 while (s < e && isDIGIT(*s)) {
13320 while (s < e && isSPACE(*s)) {
13326 while (s < e && isSPACE(*s)) {
13329 while (s < e && isDIGIT(*s)) {
13333 while (s < e && isSPACE(*s)) {
13338 return s < e && *s == '}' && (has_min || has_max);
13341 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13342 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13345 S_backref_value(char *p, char *e)
13347 const char* endptr = e;
13349 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13356 - regatom - the lowest level
13358 Try to identify anything special at the start of the current parse position.
13359 If there is, then handle it as required. This may involve generating a
13360 single regop, such as for an assertion; or it may involve recursing, such as
13361 to handle a () structure.
13363 If the string doesn't start with something special then we gobble up
13364 as much literal text as we can. If we encounter a quantifier, we have to
13365 back off the final literal character, as that quantifier applies to just it
13366 and not to the whole string of literals.
13368 Once we have been able to handle whatever type of thing started the
13369 sequence, we return the offset into the regex engine program being compiled
13370 at which any next regnode should be placed.
13372 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13373 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13374 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13375 Otherwise does not return 0.
13377 Note: we have to be careful with escapes, as they can be both literal
13378 and special, and in the case of \10 and friends, context determines which.
13380 A summary of the code structure is:
13382 switch (first_byte) {
13383 cases for each special:
13384 handle this special;
13387 switch (2nd byte) {
13388 cases for each unambiguous special:
13389 handle this special;
13391 cases for each ambigous special/literal:
13393 if (special) handle here
13395 default: // unambiguously literal:
13398 default: // is a literal char
13401 create EXACTish node for literal;
13402 while (more input and node isn't full) {
13403 switch (input_byte) {
13404 cases for each special;
13405 make sure parse pointer is set so that the next call to
13406 regatom will see this special first
13407 goto loopdone; // EXACTish node terminated by prev. char
13409 append char to EXACTISH node;
13411 get next input byte;
13415 return the generated node;
13417 Specifically there are two separate switches for handling
13418 escape sequences, with the one for handling literal escapes requiring
13419 a dummy entry for all of the special escapes that are actually handled
13424 STATIC regnode_offset
13425 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13428 regnode_offset ret = 0;
13434 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13436 *flagp = WORST; /* Tentatively. */
13438 DEBUG_PARSE("atom");
13440 PERL_ARGS_ASSERT_REGATOM;
13443 parse_start = RExC_parse;
13444 assert(RExC_parse < RExC_end);
13445 switch ((U8)*RExC_parse) {
13447 RExC_seen_zerolen++;
13448 nextchar(pRExC_state);
13449 if (RExC_flags & RXf_PMf_MULTILINE)
13450 ret = reg_node(pRExC_state, MBOL);
13452 ret = reg_node(pRExC_state, SBOL);
13453 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13456 nextchar(pRExC_state);
13458 RExC_seen_zerolen++;
13459 if (RExC_flags & RXf_PMf_MULTILINE)
13460 ret = reg_node(pRExC_state, MEOL);
13462 ret = reg_node(pRExC_state, SEOL);
13463 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13466 nextchar(pRExC_state);
13467 if (RExC_flags & RXf_PMf_SINGLELINE)
13468 ret = reg_node(pRExC_state, SANY);
13470 ret = reg_node(pRExC_state, REG_ANY);
13471 *flagp |= HASWIDTH|SIMPLE;
13473 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13477 char * const oregcomp_parse = ++RExC_parse;
13478 ret = regclass(pRExC_state, flagp, depth+1,
13479 FALSE, /* means parse the whole char class */
13480 TRUE, /* allow multi-char folds */
13481 FALSE, /* don't silence non-portable warnings. */
13482 (bool) RExC_strict,
13483 TRUE, /* Allow an optimized regnode result */
13486 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13487 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13490 if (*RExC_parse != ']') {
13491 RExC_parse = oregcomp_parse;
13492 vFAIL("Unmatched [");
13494 nextchar(pRExC_state);
13495 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13499 nextchar(pRExC_state);
13500 ret = reg(pRExC_state, 2, &flags, depth+1);
13502 if (flags & TRYAGAIN) {
13503 if (RExC_parse >= RExC_end) {
13504 /* Make parent create an empty node if needed. */
13505 *flagp |= TRYAGAIN;
13510 RETURN_FAIL_ON_RESTART(flags, flagp);
13511 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13514 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13518 if (flags & TRYAGAIN) {
13519 *flagp |= TRYAGAIN;
13522 vFAIL("Internal urp");
13523 /* Supposed to be caught earlier. */
13529 vFAIL("Quantifier follows nothing");
13534 This switch handles escape sequences that resolve to some kind
13535 of special regop and not to literal text. Escape sequences that
13536 resolve to literal text are handled below in the switch marked
13539 Every entry in this switch *must* have a corresponding entry
13540 in the literal escape switch. However, the opposite is not
13541 required, as the default for this switch is to jump to the
13542 literal text handling code.
13545 switch ((U8)*RExC_parse) {
13546 /* Special Escapes */
13548 RExC_seen_zerolen++;
13549 /* Under wildcards, this is changed to match \n; should be
13550 * invisible to the user, as they have to compile under /m */
13551 if (RExC_pm_flags & PMf_WILDCARD) {
13552 ret = reg_node(pRExC_state, MBOL);
13555 ret = reg_node(pRExC_state, SBOL);
13556 /* SBOL is shared with /^/ so we set the flags so we can tell
13557 * /\A/ from /^/ in split. */
13558 FLAGS(REGNODE_p(ret)) = 1;
13561 goto finish_meta_pat;
13563 if (RExC_pm_flags & PMf_WILDCARD) {
13565 /* diag_listed_as: Use of %s is not allowed in Unicode property
13566 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13568 vFAIL("Use of '\\G' is not allowed in Unicode property"
13569 " wildcard subpatterns");
13571 ret = reg_node(pRExC_state, GPOS);
13572 RExC_seen |= REG_GPOS_SEEN;
13574 goto finish_meta_pat;
13576 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13577 RExC_seen_zerolen++;
13578 ret = reg_node(pRExC_state, KEEPS);
13580 /* XXX:dmq : disabling in-place substitution seems to
13581 * be necessary here to avoid cases of memory corruption, as
13582 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13584 RExC_seen |= REG_LOOKBEHIND_SEEN;
13585 goto finish_meta_pat;
13588 ++RExC_parse; /* advance past the 'K' */
13589 vFAIL("\\K not permitted in lookahead/lookbehind");
13592 if (RExC_pm_flags & PMf_WILDCARD) {
13593 /* See comment under \A above */
13594 ret = reg_node(pRExC_state, MEOL);
13597 ret = reg_node(pRExC_state, SEOL);
13600 RExC_seen_zerolen++; /* Do not optimize RE away */
13601 goto finish_meta_pat;
13603 if (RExC_pm_flags & PMf_WILDCARD) {
13604 /* See comment under \A above */
13605 ret = reg_node(pRExC_state, MEOL);
13608 ret = reg_node(pRExC_state, EOS);
13611 RExC_seen_zerolen++; /* Do not optimize RE away */
13612 goto finish_meta_pat;
13614 vFAIL("\\C no longer supported");
13616 ret = reg_node(pRExC_state, CLUMP);
13617 *flagp |= HASWIDTH;
13618 goto finish_meta_pat;
13626 regex_charset charset = get_regex_charset(RExC_flags);
13628 RExC_seen_zerolen++;
13629 RExC_seen |= REG_LOOKBEHIND_SEEN;
13630 op = BOUND + charset;
13632 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13633 flags = TRADITIONAL_BOUND;
13634 if (op > BOUNDA) { /* /aa is same as /a */
13640 char name = *RExC_parse;
13641 char * endbrace = NULL;
13643 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13646 vFAIL2("Missing right brace on \\%c{}", name);
13648 /* XXX Need to decide whether to take spaces or not. Should be
13649 * consistent with \p{}, but that currently is SPACE, which
13650 * means vertical too, which seems wrong
13651 * while (isBLANK(*RExC_parse)) {
13654 if (endbrace == RExC_parse) {
13655 RExC_parse++; /* After the '}' */
13656 vFAIL2("Empty \\%c{}", name);
13658 length = endbrace - RExC_parse;
13659 /*while (isBLANK(*(RExC_parse + length - 1))) {
13662 switch (*RExC_parse) {
13665 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13667 goto bad_bound_type;
13672 if (length != 2 || *(RExC_parse + 1) != 'b') {
13673 goto bad_bound_type;
13678 if (length != 2 || *(RExC_parse + 1) != 'b') {
13679 goto bad_bound_type;
13684 if (length != 2 || *(RExC_parse + 1) != 'b') {
13685 goto bad_bound_type;
13691 RExC_parse = endbrace;
13693 "'%" UTF8f "' is an unknown bound type",
13694 UTF8fARG(UTF, length, endbrace - length));
13695 NOT_REACHED; /*NOTREACHED*/
13697 RExC_parse = endbrace;
13698 REQUIRE_UNI_RULES(flagp, 0);
13703 else if (op >= BOUNDA) { /* /aa is same as /a */
13707 /* Don't have to worry about UTF-8, in this message because
13708 * to get here the contents of the \b must be ASCII */
13709 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13710 "Using /u for '%.*s' instead of /%s",
13712 endbrace - length + 1,
13713 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13714 ? ASCII_RESTRICT_PAT_MODS
13715 : ASCII_MORE_RESTRICT_PAT_MODS);
13720 RExC_seen_d_op = TRUE;
13722 else if (op == BOUNDL) {
13723 RExC_contains_locale = 1;
13727 op += NBOUND - BOUND;
13730 ret = reg_node(pRExC_state, op);
13731 FLAGS(REGNODE_p(ret)) = flags;
13735 goto finish_meta_pat;
13739 ret = reg_node(pRExC_state, LNBREAK);
13740 *flagp |= HASWIDTH|SIMPLE;
13741 goto finish_meta_pat;
13755 /* These all have the same meaning inside [brackets], and it knows
13756 * how to do the best optimizations for them. So, pretend we found
13757 * these within brackets, and let it do the work */
13760 ret = regclass(pRExC_state, flagp, depth+1,
13761 TRUE, /* means just parse this element */
13762 FALSE, /* don't allow multi-char folds */
13763 FALSE, /* don't silence non-portable warnings. It
13764 would be a bug if these returned
13766 (bool) RExC_strict,
13767 TRUE, /* Allow an optimized regnode result */
13769 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13770 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13771 * multi-char folds are allowed. */
13773 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13776 RExC_parse--; /* regclass() leaves this one too far ahead */
13779 /* The escapes above that don't take a parameter can't be
13780 * followed by a '{'. But 'pX', 'p{foo}' and
13781 * correspondingly 'P' can be */
13782 if ( RExC_parse - parse_start == 1
13783 && UCHARAT(RExC_parse + 1) == '{'
13784 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13787 vFAIL("Unescaped left brace in regex is illegal here");
13789 Set_Node_Offset(REGNODE_p(ret), parse_start);
13790 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13791 nextchar(pRExC_state);
13794 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13795 * \N{...} evaluates to a sequence of more than one code points).
13796 * The function call below returns a regnode, which is our result.
13797 * The parameters cause it to fail if the \N{} evaluates to a
13798 * single code point; we handle those like any other literal. The
13799 * reason that the multicharacter case is handled here and not as
13800 * part of the EXACtish code is because of quantifiers. In
13801 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13802 * this way makes that Just Happen. dmq.
13803 * join_exact() will join this up with adjacent EXACTish nodes
13804 * later on, if appropriate. */
13806 if (grok_bslash_N(pRExC_state,
13807 &ret, /* Want a regnode returned */
13808 NULL, /* Fail if evaluates to a single code
13810 NULL, /* Don't need a count of how many code
13819 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13821 /* Here, evaluates to a single code point. Go get that */
13822 RExC_parse = parse_start;
13825 case 'k': /* Handle \k<NAME> and \k'NAME' */
13829 if ( RExC_parse >= RExC_end - 1
13830 || (( ch = RExC_parse[1]) != '<'
13835 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13836 vFAIL2("Sequence %.2s... not terminated", parse_start);
13839 ret = handle_named_backref(pRExC_state,
13851 case '1': case '2': case '3': case '4':
13852 case '5': case '6': case '7': case '8': case '9':
13857 if (*RExC_parse == 'g') {
13861 if (*RExC_parse == '{') {
13865 if (*RExC_parse == '-') {
13869 if (hasbrace && !isDIGIT(*RExC_parse)) {
13870 if (isrel) RExC_parse--;
13872 goto parse_named_seq;
13875 if (RExC_parse >= RExC_end) {
13876 goto unterminated_g;
13878 num = S_backref_value(RExC_parse, RExC_end);
13880 vFAIL("Reference to invalid group 0");
13881 else if (num == I32_MAX) {
13882 if (isDIGIT(*RExC_parse))
13883 vFAIL("Reference to nonexistent group");
13886 vFAIL("Unterminated \\g... pattern");
13890 num = RExC_npar - num;
13892 vFAIL("Reference to nonexistent or unclosed group");
13896 num = S_backref_value(RExC_parse, RExC_end);
13897 /* bare \NNN might be backref or octal - if it is larger
13898 * than or equal RExC_npar then it is assumed to be an
13899 * octal escape. Note RExC_npar is +1 from the actual
13900 * number of parens. */
13901 /* Note we do NOT check if num == I32_MAX here, as that is
13902 * handled by the RExC_npar check */
13905 /* any numeric escape < 10 is always a backref */
13907 /* any numeric escape < RExC_npar is a backref */
13908 && num >= RExC_npar
13909 /* cannot be an octal escape if it starts with 8 */
13910 && *RExC_parse != '8'
13911 /* cannot be an octal escape if it starts with 9 */
13912 && *RExC_parse != '9'
13914 /* Probably not meant to be a backref, instead likely
13915 * to be an octal character escape, e.g. \35 or \777.
13916 * The above logic should make it obvious why using
13917 * octal escapes in patterns is problematic. - Yves */
13918 RExC_parse = parse_start;
13923 /* At this point RExC_parse points at a numeric escape like
13924 * \12 or \88 or something similar, which we should NOT treat
13925 * as an octal escape. It may or may not be a valid backref
13926 * escape. For instance \88888888 is unlikely to be a valid
13928 while (isDIGIT(*RExC_parse))
13931 if (*RExC_parse != '}')
13932 vFAIL("Unterminated \\g{...} pattern");
13935 if (num >= (I32)RExC_npar) {
13937 /* It might be a forward reference; we can't fail until we
13938 * know, by completing the parse to get all the groups, and
13939 * then reparsing */
13940 if (ALL_PARENS_COUNTED) {
13941 if (num >= RExC_total_parens) {
13942 vFAIL("Reference to nonexistent group");
13946 REQUIRE_PARENS_PASS;
13950 ret = reganode(pRExC_state,
13953 : (ASCII_FOLD_RESTRICTED)
13955 : (AT_LEAST_UNI_SEMANTICS)
13961 if (OP(REGNODE_p(ret)) == REFF) {
13962 RExC_seen_d_op = TRUE;
13964 *flagp |= HASWIDTH;
13966 /* override incorrect value set in reganode MJD */
13967 Set_Node_Offset(REGNODE_p(ret), parse_start);
13968 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13969 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13970 FALSE /* Don't force to /x */ );
13974 if (RExC_parse >= RExC_end)
13975 FAIL("Trailing \\");
13978 /* Do not generate "unrecognized" warnings here, we fall
13979 back into the quick-grab loop below */
13980 RExC_parse = parse_start;
13982 } /* end of switch on a \foo sequence */
13987 /* '#' comments should have been spaced over before this function was
13989 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13991 if (RExC_flags & RXf_PMf_EXTENDED) {
13992 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13993 if (RExC_parse < RExC_end)
14003 /* Here, we have determined that the next thing is probably a
14004 * literal character. RExC_parse points to the first byte of its
14005 * definition. (It still may be an escape sequence that evaluates
14006 * to a single character) */
14011 char *s, *old_s = NULL, *old_old_s = NULL;
14013 U32 max_string_len = 255;
14015 /* We may have to reparse the node, artificially stopping filling
14016 * it early, based on info gleaned in the first parse. This
14017 * variable gives where we stop. Make it above the normal stopping
14018 * place first time through; otherwise it would stop too early */
14019 U32 upper_fill = max_string_len + 1;
14021 /* We start out as an EXACT node, even if under /i, until we find a
14022 * character which is in a fold. The algorithm now segregates into
14023 * separate nodes, characters that fold from those that don't under
14024 * /i. (This hopefully will create nodes that are fixed strings
14025 * even under /i, giving the optimizer something to grab on to.)
14026 * So, if a node has something in it and the next character is in
14027 * the opposite category, that node is closed up, and the function
14028 * returns. Then regatom is called again, and a new node is
14029 * created for the new category. */
14030 U8 node_type = EXACT;
14032 /* Assume the node will be fully used; the excess is given back at
14033 * the end. Under /i, we may need to temporarily add the fold of
14034 * an extra character or two at the end to check for splitting
14035 * multi-char folds, so allocate extra space for that. We can't
14036 * make any other length assumptions, as a byte input sequence
14037 * could shrink down. */
14038 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14042 ? UTF8_MAXBYTES_CASE
14043 /* Max non-UTF-8 expansion is 2 */ : 2)));
14045 bool next_is_quantifier;
14046 char * oldp = NULL;
14048 /* We can convert EXACTF nodes to EXACTFU if they contain only
14049 * characters that match identically regardless of the target
14050 * string's UTF8ness. The reason to do this is that EXACTF is not
14051 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14054 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14055 * contain only above-Latin1 characters (hence must be in UTF8),
14056 * which don't participate in folds with Latin1-range characters,
14057 * as the latter's folds aren't known until runtime. */
14058 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14060 /* Single-character EXACTish nodes are almost always SIMPLE. This
14061 * allows us to override this as encountered */
14062 U8 maybe_SIMPLE = SIMPLE;
14064 /* Does this node contain something that can't match unless the
14065 * target string is (also) in UTF-8 */
14066 bool requires_utf8_target = FALSE;
14068 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14069 bool has_ss = FALSE;
14071 /* So is the MICRO SIGN */
14072 bool has_micro_sign = FALSE;
14074 /* Set when we fill up the current node and there is still more
14075 * text to process */
14078 /* Allocate an EXACT node. The node_type may change below to
14079 * another EXACTish node, but since the size of the node doesn't
14080 * change, it works */
14081 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14083 FILL_NODE(ret, node_type);
14086 s = STRING(REGNODE_p(ret));
14097 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14098 maybe_SIMPLE = SIMPLE;
14099 requires_utf8_target = FALSE;
14101 has_micro_sign = FALSE;
14105 /* This breaks under rare circumstances. If folding, we do not
14106 * want to split a node at a character that is a non-final in a
14107 * multi-char fold, as an input string could just happen to want to
14108 * match across the node boundary. The code at the end of the loop
14109 * looks for this, and backs off until it finds not such a
14110 * character, but it is possible (though extremely, extremely
14111 * unlikely) for all characters in the node to be non-final fold
14112 * ones, in which case we just leave the node fully filled, and
14113 * hope that it doesn't match the string in just the wrong place */
14115 assert( ! UTF /* Is at the beginning of a character */
14116 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14117 || UTF8_IS_START(UCHARAT(RExC_parse)));
14119 overflowed = FALSE;
14121 /* Here, we have a literal character. Find the maximal string of
14122 * them in the input that we can fit into a single EXACTish node.
14123 * We quit at the first non-literal or when the node gets full, or
14124 * under /i the categorization of folding/non-folding character
14126 while (p < RExC_end && len < upper_fill) {
14128 /* In most cases each iteration adds one byte to the output.
14129 * The exceptions override this */
14130 Size_t added_len = 1;
14136 /* White space has already been ignored */
14137 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14138 || ! is_PATWS_safe((p), RExC_end, UTF));
14141 const char* message;
14154 /* Literal Escapes Switch
14156 This switch is meant to handle escape sequences that
14157 resolve to a literal character.
14159 Every escape sequence that represents something
14160 else, like an assertion or a char class, is handled
14161 in the switch marked 'Special Escapes' above in this
14162 routine, but also has an entry here as anything that
14163 isn't explicitly mentioned here will be treated as
14164 an unescaped equivalent literal.
14167 switch ((U8)*++p) {
14169 /* These are all the special escapes. */
14170 case 'A': /* Start assertion */
14171 case 'b': case 'B': /* Word-boundary assertion*/
14172 case 'C': /* Single char !DANGEROUS! */
14173 case 'd': case 'D': /* digit class */
14174 case 'g': case 'G': /* generic-backref, pos assertion */
14175 case 'h': case 'H': /* HORIZWS */
14176 case 'k': case 'K': /* named backref, keep marker */
14177 case 'p': case 'P': /* Unicode property */
14178 case 'R': /* LNBREAK */
14179 case 's': case 'S': /* space class */
14180 case 'v': case 'V': /* VERTWS */
14181 case 'w': case 'W': /* word class */
14182 case 'X': /* eXtended Unicode "combining
14183 character sequence" */
14184 case 'z': case 'Z': /* End of line/string assertion */
14188 /* Anything after here is an escape that resolves to a
14189 literal. (Except digits, which may or may not)
14195 case 'N': /* Handle a single-code point named character. */
14196 RExC_parse = p + 1;
14197 if (! grok_bslash_N(pRExC_state,
14198 NULL, /* Fail if evaluates to
14199 anything other than a
14200 single code point */
14201 &ender, /* The returned single code
14203 NULL, /* Don't need a count of
14204 how many code points */
14209 if (*flagp & NEED_UTF8)
14210 FAIL("panic: grok_bslash_N set NEED_UTF8");
14211 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14213 /* Here, it wasn't a single code point. Go close
14214 * up this EXACTish node. The switch() prior to
14215 * this switch handles the other cases */
14216 RExC_parse = p = oldp;
14220 RExC_parse = parse_start;
14222 /* The \N{} means the pattern, if previously /d,
14223 * becomes /u. That means it can't be an EXACTF node,
14224 * but an EXACTFU */
14225 if (node_type == EXACTF) {
14226 node_type = EXACTFU;
14228 /* If the node already contains something that
14229 * differs between EXACTF and EXACTFU, reparse it
14231 if (! maybe_exactfu) {
14252 ender = ESC_NATIVE;
14260 if (! grok_bslash_o(&p,
14265 (bool) RExC_strict,
14266 FALSE, /* No illegal cp's */
14269 RExC_parse = p; /* going to die anyway; point to
14270 exact spot of failure */
14274 if (message && TO_OUTPUT_WARNINGS(p)) {
14275 warn_non_literal_string(p, packed_warn, message);
14279 if (! grok_bslash_x(&p,
14284 (bool) RExC_strict,
14285 FALSE, /* No illegal cp's */
14288 RExC_parse = p; /* going to die anyway; point
14289 to exact spot of failure */
14293 if (message && TO_OUTPUT_WARNINGS(p)) {
14294 warn_non_literal_string(p, packed_warn, message);
14298 if (ender < 0x100) {
14299 if (RExC_recode_x_to_native) {
14300 ender = LATIN1_TO_NATIVE(ender);
14307 if (! grok_bslash_c(*p, &grok_c_char,
14308 &message, &packed_warn))
14310 /* going to die anyway; point to exact spot of
14312 RExC_parse = p + ((UTF)
14313 ? UTF8_SAFE_SKIP(p, RExC_end)
14318 ender = grok_c_char;
14320 if (message && TO_OUTPUT_WARNINGS(p)) {
14321 warn_non_literal_string(p, packed_warn, message);
14325 case '8': case '9': /* must be a backreference */
14327 /* we have an escape like \8 which cannot be an octal escape
14328 * so we exit the loop, and let the outer loop handle this
14329 * escape which may or may not be a legitimate backref. */
14331 case '1': case '2': case '3':case '4':
14332 case '5': case '6': case '7':
14333 /* When we parse backslash escapes there is ambiguity
14334 * between backreferences and octal escapes. Any escape
14335 * from \1 - \9 is a backreference, any multi-digit
14336 * escape which does not start with 0 and which when
14337 * evaluated as decimal could refer to an already
14338 * parsed capture buffer is a back reference. Anything
14341 * Note this implies that \118 could be interpreted as
14342 * 118 OR as "\11" . "8" depending on whether there
14343 * were 118 capture buffers defined already in the
14346 /* NOTE, RExC_npar is 1 more than the actual number of
14347 * parens we have seen so far, hence the "<" as opposed
14349 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14350 { /* Not to be treated as an octal constant, go
14358 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14359 | PERL_SCAN_NOTIFY_ILLDIGIT;
14361 ender = grok_oct(p, &numlen, &flags, NULL);
14363 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14364 && isDIGIT(*p) /* like \08, \178 */
14365 && ckWARN(WARN_REGEXP))
14367 reg_warn_non_literal_string(
14369 form_alien_digit_msg(8, numlen, p,
14370 RExC_end, UTF, FALSE));
14376 FAIL("Trailing \\");
14379 if (isALPHANUMERIC(*p)) {
14380 /* An alpha followed by '{' is going to fail next
14381 * iteration, so don't output this warning in that
14383 if (! isALPHA(*p) || *(p + 1) != '{') {
14384 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14385 " passed through", p);
14388 goto normal_default;
14389 } /* End of switch on '\' */
14392 /* Trying to gain new uses for '{' without breaking too
14393 * much existing code is hard. The solution currently
14395 * 1) If there is no ambiguity that a '{' should always
14396 * be taken literally, at the start of a construct, we
14398 * 2) If the literal '{' conflicts with our desired use
14399 * of it as a metacharacter, we die. The deprecation
14400 * cycles for this have come and gone.
14401 * 3) If there is ambiguity, we raise a simple warning.
14402 * This could happen, for example, if the user
14403 * intended it to introduce a quantifier, but slightly
14404 * misspelled the quantifier. Without this warning,
14405 * the quantifier would silently be taken as a literal
14406 * string of characters instead of a meta construct */
14407 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14409 || ( p > parse_start + 1
14410 && isALPHA_A(*(p - 1))
14411 && *(p - 2) == '\\')
14412 || new_regcurly(p, RExC_end))
14414 RExC_parse = p + 1;
14415 vFAIL("Unescaped left brace in regex is "
14418 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14419 " passed through");
14421 goto normal_default;
14424 if (p > RExC_parse && RExC_strict) {
14425 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14428 default: /* A literal character */
14430 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14432 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14433 &numlen, UTF8_ALLOW_DEFAULT);
14439 } /* End of switch on the literal */
14441 /* Here, have looked at the literal character, and <ender>
14442 * contains its ordinal; <p> points to the character after it.
14446 REQUIRE_UTF8(flagp);
14447 if ( UNICODE_IS_PERL_EXTENDED(ender)
14448 && TO_OUTPUT_WARNINGS(p))
14450 ckWARN2_non_literal_string(p,
14451 packWARN(WARN_PORTABLE),
14452 PL_extended_cp_format,
14457 /* We need to check if the next non-ignored thing is a
14458 * quantifier. Move <p> to after anything that should be
14459 * ignored, which, as a side effect, positions <p> for the next
14460 * loop iteration */
14461 skip_to_be_ignored_text(pRExC_state, &p,
14462 FALSE /* Don't force to /x */ );
14464 /* If the next thing is a quantifier, it applies to this
14465 * character only, which means that this character has to be in
14466 * its own node and can't just be appended to the string in an
14467 * existing node, so if there are already other characters in
14468 * the node, close the node with just them, and set up to do
14469 * this character again next time through, when it will be the
14470 * only thing in its new node */
14472 next_is_quantifier = LIKELY(p < RExC_end)
14473 && UNLIKELY(ISMULT2(p));
14475 if (next_is_quantifier && LIKELY(len)) {
14480 /* Ready to add 'ender' to the node */
14482 if (! FOLD) { /* The simple case, just append the literal */
14485 /* Don't output if it would overflow */
14486 if (UNLIKELY(len > max_string_len - ((UTF)
14487 ? UVCHR_SKIP(ender)
14494 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14495 *(s++) = (char) ender;
14498 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14499 added_len = (char *) new_s - s;
14500 s = (char *) new_s;
14503 requires_utf8_target = TRUE;
14507 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14509 /* Here are folding under /l, and the code point is
14510 * problematic. If this is the first character in the
14511 * node, change the node type to folding. Otherwise, if
14512 * this is the first problematic character, close up the
14513 * existing node, so can start a new node with this one */
14515 node_type = EXACTFL;
14516 RExC_contains_locale = 1;
14518 else if (node_type == EXACT) {
14523 /* This problematic code point means we can't simplify
14525 maybe_exactfu = FALSE;
14527 /* Here, we are adding a problematic fold character.
14528 * "Problematic" in this context means that its fold isn't
14529 * known until runtime. (The non-problematic code points
14530 * are the above-Latin1 ones that fold to also all
14531 * above-Latin1. Their folds don't vary no matter what the
14532 * locale is.) But here we have characters whose fold
14533 * depends on the locale. We just add in the unfolded
14534 * character, and wait until runtime to fold it */
14535 goto not_fold_common;
14537 else /* regular fold; see if actually is in a fold */
14538 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14540 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14542 /* Here, folding, but the character isn't in a fold.
14544 * Start a new node if previous characters in the node were
14546 if (len && node_type != EXACT) {
14551 /* Here, continuing a node with non-folded characters. Add
14553 goto not_fold_common;
14555 else { /* Here, does participate in some fold */
14557 /* If this is the first character in the node, change its
14558 * type to folding. Otherwise, if this is the first
14559 * folding character in the node, close up the existing
14560 * node, so can start a new node with this one. */
14562 node_type = compute_EXACTish(pRExC_state);
14564 else if (node_type == EXACT) {
14569 if (UTF) { /* Alway use the folded value for UTF-8
14571 if (UVCHR_IS_INVARIANT(ender)) {
14572 if (UNLIKELY(len + 1 > max_string_len)) {
14577 *(s)++ = (U8) toFOLD(ender);
14580 UV folded = _to_uni_fold_flags(
14582 (U8 *) s, /* We have allocated extra space
14583 in 's' so can't run off the
14586 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14587 ? FOLD_FLAGS_NOMIX_ASCII
14589 if (UNLIKELY(len + added_len > max_string_len)) {
14597 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14599 /* U+B5 folds to the MU, so its possible for a
14600 * non-UTF-8 target to match it */
14601 requires_utf8_target = TRUE;
14605 else { /* Here is non-UTF8. */
14607 /* The fold will be one or (rarely) two characters.
14608 * Check that there's room for at least a single one
14609 * before setting any flags, etc. Because otherwise an
14610 * overflowing character could cause a flag to be set
14611 * even though it doesn't end up in this node. (For
14612 * the two character fold, we check again, before
14613 * setting any flags) */
14614 if (UNLIKELY(len + 1 > max_string_len)) {
14619 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14620 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14621 || UNICODE_DOT_DOT_VERSION > 0)
14623 /* On non-ancient Unicodes, check for the only possible
14624 * multi-char fold */
14625 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14627 /* This potential multi-char fold means the node
14628 * can't be simple (because it could match more
14629 * than a single char). And in some cases it will
14630 * match 'ss', so set that flag */
14634 /* It can't change to be an EXACTFU (unless already
14635 * is one). We fold it iff under /u rules. */
14636 if (node_type != EXACTFU) {
14637 maybe_exactfu = FALSE;
14640 if (UNLIKELY(len + 2 > max_string_len)) {
14649 goto done_with_this_char;
14652 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14654 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14656 /* Also, the sequence 'ss' is special when not
14657 * under /u. If the target string is UTF-8, it
14658 * should match SHARP S; otherwise it won't. So,
14659 * here we have to exclude the possibility of this
14660 * node moving to /u.*/
14662 maybe_exactfu = FALSE;
14665 /* Here, the fold will be a single character */
14667 if (UNLIKELY(ender == MICRO_SIGN)) {
14668 has_micro_sign = TRUE;
14670 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14672 /* If the character's fold differs between /d and
14673 * /u, this can't change to be an EXACTFU node */
14674 maybe_exactfu = FALSE;
14677 *(s++) = (DEPENDS_SEMANTICS)
14678 ? (char) toFOLD(ender)
14680 /* Under /u, the fold of any character in
14681 * the 0-255 range happens to be its
14682 * lowercase equivalent, except for LATIN
14683 * SMALL LETTER SHARP S, which was handled
14684 * above, and the MICRO SIGN, whose fold
14685 * requires UTF-8 to represent. */
14686 : (char) toLOWER_L1(ender);
14688 } /* End of adding current character to the node */
14690 done_with_this_char:
14694 if (next_is_quantifier) {
14696 /* Here, the next input is a quantifier, and to get here,
14697 * the current character is the only one in the node. */
14701 } /* End of loop through literal characters */
14703 /* Here we have either exhausted the input or run out of room in
14704 * the node. If the former, we are done. (If we encountered a
14705 * character that can't be in the node, transfer is made directly
14706 * to <loopdone>, and so we wouldn't have fallen off the end of the
14708 if (LIKELY(! overflowed)) {
14712 /* Here we have run out of room. We can grow plain EXACT and
14713 * LEXACT nodes. If the pattern is gigantic enough, though,
14714 * eventually we'll have to artificially chunk the pattern into
14715 * multiple nodes. */
14716 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14717 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14718 Size_t overhead_expansion = 0;
14720 Size_t max_nodes_for_string;
14724 /* Here we couldn't fit the final character in the current
14725 * node, so it will have to be reparsed, no matter what else we
14729 /* If would have overflowed a regular EXACT node, switch
14730 * instead to an LEXACT. The code below is structured so that
14731 * the actual growing code is common to changing from an EXACT
14732 * or just increasing the LEXACT size. This means that we have
14733 * to save the string in the EXACT case before growing, and
14734 * then copy it afterwards to its new location */
14735 if (node_type == EXACT) {
14736 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14737 RExC_emit += overhead_expansion;
14738 Copy(s0, temp, len, char);
14741 /* Ready to grow. If it was a plain EXACT, the string was
14742 * saved, and the first few bytes of it overwritten by adding
14743 * an argument field. We assume, as we do elsewhere in this
14744 * file, that one byte of remaining input will translate into
14745 * one byte of output, and if that's too small, we grow again,
14746 * if too large the excess memory is freed at the end */
14748 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14749 achievable = MIN(max_nodes_for_string,
14750 current_string_nodes + STR_SZ(RExC_end - p));
14751 delta = achievable - current_string_nodes;
14753 /* If there is just no more room, go finish up this chunk of
14759 change_engine_size(pRExC_state, delta + overhead_expansion);
14760 current_string_nodes += delta;
14762 = sizeof(struct regnode) * current_string_nodes;
14763 upper_fill = max_string_len + 1;
14765 /* If the length was small, we know this was originally an
14766 * EXACT node now converted to LEXACT, and the string has to be
14767 * restored. Otherwise the string was untouched. 260 is just
14768 * a number safely above 255 so don't have to worry about
14769 * getting it precise */
14771 node_type = LEXACT;
14772 FILL_NODE(ret, node_type);
14773 s0 = STRING(REGNODE_p(ret));
14774 Copy(temp, s0, len, char);
14778 goto continue_parse;
14781 bool splittable = FALSE;
14782 bool backed_up = FALSE;
14783 char * e; /* should this be U8? */
14784 char * s_start; /* should this be U8? */
14786 /* Here is /i. Running out of room creates a problem if we are
14787 * folding, and the split happens in the middle of a
14788 * multi-character fold, as a match that should have occurred,
14789 * won't, due to the way nodes are matched, and our artificial
14790 * boundary. So back off until we aren't splitting such a
14791 * fold. If there is no such place to back off to, we end up
14792 * taking the entire node as-is. This can happen if the node
14793 * consists entirely of 'f' or entirely of 's' characters (or
14794 * things that fold to them) as 'ff' and 'ss' are
14795 * multi-character folds.
14797 * The Unicode standard says that multi character folds consist
14798 * of either two or three characters. That means we would be
14799 * splitting one if the final character in the node is at the
14800 * beginning of either type, or is the second of a three
14804 * ender is the code point of the character that won't fit
14806 * s points to just beyond the final byte in the node.
14807 * It's where we would place ender if there were
14808 * room, and where in fact we do place ender's fold
14809 * in the code below, as we've over-allocated space
14810 * for s0 (hence s) to allow for this
14811 * e starts at 's' and advances as we append things.
14812 * old_s is the same as 's'. (If ender had fit, 's' would
14813 * have been advanced to beyond it).
14814 * old_old_s points to the beginning byte of the final
14815 * character in the node
14816 * p points to the beginning byte in the input of the
14817 * character beyond 'ender'.
14818 * oldp points to the beginning byte in the input of
14821 * In the case of /il, we haven't folded anything that could be
14822 * affected by the locale. That means only above-Latin1
14823 * characters that fold to other above-latin1 characters get
14824 * folded at compile time. To check where a good place to
14825 * split nodes is, everything in it will have to be folded.
14826 * The boolean 'maybe_exactfu' keeps track in /il if there are
14827 * any unfolded characters in the node. */
14828 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14830 /* If we do need to fold the node, we need a place to store the
14831 * folded copy, and a way to map back to the unfolded original
14833 char * locfold_buf = NULL;
14834 Size_t * loc_correspondence = NULL;
14836 if (! need_to_fold_loc) { /* The normal case. Just
14837 initialize to the actual node */
14840 s = old_old_s; /* Point to the beginning of the final char
14841 that fits in the node */
14845 /* Here, we have filled a /il node, and there are unfolded
14846 * characters in it. If the runtime locale turns out to be
14847 * UTF-8, there are possible multi-character folds, just
14848 * like when not under /l. The node hence can't terminate
14849 * in the middle of such a fold. To determine this, we
14850 * have to create a folded copy of this node. That means
14851 * reparsing the node, folding everything assuming a UTF-8
14852 * locale. (If at runtime it isn't such a locale, the
14853 * actions here wouldn't have been necessary, but we have
14854 * to assume the worst case.) If we find we need to back
14855 * off the folded string, we do so, and then map that
14856 * position back to the original unfolded node, which then
14857 * gets output, truncated at that spot */
14859 char * redo_p = RExC_parse;
14863 /* Allow enough space assuming a single byte input folds to
14864 * a single byte output, plus assume that the two unparsed
14865 * characters (that we may need) fold to the largest number
14866 * of bytes possible, plus extra for one more worst case
14867 * scenario. In the loop below, if we start eating into
14868 * that final spare space, we enlarge this initial space */
14869 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14871 Newxz(locfold_buf, size, char);
14872 Newxz(loc_correspondence, size, Size_t);
14874 /* Redo this node's parse, folding into 'locfold_buf' */
14875 redo_p = RExC_parse;
14876 old_redo_e = redo_e = locfold_buf;
14877 while (redo_p <= oldp) {
14879 old_redo_e = redo_e;
14880 loc_correspondence[redo_e - locfold_buf]
14881 = redo_p - RExC_parse;
14886 (void) _to_utf8_fold_flags((U8 *) redo_p,
14891 redo_e += added_len;
14892 redo_p += UTF8SKIP(redo_p);
14896 /* Note that if this code is run on some ancient
14897 * Unicode versions, SHARP S doesn't fold to 'ss',
14898 * but rather than clutter the code with #ifdef's,
14899 * as is done above, we ignore that possibility.
14900 * This is ok because this code doesn't affect what
14901 * gets matched, but merely where the node gets
14903 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14904 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14914 /* If we're getting so close to the end that a
14915 * worst-case fold in the next character would cause us
14916 * to overflow, increase, assuming one byte output byte
14917 * per one byte input one, plus room for another worst
14919 if ( redo_p <= oldp
14920 && redo_e > locfold_buf + size
14921 - (UTF8_MAXBYTES_CASE + 1))
14923 Size_t new_size = size
14925 + UTF8_MAXBYTES_CASE + 1;
14926 Ptrdiff_t e_offset = redo_e - locfold_buf;
14928 Renew(locfold_buf, new_size, char);
14929 Renew(loc_correspondence, new_size, Size_t);
14932 redo_e = locfold_buf + e_offset;
14936 /* Set so that things are in terms of the folded, temporary
14939 s_start = locfold_buf;
14944 /* Here, we have 's', 's_start' and 'e' set up to point to the
14945 * input that goes into the node, folded.
14947 * If the final character of the node and the fold of ender
14948 * form the first two characters of a three character fold, we
14949 * need to peek ahead at the next (unparsed) character in the
14950 * input to determine if the three actually do form such a
14951 * fold. Just looking at that character is not generally
14952 * sufficient, as it could be, for example, an escape sequence
14953 * that evaluates to something else, and it needs to be folded.
14955 * khw originally thought to just go through the parse loop one
14956 * extra time, but that doesn't work easily as that iteration
14957 * could cause things to think that the parse is over and to
14958 * goto loopdone. The character could be a '$' for example, or
14959 * the character beyond could be a quantifier, and other
14960 * glitches as well.
14962 * The solution used here for peeking ahead is to look at that
14963 * next character. If it isn't ASCII punctuation, then it will
14964 * be something that continues in an EXACTish node if there
14965 * were space. We append the fold of it to s, having reserved
14966 * enough room in s0 for the purpose. If we can't reasonably
14967 * peek ahead, we instead assume the worst case: that it is
14968 * something that would form the completion of a multi-char
14971 * If we can't split between s and ender, we work backwards
14972 * character-by-character down to s0. At each current point
14973 * see if we are at the beginning of a multi-char fold. If so,
14974 * that means we would be splitting the fold across nodes, and
14975 * so we back up one and try again.
14977 * If we're not at the beginning, we still could be at the
14978 * final two characters of a (rare) three character fold. We
14979 * check if the sequence starting at the character before the
14980 * current position (and including the current and next
14981 * characters) is a three character fold. If not, the node can
14982 * be split here. If it is, we have to backup two characters
14985 * Otherwise, the node can be split at the current position.
14987 * The same logic is used for UTF-8 patterns and not */
14991 /* Append the fold of ender */
14992 (void) _to_uni_fold_flags(
14996 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14997 ? FOLD_FLAGS_NOMIX_ASCII
15001 /* 's' and the character folded to by ender may be the
15002 * first two of a three-character fold, in which case the
15003 * node should not be split here. That may mean examining
15004 * the so-far unparsed character starting at 'p'. But if
15005 * ender folded to more than one character, we already have
15006 * three characters to look at. Also, we first check if
15007 * the sequence consisting of s and the next character form
15008 * the first two of some three character fold. If not,
15009 * there's no need to peek ahead. */
15010 if ( added_len <= UTF8SKIP(e - added_len)
15011 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15013 /* Here, the two do form the beginning of a potential
15014 * three character fold. The unexamined character may
15015 * or may not complete it. Peek at it. It might be
15016 * something that ends the node or an escape sequence,
15017 * in which case we don't know without a lot of work
15018 * what it evaluates to, so we have to assume the worst
15019 * case: that it does complete the fold, and so we
15020 * can't split here. All such instances will have
15021 * that character be an ASCII punctuation character,
15022 * like a backslash. So, for that case, backup one and
15023 * drop down to try at that position */
15025 s = (char *) utf8_hop_back((U8 *) s, -1,
15030 /* Here, since it's not punctuation, it must be a
15031 * real character, and we can append its fold to
15032 * 'e' (having deliberately reserved enough space
15033 * for this eventuality) and drop down to check if
15034 * the three actually do form a folded sequence */
15035 (void) _to_utf8_fold_flags(
15036 (U8 *) p, (U8 *) RExC_end,
15039 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15040 ? FOLD_FLAGS_NOMIX_ASCII
15046 /* Here, we either have three characters available in
15047 * sequence starting at 's', or we have two characters and
15048 * know that the following one can't possibly be part of a
15049 * three character fold. We go through the node backwards
15050 * until we find a place where we can split it without
15051 * breaking apart a multi-character fold. At any given
15052 * point we have to worry about if such a fold begins at
15053 * the current 's', and also if a three-character fold
15054 * begins at s-1, (containing s and s+1). Splitting in
15055 * either case would break apart a fold */
15057 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15060 /* If is a multi-char fold, can't split here. Backup
15061 * one char and try again */
15062 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15068 /* If the two characters beginning at 's' are part of a
15069 * three character fold starting at the character
15070 * before s, we can't split either before or after s.
15071 * Backup two chars and try again */
15072 if ( LIKELY(s > s_start)
15073 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15076 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15081 /* Here there's no multi-char fold between s and the
15082 * next character following it. We can split */
15086 } while (s > s_start); /* End of loops backing up through the node */
15088 /* Here we either couldn't find a place to split the node,
15089 * or else we broke out of the loop setting 'splittable' to
15090 * true. In the latter case, the place to split is between
15091 * the first and second characters in the sequence starting
15097 else { /* Pattern not UTF-8 */
15098 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15099 || ASCII_FOLD_RESTRICTED)
15101 assert( toLOWER_L1(ender) < 256 );
15102 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15110 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15117 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15118 || ASCII_FOLD_RESTRICTED)
15120 assert( toLOWER_L1(ender) < 256 );
15121 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15131 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15137 if ( LIKELY(s > s_start)
15138 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15148 } while (s > s_start);
15155 /* Here, we are done backing up. If we didn't backup at all
15156 * (the likely case), just proceed */
15159 /* If we did find a place to split, reparse the entire node
15160 * stopping where we have calculated. */
15163 /* If we created a temporary folded string under /l, we
15164 * have to map that back to the original */
15165 if (need_to_fold_loc) {
15166 upper_fill = loc_correspondence[s - s_start];
15167 Safefree(locfold_buf);
15168 Safefree(loc_correspondence);
15170 if (upper_fill == 0) {
15171 FAIL2("panic: loc_correspondence[%d] is 0",
15172 (int) (s - s_start));
15176 upper_fill = s - s0;
15180 else if (need_to_fold_loc) {
15181 Safefree(locfold_buf);
15182 Safefree(loc_correspondence);
15185 /* Here the node consists entirely of non-final multi-char
15186 * folds. (Likely it is all 'f's or all 's's.) There's no
15187 * decent place to split it, so give up and just take the
15191 } /* End of verifying node ends with an appropriate char */
15193 /* We need to start the next node at the character that didn't fit
15197 loopdone: /* Jumped to when encounters something that shouldn't be
15200 /* Free up any over-allocated space; cast is to silence bogus
15201 * warning in MS VC */
15202 change_engine_size(pRExC_state,
15203 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15205 /* I (khw) don't know if you can get here with zero length, but the
15206 * old code handled this situation by creating a zero-length EXACT
15207 * node. Might as well be NOTHING instead */
15209 OP(REGNODE_p(ret)) = NOTHING;
15213 /* If the node type is EXACT here, check to see if it
15214 * should be EXACTL, or EXACT_REQ8. */
15215 if (node_type == EXACT) {
15217 node_type = EXACTL;
15219 else if (requires_utf8_target) {
15220 node_type = EXACT_REQ8;
15223 else if (node_type == LEXACT) {
15224 if (requires_utf8_target) {
15225 node_type = LEXACT_REQ8;
15229 if ( UNLIKELY(has_micro_sign || has_ss)
15230 && (node_type == EXACTFU || ( node_type == EXACTF
15231 && maybe_exactfu)))
15232 { /* These two conditions are problematic in non-UTF-8
15235 node_type = EXACTFUP;
15237 else if (node_type == EXACTFL) {
15239 /* 'maybe_exactfu' is deliberately set above to
15240 * indicate this node type, where all code points in it
15242 if (maybe_exactfu) {
15243 node_type = EXACTFLU8;
15246 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15248 /* A character that folds to more than one will
15249 * match multiple characters, so can't be SIMPLE.
15250 * We don't have to worry about this with EXACTFLU8
15251 * nodes just above, as they have already been
15252 * folded (since the fold doesn't vary at run
15253 * time). Here, if the final character in the node
15254 * folds to multiple, it can't be simple. (This
15255 * only has an effect if the node has only a single
15256 * character, hence the final one, as elsewhere we
15257 * turn off simple for nodes whose length > 1 */
15261 else if (node_type == EXACTF) { /* Means is /di */
15263 /* This intermediate variable is needed solely because
15264 * the asserts in the macro where used exceed Win32's
15265 * literal string capacity */
15266 char first_char = * STRING(REGNODE_p(ret));
15268 /* If 'maybe_exactfu' is clear, then we need to stay
15269 * /di. If it is set, it means there are no code
15270 * points that match differently depending on UTF8ness
15271 * of the target string, so it can become an EXACTFU
15273 if (! maybe_exactfu) {
15274 RExC_seen_d_op = TRUE;
15276 else if ( isALPHA_FOLD_EQ(first_char, 's')
15277 || isALPHA_FOLD_EQ(ender, 's'))
15279 /* But, if the node begins or ends in an 's' we
15280 * have to defer changing it into an EXACTFU, as
15281 * the node could later get joined with another one
15282 * that ends or begins with 's' creating an 'ss'
15283 * sequence which would then wrongly match the
15284 * sharp s without the target being UTF-8. We
15285 * create a special node that we resolve later when
15286 * we join nodes together */
15288 node_type = EXACTFU_S_EDGE;
15291 node_type = EXACTFU;
15295 if (requires_utf8_target && node_type == EXACTFU) {
15296 node_type = EXACTFU_REQ8;
15300 OP(REGNODE_p(ret)) = node_type;
15301 setSTR_LEN(REGNODE_p(ret), len);
15302 RExC_emit += STR_SZ(len);
15304 /* If the node isn't a single character, it can't be SIMPLE */
15305 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15309 *flagp |= HASWIDTH | maybe_SIMPLE;
15312 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15316 /* len is STRLEN which is unsigned, need to copy to signed */
15319 vFAIL("Internal disaster");
15322 } /* End of label 'defchar:' */
15324 } /* End of giant switch on input character */
15326 /* Position parse to next real character */
15327 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15328 FALSE /* Don't force to /x */ );
15329 if ( *RExC_parse == '{'
15330 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15332 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15334 vFAIL("Unescaped left brace in regex is illegal here");
15336 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15337 " passed through");
15345 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15347 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15348 * sets up the bitmap and any flags, removing those code points from the
15349 * inversion list, setting it to NULL should it become completely empty */
15353 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15354 assert(PL_regkind[OP(node)] == ANYOF);
15356 /* There is no bitmap for this node type */
15357 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15361 ANYOF_BITMAP_ZERO(node);
15362 if (*invlist_ptr) {
15364 /* This gets set if we actually need to modify things */
15365 bool change_invlist = FALSE;
15369 /* Start looking through *invlist_ptr */
15370 invlist_iterinit(*invlist_ptr);
15371 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15375 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15376 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15379 /* Quit if are above what we should change */
15380 if (start >= NUM_ANYOF_CODE_POINTS) {
15384 change_invlist = TRUE;
15386 /* Set all the bits in the range, up to the max that we are doing */
15387 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15389 : NUM_ANYOF_CODE_POINTS - 1;
15390 for (i = start; i <= (int) high; i++) {
15391 if (! ANYOF_BITMAP_TEST(node, i)) {
15392 ANYOF_BITMAP_SET(node, i);
15396 invlist_iterfinish(*invlist_ptr);
15398 /* Done with loop; remove any code points that are in the bitmap from
15399 * *invlist_ptr; similarly for code points above the bitmap if we have
15400 * a flag to match all of them anyways */
15401 if (change_invlist) {
15402 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15404 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15405 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15408 /* If have completely emptied it, remove it completely */
15409 if (_invlist_len(*invlist_ptr) == 0) {
15410 SvREFCNT_dec_NN(*invlist_ptr);
15411 *invlist_ptr = NULL;
15416 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15417 Character classes ([:foo:]) can also be negated ([:^foo:]).
15418 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15419 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15420 but trigger failures because they are currently unimplemented. */
15422 #define POSIXCC_DONE(c) ((c) == ':')
15423 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15424 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15425 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15427 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15428 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15429 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15431 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15433 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15435 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15436 if (posix_warnings) { \
15437 if (! RExC_warn_text ) RExC_warn_text = \
15438 (AV *) sv_2mortal((SV *) newAV()); \
15439 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15443 REPORT_LOCATION_ARGS(p))); \
15446 #define CLEAR_POSIX_WARNINGS() \
15448 if (posix_warnings && RExC_warn_text) \
15449 av_clear(RExC_warn_text); \
15452 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15454 CLEAR_POSIX_WARNINGS(); \
15459 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15461 const char * const s, /* Where the putative posix class begins.
15462 Normally, this is one past the '['. This
15463 parameter exists so it can be somewhere
15464 besides RExC_parse. */
15465 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15467 AV ** posix_warnings, /* Where to place any generated warnings, or
15469 const bool check_only /* Don't die if error */
15472 /* This parses what the caller thinks may be one of the three POSIX
15474 * 1) a character class, like [:blank:]
15475 * 2) a collating symbol, like [. .]
15476 * 3) an equivalence class, like [= =]
15477 * In the latter two cases, it croaks if it finds a syntactically legal
15478 * one, as these are not handled by Perl.
15480 * The main purpose is to look for a POSIX character class. It returns:
15481 * a) the class number
15482 * if it is a completely syntactically and semantically legal class.
15483 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15484 * closing ']' of the class
15485 * b) OOB_NAMEDCLASS
15486 * if it appears that one of the three POSIX constructs was meant, but
15487 * its specification was somehow defective. 'updated_parse_ptr', if
15488 * not NULL, is set to point to the character just after the end
15489 * character of the class. See below for handling of warnings.
15490 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15491 * if it doesn't appear that a POSIX construct was intended.
15492 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15495 * In b) there may be errors or warnings generated. If 'check_only' is
15496 * TRUE, then any errors are discarded. Warnings are returned to the
15497 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15498 * instead it is NULL, warnings are suppressed.
15500 * The reason for this function, and its complexity is that a bracketed
15501 * character class can contain just about anything. But it's easy to
15502 * mistype the very specific posix class syntax but yielding a valid
15503 * regular bracketed class, so it silently gets compiled into something
15504 * quite unintended.
15506 * The solution adopted here maintains backward compatibility except that
15507 * it adds a warning if it looks like a posix class was intended but
15508 * improperly specified. The warning is not raised unless what is input
15509 * very closely resembles one of the 14 legal posix classes. To do this,
15510 * it uses fuzzy parsing. It calculates how many single-character edits it
15511 * would take to transform what was input into a legal posix class. Only
15512 * if that number is quite small does it think that the intention was a
15513 * posix class. Obviously these are heuristics, and there will be cases
15514 * where it errs on one side or another, and they can be tweaked as
15515 * experience informs.
15517 * The syntax for a legal posix class is:
15519 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15521 * What this routine considers syntactically to be an intended posix class
15522 * is this (the comments indicate some restrictions that the pattern
15525 * qr/(?x: \[? # The left bracket, possibly
15527 * \h* # possibly followed by blanks
15528 * (?: \^ \h* )? # possibly a misplaced caret
15529 * [:;]? # The opening class character,
15530 * # possibly omitted. A typo
15531 * # semi-colon can also be used.
15533 * \^? # possibly a correctly placed
15534 * # caret, but not if there was also
15535 * # a misplaced one
15537 * .{3,15} # The class name. If there are
15538 * # deviations from the legal syntax,
15539 * # its edit distance must be close
15540 * # to a real class name in order
15541 * # for it to be considered to be
15542 * # an intended posix class.
15544 * [[:punct:]]? # The closing class character,
15545 * # possibly omitted. If not a colon
15546 * # nor semi colon, the class name
15547 * # must be even closer to a valid
15550 * \]? # The right bracket, possibly
15554 * In the above, \h must be ASCII-only.
15556 * These are heuristics, and can be tweaked as field experience dictates.
15557 * There will be cases when someone didn't intend to specify a posix class
15558 * that this warns as being so. The goal is to minimize these, while
15559 * maximizing the catching of things intended to be a posix class that
15560 * aren't parsed as such.
15564 const char * const e = RExC_end;
15565 unsigned complement = 0; /* If to complement the class */
15566 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15567 bool has_opening_bracket = FALSE;
15568 bool has_opening_colon = FALSE;
15569 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15571 const char * possible_end = NULL; /* used for a 2nd parse pass */
15572 const char* name_start; /* ptr to class name first char */
15574 /* If the number of single-character typos the input name is away from a
15575 * legal name is no more than this number, it is considered to have meant
15576 * the legal name */
15577 int max_distance = 2;
15579 /* to store the name. The size determines the maximum length before we
15580 * decide that no posix class was intended. Should be at least
15581 * sizeof("alphanumeric") */
15583 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15585 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15587 CLEAR_POSIX_WARNINGS();
15590 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15593 if (*(p - 1) != '[') {
15594 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15595 found_problem = TRUE;
15598 has_opening_bracket = TRUE;
15601 /* They could be confused and think you can put spaces between the
15604 found_problem = TRUE;
15608 } while (p < e && isBLANK(*p));
15610 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15613 /* For [. .] and [= =]. These are quite different internally from [: :],
15614 * so they are handled separately. */
15615 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15616 and 1 for at least one char in it
15619 const char open_char = *p;
15620 const char * temp_ptr = p + 1;
15622 /* These two constructs are not handled by perl, and if we find a
15623 * syntactically valid one, we croak. khw, who wrote this code, finds
15624 * this explanation of them very unclear:
15625 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15626 * And searching the rest of the internet wasn't very helpful either.
15627 * It looks like just about any byte can be in these constructs,
15628 * depending on the locale. But unless the pattern is being compiled
15629 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15630 * In that case, it looks like [= =] isn't allowed at all, and that
15631 * [. .] could be any single code point, but for longer strings the
15632 * constituent characters would have to be the ASCII alphabetics plus
15633 * the minus-hyphen. Any sensible locale definition would limit itself
15634 * to these. And any portable one definitely should. Trying to parse
15635 * the general case is a nightmare (see [perl #127604]). So, this code
15636 * looks only for interiors of these constructs that match:
15638 * Using \w relaxes the apparent rules a little, without adding much
15639 * danger of mistaking something else for one of these constructs.
15641 * [. .] in some implementations described on the internet is usable to
15642 * escape a character that otherwise is special in bracketed character
15643 * classes. For example [.].] means a literal right bracket instead of
15644 * the ending of the class
15646 * [= =] can legitimately contain a [. .] construct, but we don't
15647 * handle this case, as that [. .] construct will later get parsed
15648 * itself and croak then. And [= =] is checked for even when not under
15649 * /l, as Perl has long done so.
15651 * The code below relies on there being a trailing NUL, so it doesn't
15652 * have to keep checking if the parse ptr < e.
15654 if (temp_ptr[1] == open_char) {
15657 else while ( temp_ptr < e
15658 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15663 if (*temp_ptr == open_char) {
15665 if (*temp_ptr == ']') {
15667 if (! found_problem && ! check_only) {
15668 RExC_parse = (char *) temp_ptr;
15669 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15670 "extensions", open_char, open_char);
15673 /* Here, the syntax wasn't completely valid, or else the call
15674 * is to check-only */
15675 if (updated_parse_ptr) {
15676 *updated_parse_ptr = (char *) temp_ptr;
15679 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15683 /* If we find something that started out to look like one of these
15684 * constructs, but isn't, we continue below so that it can be checked
15685 * for being a class name with a typo of '.' or '=' instead of a colon.
15689 /* Here, we think there is a possibility that a [: :] class was meant, and
15690 * we have the first real character. It could be they think the '^' comes
15693 found_problem = TRUE;
15694 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15699 found_problem = TRUE;
15703 } while (p < e && isBLANK(*p));
15705 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15709 /* But the first character should be a colon, which they could have easily
15710 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15711 * distinguish from a colon, so treat that as a colon). */
15714 has_opening_colon = TRUE;
15716 else if (*p == ';') {
15717 found_problem = TRUE;
15719 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15720 has_opening_colon = TRUE;
15723 found_problem = TRUE;
15724 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15726 /* Consider an initial punctuation (not one of the recognized ones) to
15727 * be a left terminator */
15728 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15733 /* They may think that you can put spaces between the components */
15735 found_problem = TRUE;
15739 } while (p < e && isBLANK(*p));
15741 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15746 /* We consider something like [^:^alnum:]] to not have been intended to
15747 * be a posix class, but XXX maybe we should */
15749 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15756 /* Again, they may think that you can put spaces between the components */
15758 found_problem = TRUE;
15762 } while (p < e && isBLANK(*p));
15764 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15769 /* XXX This ']' may be a typo, and something else was meant. But
15770 * treating it as such creates enough complications, that that
15771 * possibility isn't currently considered here. So we assume that the
15772 * ']' is what is intended, and if we've already found an initial '[',
15773 * this leaves this construct looking like [:] or [:^], which almost
15774 * certainly weren't intended to be posix classes */
15775 if (has_opening_bracket) {
15776 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15779 /* But this function can be called when we parse the colon for
15780 * something like qr/[alpha:]]/, so we back up to look for the
15785 found_problem = TRUE;
15786 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15788 else if (*p != ':') {
15790 /* XXX We are currently very restrictive here, so this code doesn't
15791 * consider the possibility that, say, /[alpha.]]/ was intended to
15792 * be a posix class. */
15793 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15796 /* Here we have something like 'foo:]'. There was no initial colon,
15797 * and we back up over 'foo. XXX Unlike the going forward case, we
15798 * don't handle typos of non-word chars in the middle */
15799 has_opening_colon = FALSE;
15802 while (p > RExC_start && isWORDCHAR(*p)) {
15807 /* Here, we have positioned ourselves to where we think the first
15808 * character in the potential class is */
15811 /* Now the interior really starts. There are certain key characters that
15812 * can end the interior, or these could just be typos. To catch both
15813 * cases, we may have to do two passes. In the first pass, we keep on
15814 * going unless we come to a sequence that matches
15815 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15816 * This means it takes a sequence to end the pass, so two typos in a row if
15817 * that wasn't what was intended. If the class is perfectly formed, just
15818 * this one pass is needed. We also stop if there are too many characters
15819 * being accumulated, but this number is deliberately set higher than any
15820 * real class. It is set high enough so that someone who thinks that
15821 * 'alphanumeric' is a correct name would get warned that it wasn't.
15822 * While doing the pass, we keep track of where the key characters were in
15823 * it. If we don't find an end to the class, and one of the key characters
15824 * was found, we redo the pass, but stop when we get to that character.
15825 * Thus the key character was considered a typo in the first pass, but a
15826 * terminator in the second. If two key characters are found, we stop at
15827 * the second one in the first pass. Again this can miss two typos, but
15828 * catches a single one
15830 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15831 * point to the first key character. For the second pass, it starts as -1.
15837 bool has_blank = FALSE;
15838 bool has_upper = FALSE;
15839 bool has_terminating_colon = FALSE;
15840 bool has_terminating_bracket = FALSE;
15841 bool has_semi_colon = FALSE;
15842 unsigned int name_len = 0;
15843 int punct_count = 0;
15847 /* Squeeze out blanks when looking up the class name below */
15848 if (isBLANK(*p) ) {
15850 found_problem = TRUE;
15855 /* The name will end with a punctuation */
15857 const char * peek = p + 1;
15859 /* Treat any non-']' punctuation followed by a ']' (possibly
15860 * with intervening blanks) as trying to terminate the class.
15861 * ']]' is very likely to mean a class was intended (but
15862 * missing the colon), but the warning message that gets
15863 * generated shows the error position better if we exit the
15864 * loop at the bottom (eventually), so skip it here. */
15866 if (peek < e && isBLANK(*peek)) {
15868 found_problem = TRUE;
15871 } while (peek < e && isBLANK(*peek));
15874 if (peek < e && *peek == ']') {
15875 has_terminating_bracket = TRUE;
15877 has_terminating_colon = TRUE;
15879 else if (*p == ';') {
15880 has_semi_colon = TRUE;
15881 has_terminating_colon = TRUE;
15884 found_problem = TRUE;
15891 /* Here we have punctuation we thought didn't end the class.
15892 * Keep track of the position of the key characters that are
15893 * more likely to have been class-enders */
15894 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15896 /* Allow just one such possible class-ender not actually
15897 * ending the class. */
15898 if (possible_end) {
15904 /* If we have too many punctuation characters, no use in
15906 if (++punct_count > max_distance) {
15910 /* Treat the punctuation as a typo. */
15911 input_text[name_len++] = *p;
15914 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15915 input_text[name_len++] = toLOWER(*p);
15917 found_problem = TRUE;
15919 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15920 input_text[name_len++] = *p;
15924 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15928 /* The declaration of 'input_text' is how long we allow a potential
15929 * class name to be, before saying they didn't mean a class name at
15931 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15936 /* We get to here when the possible class name hasn't been properly
15937 * terminated before:
15938 * 1) we ran off the end of the pattern; or
15939 * 2) found two characters, each of which might have been intended to
15940 * be the name's terminator
15941 * 3) found so many punctuation characters in the purported name,
15942 * that the edit distance to a valid one is exceeded
15943 * 4) we decided it was more characters than anyone could have
15944 * intended to be one. */
15946 found_problem = TRUE;
15948 /* In the final two cases, we know that looking up what we've
15949 * accumulated won't lead to a match, even a fuzzy one. */
15950 if ( name_len >= C_ARRAY_LENGTH(input_text)
15951 || punct_count > max_distance)
15953 /* If there was an intermediate key character that could have been
15954 * an intended end, redo the parse, but stop there */
15955 if (possible_end && possible_end != (char *) -1) {
15956 possible_end = (char *) -1; /* Special signal value to say
15957 we've done a first pass */
15962 /* Otherwise, it can't have meant to have been a class */
15963 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15966 /* If we ran off the end, and the final character was a punctuation
15967 * one, back up one, to look at that final one just below. Later, we
15968 * will restore the parse pointer if appropriate */
15969 if (name_len && p == e && isPUNCT(*(p-1))) {
15974 if (p < e && isPUNCT(*p)) {
15976 has_terminating_bracket = TRUE;
15978 /* If this is a 2nd ']', and the first one is just below this
15979 * one, consider that to be the real terminator. This gives a
15980 * uniform and better positioning for the warning message */
15982 && possible_end != (char *) -1
15983 && *possible_end == ']'
15984 && name_len && input_text[name_len - 1] == ']')
15989 /* And this is actually equivalent to having done the 2nd
15990 * pass now, so set it to not try again */
15991 possible_end = (char *) -1;
15996 has_terminating_colon = TRUE;
15998 else if (*p == ';') {
15999 has_semi_colon = TRUE;
16000 has_terminating_colon = TRUE;
16008 /* Here, we have a class name to look up. We can short circuit the
16009 * stuff below for short names that can't possibly be meant to be a
16010 * class name. (We can do this on the first pass, as any second pass
16011 * will yield an even shorter name) */
16012 if (name_len < 3) {
16013 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16016 /* Find which class it is. Initially switch on the length of the name.
16018 switch (name_len) {
16020 if (memEQs(name_start, 4, "word")) {
16021 /* this is not POSIX, this is the Perl \w */
16022 class_number = ANYOF_WORDCHAR;
16026 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16027 * graph lower print punct space upper
16028 * Offset 4 gives the best switch position. */
16029 switch (name_start[4]) {
16031 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16032 class_number = ANYOF_ALPHA;
16035 if (memBEGINs(name_start, 5, "spac")) /* space */
16036 class_number = ANYOF_SPACE;
16039 if (memBEGINs(name_start, 5, "grap")) /* graph */
16040 class_number = ANYOF_GRAPH;
16043 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16044 class_number = ANYOF_ASCII;
16047 if (memBEGINs(name_start, 5, "blan")) /* blank */
16048 class_number = ANYOF_BLANK;
16051 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16052 class_number = ANYOF_CNTRL;
16055 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16056 class_number = ANYOF_ALPHANUMERIC;
16059 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16060 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16061 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16062 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16065 if (memBEGINs(name_start, 5, "digi")) /* digit */
16066 class_number = ANYOF_DIGIT;
16067 else if (memBEGINs(name_start, 5, "prin")) /* print */
16068 class_number = ANYOF_PRINT;
16069 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16070 class_number = ANYOF_PUNCT;
16075 if (memEQs(name_start, 6, "xdigit"))
16076 class_number = ANYOF_XDIGIT;
16080 /* If the name exactly matches a posix class name the class number will
16081 * here be set to it, and the input almost certainly was meant to be a
16082 * posix class, so we can skip further checking. If instead the syntax
16083 * is exactly correct, but the name isn't one of the legal ones, we
16084 * will return that as an error below. But if neither of these apply,
16085 * it could be that no posix class was intended at all, or that one
16086 * was, but there was a typo. We tease these apart by doing fuzzy
16087 * matching on the name */
16088 if (class_number == OOB_NAMEDCLASS && found_problem) {
16089 const UV posix_names[][6] = {
16090 { 'a', 'l', 'n', 'u', 'm' },
16091 { 'a', 'l', 'p', 'h', 'a' },
16092 { 'a', 's', 'c', 'i', 'i' },
16093 { 'b', 'l', 'a', 'n', 'k' },
16094 { 'c', 'n', 't', 'r', 'l' },
16095 { 'd', 'i', 'g', 'i', 't' },
16096 { 'g', 'r', 'a', 'p', 'h' },
16097 { 'l', 'o', 'w', 'e', 'r' },
16098 { 'p', 'r', 'i', 'n', 't' },
16099 { 'p', 'u', 'n', 'c', 't' },
16100 { 's', 'p', 'a', 'c', 'e' },
16101 { 'u', 'p', 'p', 'e', 'r' },
16102 { 'w', 'o', 'r', 'd' },
16103 { 'x', 'd', 'i', 'g', 'i', 't' }
16105 /* The names of the above all have added NULs to make them the same
16106 * size, so we need to also have the real lengths */
16107 const UV posix_name_lengths[] = {
16108 sizeof("alnum") - 1,
16109 sizeof("alpha") - 1,
16110 sizeof("ascii") - 1,
16111 sizeof("blank") - 1,
16112 sizeof("cntrl") - 1,
16113 sizeof("digit") - 1,
16114 sizeof("graph") - 1,
16115 sizeof("lower") - 1,
16116 sizeof("print") - 1,
16117 sizeof("punct") - 1,
16118 sizeof("space") - 1,
16119 sizeof("upper") - 1,
16120 sizeof("word") - 1,
16121 sizeof("xdigit")- 1
16124 int temp_max = max_distance; /* Use a temporary, so if we
16125 reparse, we haven't changed the
16128 /* Use a smaller max edit distance if we are missing one of the
16130 if ( has_opening_bracket + has_opening_colon < 2
16131 || has_terminating_bracket + has_terminating_colon < 2)
16136 /* See if the input name is close to a legal one */
16137 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16139 /* Short circuit call if the lengths are too far apart to be
16141 if (abs( (int) (name_len - posix_name_lengths[i]))
16147 if (edit_distance(input_text,
16150 posix_name_lengths[i],
16154 { /* If it is close, it probably was intended to be a class */
16155 goto probably_meant_to_be;
16159 /* Here the input name is not close enough to a valid class name
16160 * for us to consider it to be intended to be a posix class. If
16161 * we haven't already done so, and the parse found a character that
16162 * could have been terminators for the name, but which we absorbed
16163 * as typos during the first pass, repeat the parse, signalling it
16164 * to stop at that character */
16165 if (possible_end && possible_end != (char *) -1) {
16166 possible_end = (char *) -1;
16171 /* Here neither pass found a close-enough class name */
16172 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16175 probably_meant_to_be:
16177 /* Here we think that a posix specification was intended. Update any
16179 if (updated_parse_ptr) {
16180 *updated_parse_ptr = (char *) p;
16183 /* If a posix class name was intended but incorrectly specified, we
16184 * output or return the warnings */
16185 if (found_problem) {
16187 /* We set flags for these issues in the parse loop above instead of
16188 * adding them to the list of warnings, because we can parse it
16189 * twice, and we only want one warning instance */
16191 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16194 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16196 if (has_semi_colon) {
16197 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16199 else if (! has_terminating_colon) {
16200 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16202 if (! has_terminating_bracket) {
16203 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16206 if ( posix_warnings
16208 && av_top_index(RExC_warn_text) > -1)
16210 *posix_warnings = RExC_warn_text;
16213 else if (class_number != OOB_NAMEDCLASS) {
16214 /* If it is a known class, return the class. The class number
16215 * #defines are structured so each complement is +1 to the normal
16217 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16219 else if (! check_only) {
16221 /* Here, it is an unrecognized class. This is an error (unless the
16222 * call is to check only, which we've already handled above) */
16223 const char * const complement_string = (complement)
16226 RExC_parse = (char *) p;
16227 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16229 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16233 return OOB_NAMEDCLASS;
16235 #undef ADD_POSIX_WARNING
16237 STATIC unsigned int
16238 S_regex_set_precedence(const U8 my_operator) {
16240 /* Returns the precedence in the (?[...]) construct of the input operator,
16241 * specified by its character representation. The precedence follows
16242 * general Perl rules, but it extends this so that ')' and ']' have (low)
16243 * precedence even though they aren't really operators */
16245 switch (my_operator) {
16261 NOT_REACHED; /* NOTREACHED */
16262 return 0; /* Silence compiler warning */
16265 STATIC regnode_offset
16266 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16267 I32 *flagp, U32 depth,
16268 char * const oregcomp_parse)
16270 /* Handle the (?[...]) construct to do set operations */
16272 U8 curchar; /* Current character being parsed */
16273 UV start, end; /* End points of code point ranges */
16274 SV* final = NULL; /* The end result inversion list */
16275 SV* result_string; /* 'final' stringified */
16276 AV* stack; /* stack of operators and operands not yet
16278 AV* fence_stack = NULL; /* A stack containing the positions in
16279 'stack' of where the undealt-with left
16280 parens would be if they were actually
16282 /* The 'volatile' is a workaround for an optimiser bug
16283 * in Solaris Studio 12.3. See RT #127455 */
16284 volatile IV fence = 0; /* Position of where most recent undealt-
16285 with left paren in stack is; -1 if none.
16287 STRLEN len; /* Temporary */
16288 regnode_offset node; /* Temporary, and final regnode returned by
16290 const bool save_fold = FOLD; /* Temporary */
16291 char *save_end, *save_parse; /* Temporaries */
16292 const bool in_locale = LOC; /* we turn off /l during processing */
16294 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16296 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16297 PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16299 DEBUG_PARSE("xcls");
16302 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16305 /* The use of this operator implies /u. This is required so that the
16306 * compile time values are valid in all runtime cases */
16307 REQUIRE_UNI_RULES(flagp, 0);
16309 ckWARNexperimental(RExC_parse,
16310 WARN_EXPERIMENTAL__REGEX_SETS,
16311 "The regex_sets feature is experimental");
16313 /* Everything in this construct is a metacharacter. Operands begin with
16314 * either a '\' (for an escape sequence), or a '[' for a bracketed
16315 * character class. Any other character should be an operator, or
16316 * parenthesis for grouping. Both types of operands are handled by calling
16317 * regclass() to parse them. It is called with a parameter to indicate to
16318 * return the computed inversion list. The parsing here is implemented via
16319 * a stack. Each entry on the stack is a single character representing one
16320 * of the operators; or else a pointer to an operand inversion list. */
16322 #define IS_OPERATOR(a) SvIOK(a)
16323 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16325 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16326 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16327 * with pronouncing it called it Reverse Polish instead, but now that YOU
16328 * know how to pronounce it you can use the correct term, thus giving due
16329 * credit to the person who invented it, and impressing your geek friends.
16330 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16331 * it is now more like an English initial W (as in wonk) than an L.)
16333 * This means that, for example, 'a | b & c' is stored on the stack as
16341 * where the numbers in brackets give the stack [array] element number.
16342 * In this implementation, parentheses are not stored on the stack.
16343 * Instead a '(' creates a "fence" so that the part of the stack below the
16344 * fence is invisible except to the corresponding ')' (this allows us to
16345 * replace testing for parens, by using instead subtraction of the fence
16346 * position). As new operands are processed they are pushed onto the stack
16347 * (except as noted in the next paragraph). New operators of higher
16348 * precedence than the current final one are inserted on the stack before
16349 * the lhs operand (so that when the rhs is pushed next, everything will be
16350 * in the correct positions shown above. When an operator of equal or
16351 * lower precedence is encountered in parsing, all the stacked operations
16352 * of equal or higher precedence are evaluated, leaving the result as the
16353 * top entry on the stack. This makes higher precedence operations
16354 * evaluate before lower precedence ones, and causes operations of equal
16355 * precedence to left associate.
16357 * The only unary operator '!' is immediately pushed onto the stack when
16358 * encountered. When an operand is encountered, if the top of the stack is
16359 * a '!", the complement is immediately performed, and the '!' popped. The
16360 * resulting value is treated as a new operand, and the logic in the
16361 * previous paragraph is executed. Thus in the expression
16363 * the stack looks like
16369 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16376 * A ')' is treated as an operator with lower precedence than all the
16377 * aforementioned ones, which causes all operations on the stack above the
16378 * corresponding '(' to be evaluated down to a single resultant operand.
16379 * Then the fence for the '(' is removed, and the operand goes through the
16380 * algorithm above, without the fence.
16382 * A separate stack is kept of the fence positions, so that the position of
16383 * the latest so-far unbalanced '(' is at the top of it.
16385 * The ']' ending the construct is treated as the lowest operator of all,
16386 * so that everything gets evaluated down to a single operand, which is the
16389 sv_2mortal((SV *)(stack = newAV()));
16390 sv_2mortal((SV *)(fence_stack = newAV()));
16392 while (RExC_parse < RExC_end) {
16393 I32 top_index; /* Index of top-most element in 'stack' */
16394 SV** top_ptr; /* Pointer to top 'stack' element */
16395 SV* current = NULL; /* To contain the current inversion list
16397 SV* only_to_avoid_leaks;
16399 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16400 TRUE /* Force /x */ );
16401 if (RExC_parse >= RExC_end) { /* Fail */
16405 curchar = UCHARAT(RExC_parse);
16409 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16410 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16411 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16412 stack, fence, fence_stack));
16415 top_index = av_tindex_skip_len_mg(stack);
16418 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16419 char stacked_operator; /* The topmost operator on the 'stack'. */
16420 SV* lhs; /* Operand to the left of the operator */
16421 SV* rhs; /* Operand to the right of the operator */
16422 SV* fence_ptr; /* Pointer to top element of the fence
16426 if ( RExC_parse < RExC_end - 2
16427 && UCHARAT(RExC_parse + 1) == '?'
16428 && UCHARAT(RExC_parse + 2) == '^')
16430 const regnode_offset orig_emit = RExC_emit;
16431 SV * resultant_invlist;
16433 /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16434 * This happens when we have some thing like
16436 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16438 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16440 * Here we would be handling the interpolated
16441 * '$thai_or_lao'. We handle this by a recursive call to
16442 * reg which returns the inversion list the
16443 * interpolated expression evaluates to. Actually, the
16444 * return is a special regnode containing a pointer to that
16445 * inversion list. If the return isn't that regnode alone,
16446 * we know that this wasn't such an interpolation, which is
16447 * an error: we need to get a single inversion list back
16448 * from the recursion */
16453 node = reg(pRExC_state, 2, flagp, depth+1);
16454 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16456 if ( OP(REGNODE_p(node)) != REGEX_SET
16457 /* If more than a single node returned, the nested
16458 * parens evaluated to more than just a (?[...]),
16459 * which isn't legal */
16461 vFAIL("Expecting interpolated extended charclass");
16463 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16464 current = invlist_clone(resultant_invlist, NULL);
16465 SvREFCNT_dec(resultant_invlist);
16468 RExC_emit = orig_emit;
16469 goto handle_operand;
16472 /* A regular '('. Look behind for illegal syntax */
16473 if (top_index - fence >= 0) {
16474 /* If the top entry on the stack is an operator, it had
16475 * better be a '!', otherwise the entry below the top
16476 * operand should be an operator */
16477 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16478 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16479 || ( IS_OPERAND(*top_ptr)
16480 && ( top_index - fence < 1
16481 || ! (stacked_ptr = av_fetch(stack,
16484 || ! IS_OPERATOR(*stacked_ptr))))
16487 vFAIL("Unexpected '(' with no preceding operator");
16491 /* Stack the position of this undealt-with left paren */
16492 av_push(fence_stack, newSViv(fence));
16493 fence = top_index + 1;
16497 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16498 * multi-char folds are allowed. */
16499 if (!regclass(pRExC_state, flagp, depth+1,
16500 TRUE, /* means parse just the next thing */
16501 FALSE, /* don't allow multi-char folds */
16502 FALSE, /* don't silence non-portable warnings. */
16504 FALSE, /* Require return to be an ANYOF */
16507 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16508 goto regclass_failed;
16511 /* regclass() will return with parsing just the \ sequence,
16512 * leaving the parse pointer at the next thing to parse */
16514 goto handle_operand;
16516 case '[': /* Is a bracketed character class */
16518 /* See if this is a [:posix:] class. */
16519 bool is_posix_class = (OOB_NAMEDCLASS
16520 < handle_possible_posix(pRExC_state,
16524 TRUE /* checking only */));
16525 /* If it is a posix class, leave the parse pointer at the '['
16526 * to fool regclass() into thinking it is part of a
16527 * '[[:posix:]]'. */
16528 if (! is_posix_class) {
16532 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16533 * multi-char folds are allowed. */
16534 if (!regclass(pRExC_state, flagp, depth+1,
16535 is_posix_class, /* parse the whole char
16536 class only if not a
16538 FALSE, /* don't allow multi-char folds */
16539 TRUE, /* silence non-portable warnings. */
16541 FALSE, /* Require return to be an ANYOF */
16544 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16545 goto regclass_failed;
16552 /* function call leaves parse pointing to the ']', except if we
16554 if (is_posix_class) {
16558 goto handle_operand;
16562 if (top_index >= 1) {
16563 goto join_operators;
16566 /* Only a single operand on the stack: are done */
16570 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16571 if (UCHARAT(RExC_parse - 1) == ']') {
16575 vFAIL("Unexpected ')'");
16578 /* If nothing after the fence, is missing an operand */
16579 if (top_index - fence < 0) {
16583 /* If at least two things on the stack, treat this as an
16585 if (top_index - fence >= 1) {
16586 goto join_operators;
16589 /* Here only a single thing on the fenced stack, and there is a
16590 * fence. Get rid of it */
16591 fence_ptr = av_pop(fence_stack);
16593 fence = SvIV(fence_ptr);
16594 SvREFCNT_dec_NN(fence_ptr);
16601 /* Having gotten rid of the fence, we pop the operand at the
16602 * stack top and process it as a newly encountered operand */
16603 current = av_pop(stack);
16604 if (IS_OPERAND(current)) {
16605 goto handle_operand;
16617 /* These binary operators should have a left operand already
16619 if ( top_index - fence < 0
16620 || top_index - fence == 1
16621 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16622 || ! IS_OPERAND(*top_ptr))
16624 goto unexpected_binary;
16627 /* If only the one operand is on the part of the stack visible
16628 * to us, we just place this operator in the proper position */
16629 if (top_index - fence < 2) {
16631 /* Place the operator before the operand */
16633 SV* lhs = av_pop(stack);
16634 av_push(stack, newSVuv(curchar));
16635 av_push(stack, lhs);
16639 /* But if there is something else on the stack, we need to
16640 * process it before this new operator if and only if the
16641 * stacked operation has equal or higher precedence than the
16646 /* The operator on the stack is supposed to be below both its
16648 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16649 || IS_OPERAND(*stacked_ptr))
16651 /* But if not, it's legal and indicates we are completely
16652 * done if and only if we're currently processing a ']',
16653 * which should be the final thing in the expression */
16654 if (curchar == ']') {
16660 vFAIL2("Unexpected binary operator '%c' with no "
16661 "preceding operand", curchar);
16663 stacked_operator = (char) SvUV(*stacked_ptr);
16665 if (regex_set_precedence(curchar)
16666 > regex_set_precedence(stacked_operator))
16668 /* Here, the new operator has higher precedence than the
16669 * stacked one. This means we need to add the new one to
16670 * the stack to await its rhs operand (and maybe more
16671 * stuff). We put it before the lhs operand, leaving
16672 * untouched the stacked operator and everything below it
16674 lhs = av_pop(stack);
16675 assert(IS_OPERAND(lhs));
16677 av_push(stack, newSVuv(curchar));
16678 av_push(stack, lhs);
16682 /* Here, the new operator has equal or lower precedence than
16683 * what's already there. This means the operation already
16684 * there should be performed now, before the new one. */
16686 rhs = av_pop(stack);
16687 if (! IS_OPERAND(rhs)) {
16689 /* This can happen when a ! is not followed by an operand,
16690 * like in /(?[\t &!])/ */
16694 lhs = av_pop(stack);
16696 if (! IS_OPERAND(lhs)) {
16698 /* This can happen when there is an empty (), like in
16699 * /(?[[0]+()+])/ */
16703 switch (stacked_operator) {
16705 _invlist_intersection(lhs, rhs, &rhs);
16710 _invlist_union(lhs, rhs, &rhs);
16714 _invlist_subtract(lhs, rhs, &rhs);
16717 case '^': /* The union minus the intersection */
16722 _invlist_union(lhs, rhs, &u);
16723 _invlist_intersection(lhs, rhs, &i);
16724 _invlist_subtract(u, i, &rhs);
16725 SvREFCNT_dec_NN(i);
16726 SvREFCNT_dec_NN(u);
16732 /* Here, the higher precedence operation has been done, and the
16733 * result is in 'rhs'. We overwrite the stacked operator with
16734 * the result. Then we redo this code to either push the new
16735 * operator onto the stack or perform any higher precedence
16736 * stacked operation */
16737 only_to_avoid_leaks = av_pop(stack);
16738 SvREFCNT_dec(only_to_avoid_leaks);
16739 av_push(stack, rhs);
16742 case '!': /* Highest priority, right associative */
16744 /* If what's already at the top of the stack is another '!",
16745 * they just cancel each other out */
16746 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16747 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16749 only_to_avoid_leaks = av_pop(stack);
16750 SvREFCNT_dec(only_to_avoid_leaks);
16752 else { /* Otherwise, since it's right associative, just push
16754 av_push(stack, newSVuv(curchar));
16759 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16760 if (RExC_parse >= RExC_end) {
16763 vFAIL("Unexpected character");
16767 /* Here 'current' is the operand. If something is already on the
16768 * stack, we have to check if it is a !. But first, the code above
16769 * may have altered the stack in the time since we earlier set
16772 top_index = av_tindex_skip_len_mg(stack);
16773 if (top_index - fence >= 0) {
16774 /* If the top entry on the stack is an operator, it had better
16775 * be a '!', otherwise the entry below the top operand should
16776 * be an operator */
16777 top_ptr = av_fetch(stack, top_index, FALSE);
16779 if (IS_OPERATOR(*top_ptr)) {
16781 /* The only permissible operator at the top of the stack is
16782 * '!', which is applied immediately to this operand. */
16783 curchar = (char) SvUV(*top_ptr);
16784 if (curchar != '!') {
16785 SvREFCNT_dec(current);
16786 vFAIL2("Unexpected binary operator '%c' with no "
16787 "preceding operand", curchar);
16790 _invlist_invert(current);
16792 only_to_avoid_leaks = av_pop(stack);
16793 SvREFCNT_dec(only_to_avoid_leaks);
16795 /* And we redo with the inverted operand. This allows
16796 * handling multiple ! in a row */
16797 goto handle_operand;
16799 /* Single operand is ok only for the non-binary ')'
16801 else if ((top_index - fence == 0 && curchar != ')')
16802 || (top_index - fence > 0
16803 && (! (stacked_ptr = av_fetch(stack,
16806 || IS_OPERAND(*stacked_ptr))))
16808 SvREFCNT_dec(current);
16809 vFAIL("Operand with no preceding operator");
16813 /* Here there was nothing on the stack or the top element was
16814 * another operand. Just add this new one */
16815 av_push(stack, current);
16817 } /* End of switch on next parse token */
16819 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16820 } /* End of loop parsing through the construct */
16822 vFAIL("Syntax error in (?[...])");
16826 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16827 if (RExC_parse < RExC_end) {
16831 vFAIL("Unexpected ']' with no following ')' in (?[...");
16834 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16835 vFAIL("Unmatched (");
16838 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16839 || ((final = av_pop(stack)) == NULL)
16840 || ! IS_OPERAND(final)
16841 || ! is_invlist(final)
16842 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16845 SvREFCNT_dec(final);
16846 vFAIL("Incomplete expression within '(?[ ])'");
16849 /* Here, 'final' is the resultant inversion list from evaluating the
16850 * expression. Return it if so requested */
16851 if (return_invlist) {
16852 *return_invlist = final;
16856 if (RExC_sets_depth) { /* If within a recursive call, return in a special
16859 node = regpnode(pRExC_state, REGEX_SET, final);
16863 /* Otherwise generate a resultant node, based on 'final'. regclass()
16864 * is expecting a string of ranges and individual code points */
16865 invlist_iterinit(final);
16866 result_string = newSVpvs("");
16867 while (invlist_iternext(final, &start, &end)) {
16868 if (start == end) {
16869 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16872 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16873 UVXf "}", start, end);
16877 /* About to generate an ANYOF (or similar) node from the inversion list
16878 * we have calculated */
16879 save_parse = RExC_parse;
16880 RExC_parse = SvPV(result_string, len);
16881 save_end = RExC_end;
16882 RExC_end = RExC_parse + len;
16883 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16885 /* We turn off folding around the call, as the class we have
16886 * constructed already has all folding taken into consideration, and we
16887 * don't want regclass() to add to that */
16888 RExC_flags &= ~RXf_PMf_FOLD;
16889 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16890 * folds are allowed. */
16891 node = regclass(pRExC_state, flagp, depth+1,
16892 FALSE, /* means parse the whole char class */
16893 FALSE, /* don't allow multi-char folds */
16894 TRUE, /* silence non-portable warnings. The above may
16895 very well have generated non-portable code
16896 points, but they're valid on this machine */
16897 FALSE, /* similarly, no need for strict */
16899 /* We can optimize into something besides an ANYOF,
16900 * except under /l, which needs to be ANYOF because of
16901 * runtime checks for locale sanity, etc */
16907 RExC_parse = save_parse + 1;
16908 RExC_end = save_end;
16909 SvREFCNT_dec_NN(final);
16910 SvREFCNT_dec_NN(result_string);
16913 RExC_flags |= RXf_PMf_FOLD;
16917 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16918 goto regclass_failed;
16921 /* Fix up the node type if we are in locale. (We have pretended we are
16922 * under /u for the purposes of regclass(), as this construct will only
16923 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
16924 * (so as to cause any warnings about bad locales to be output in
16925 * regexec.c), and add the flag that indicates to check if not in a
16926 * UTF-8 locale. The reason we above forbid optimization into
16927 * something other than an ANYOF node is simply to minimize the number
16928 * of code changes in regexec.c. Otherwise we would have to create new
16929 * EXACTish node types and deal with them. This decision could be
16930 * revisited should this construct become popular.
16932 * (One might think we could look at the resulting ANYOF node and
16933 * suppress the flag if everything is above 255, as those would be
16934 * UTF-8 only, but this isn't true, as the components that led to that
16935 * result could have been locale-affected, and just happen to cancel
16936 * each other out under UTF-8 locales.) */
16938 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16940 assert(OP(REGNODE_p(node)) == ANYOF);
16942 OP(REGNODE_p(node)) = ANYOFL;
16943 ANYOF_FLAGS(REGNODE_p(node))
16944 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16948 nextchar(pRExC_state);
16949 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16953 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16957 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16960 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16961 AV * stack, const IV fence, AV * fence_stack)
16962 { /* Dumps the stacks in handle_regex_sets() */
16964 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16965 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16968 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16970 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16972 if (stack_top < 0) {
16973 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16976 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16977 for (i = stack_top; i >= 0; i--) {
16978 SV ** element_ptr = av_fetch(stack, i, FALSE);
16979 if (! element_ptr) {
16982 if (IS_OPERATOR(*element_ptr)) {
16983 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16984 (int) i, (int) SvIV(*element_ptr));
16987 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16988 sv_dump(*element_ptr);
16993 if (fence_stack_top < 0) {
16994 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16997 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16998 for (i = fence_stack_top; i >= 0; i--) {
16999 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17000 if (! element_ptr) {
17003 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17004 (int) i, (int) SvIV(*element_ptr));
17015 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17017 /* This adds the Latin1/above-Latin1 folding rules.
17019 * This should be called only for a Latin1-range code points, cp, which is
17020 * known to be involved in a simple fold with other code points above
17021 * Latin1. It would give false results if /aa has been specified.
17022 * Multi-char folds are outside the scope of this, and must be handled
17025 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17027 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17029 /* The rules that are valid for all Unicode versions are hard-coded in */
17034 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17038 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17041 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17042 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17044 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17045 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17046 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17048 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17049 *invlist = add_cp_to_invlist(*invlist,
17050 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17053 default: /* Other code points are checked against the data for the
17054 current Unicode version */
17056 Size_t folds_count;
17058 const U32 * remaining_folds;
17062 folded_cp = toFOLD(cp);
17065 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17067 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17070 if (folded_cp > 255) {
17071 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17074 folds_count = _inverse_folds(folded_cp, &first_fold,
17076 if (folds_count == 0) {
17078 /* Use deprecated warning to increase the chances of this being
17080 ckWARN2reg_d(RExC_parse,
17081 "Perl folding rules are not up-to-date for 0x%02X;"
17082 " please use the perlbug utility to report;", cp);
17087 if (first_fold > 255) {
17088 *invlist = add_cp_to_invlist(*invlist, first_fold);
17090 for (i = 0; i < folds_count - 1; i++) {
17091 if (remaining_folds[i] > 255) {
17092 *invlist = add_cp_to_invlist(*invlist,
17093 remaining_folds[i]);
17103 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17105 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17109 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17111 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17113 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17114 CLEAR_POSIX_WARNINGS();
17118 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17119 if (first_is_fatal) { /* Avoid leaking this */
17120 av_undef(posix_warnings); /* This isn't necessary if the
17121 array is mortal, but is a
17123 (void) sv_2mortal(msg);
17126 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17127 SvREFCNT_dec_NN(msg);
17130 UPDATE_WARNINGS_LOC(RExC_parse);
17133 PERL_STATIC_INLINE Size_t
17134 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17136 const U8 * const start = s1;
17137 const U8 * const send = start + max;
17139 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17141 while (s1 < send && *s1 == *s2) {
17150 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17152 /* This adds the string scalar <multi_string> to the array
17153 * <multi_char_matches>. <multi_string> is known to have exactly
17154 * <cp_count> code points in it. This is used when constructing a
17155 * bracketed character class and we find something that needs to match more
17156 * than a single character.
17158 * <multi_char_matches> is actually an array of arrays. Each top-level
17159 * element is an array that contains all the strings known so far that are
17160 * the same length. And that length (in number of code points) is the same
17161 * as the index of the top-level array. Hence, the [2] element is an
17162 * array, each element thereof is a string containing TWO code points;
17163 * while element [3] is for strings of THREE characters, and so on. Since
17164 * this is for multi-char strings there can never be a [0] nor [1] element.
17166 * When we rewrite the character class below, we will do so such that the
17167 * longest strings are written first, so that it prefers the longest
17168 * matching strings first. This is done even if it turns out that any
17169 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17170 * Christiansen has agreed that this is ok. This makes the test for the
17171 * ligature 'ffi' come before the test for 'ff', for example */
17174 AV** this_array_ptr;
17176 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17178 if (! multi_char_matches) {
17179 multi_char_matches = newAV();
17182 if (av_exists(multi_char_matches, cp_count)) {
17183 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17184 this_array = *this_array_ptr;
17187 this_array = newAV();
17188 av_store(multi_char_matches, cp_count,
17191 av_push(this_array, multi_string);
17193 return multi_char_matches;
17196 /* The names of properties whose definitions are not known at compile time are
17197 * stored in this SV, after a constant heading. So if the length has been
17198 * changed since initialization, then there is a run-time definition. */
17199 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17200 (SvCUR(listsv) != initial_listsv_len)
17202 /* There is a restricted set of white space characters that are legal when
17203 * ignoring white space in a bracketed character class. This generates the
17204 * code to skip them.
17206 * There is a line below that uses the same white space criteria but is outside
17207 * this macro. Both here and there must use the same definition */
17208 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
17211 while (isBLANK_A(UCHARAT(p))) \
17218 STATIC regnode_offset
17219 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17220 const bool stop_at_1, /* Just parse the next thing, don't
17221 look for a full character class */
17222 bool allow_mutiple_chars,
17223 const bool silence_non_portable, /* Don't output warnings
17227 bool optimizable, /* ? Allow a non-ANYOF return
17229 SV** ret_invlist /* Return an inversion list, not a node */
17232 /* parse a bracketed class specification. Most of these will produce an
17233 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17234 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17235 * under /i with multi-character folds: it will be rewritten following the
17236 * paradigm of this example, where the <multi-fold>s are characters which
17237 * fold to multiple character sequences:
17238 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17239 * gets effectively rewritten as:
17240 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17241 * reg() gets called (recursively) on the rewritten version, and this
17242 * function will return what it constructs. (Actually the <multi-fold>s
17243 * aren't physically removed from the [abcdefghi], it's just that they are
17244 * ignored in the recursion by means of a flag:
17245 * <RExC_in_multi_char_class>.)
17247 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17248 * characters, with the corresponding bit set if that character is in the
17249 * list. For characters above this, an inversion list is used. There
17250 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17251 * determinable at compile time
17253 * On success, returns the offset at which any next node should be placed
17254 * into the regex engine program being compiled.
17256 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17257 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17262 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17264 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17265 regnode_offset ret = -1; /* Initialized to an illegal value */
17267 int namedclass = OOB_NAMEDCLASS;
17268 char *rangebegin = NULL;
17269 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17270 aren't available at the time this was called */
17271 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17272 than just initialized. */
17273 SV* properties = NULL; /* Code points that match \p{} \P{} */
17274 SV* posixes = NULL; /* Code points that match classes like [:word:],
17275 extended beyond the Latin1 range. These have to
17276 be kept separate from other code points for much
17277 of this function because their handling is
17278 different under /i, and for most classes under
17280 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17281 separate for a while from the non-complemented
17282 versions because of complications with /d
17284 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17285 treated more simply than the general case,
17286 leading to less compilation and execution
17288 UV element_count = 0; /* Number of distinct elements in the class.
17289 Optimizations may be possible if this is tiny */
17290 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17291 character; used under /i */
17293 char * stop_ptr = RExC_end; /* where to stop parsing */
17295 /* ignore unescaped whitespace? */
17296 const bool skip_white = cBOOL( ret_invlist
17297 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17299 /* inversion list of code points this node matches only when the target
17300 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17302 SV* upper_latin1_only_utf8_matches = NULL;
17304 /* Inversion list of code points this node matches regardless of things
17305 * like locale, folding, utf8ness of the target string */
17306 SV* cp_list = NULL;
17308 /* Like cp_list, but code points on this list need to be checked for things
17309 * that fold to/from them under /i */
17310 SV* cp_foldable_list = NULL;
17312 /* Like cp_list, but code points on this list are valid only when the
17313 * runtime locale is UTF-8 */
17314 SV* only_utf8_locale_list = NULL;
17316 /* In a range, if one of the endpoints is non-character-set portable,
17317 * meaning that it hard-codes a code point that may mean a different
17318 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17319 * mnemonic '\t' which each mean the same character no matter which
17320 * character set the platform is on. */
17321 unsigned int non_portable_endpoint = 0;
17323 /* Is the range unicode? which means on a platform that isn't 1-1 native
17324 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17325 * to be a Unicode value. */
17326 bool unicode_range = FALSE;
17327 bool invert = FALSE; /* Is this class to be complemented */
17329 bool warn_super = ALWAYS_WARN_SUPER;
17331 const char * orig_parse = RExC_parse;
17333 /* This variable is used to mark where the end in the input is of something
17334 * that looks like a POSIX construct but isn't. During the parse, when
17335 * something looks like it could be such a construct is encountered, it is
17336 * checked for being one, but not if we've already checked this area of the
17337 * input. Only after this position is reached do we check again */
17338 char *not_posix_region_end = RExC_parse - 1;
17340 AV* posix_warnings = NULL;
17341 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17342 U8 op = END; /* The returned node-type, initialized to an impossible
17344 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17345 U32 posixl = 0; /* bit field of posix classes matched under /l */
17348 /* Flags as to what things aren't knowable until runtime. (Note that these are
17349 * mutually exclusive.) */
17350 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17351 haven't been defined as of yet */
17352 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17354 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17355 what gets folded */
17356 U32 has_runtime_dependency = 0; /* OR of the above flags */
17358 DECLARE_AND_GET_RE_DEBUG_FLAGS;
17360 PERL_ARGS_ASSERT_REGCLASS;
17362 PERL_UNUSED_ARG(depth);
17366 /* If wants an inversion list returned, we can't optimize to something
17369 optimizable = FALSE;
17372 DEBUG_PARSE("clas");
17374 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17375 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17376 && UNICODE_DOT_DOT_VERSION == 0)
17377 allow_mutiple_chars = FALSE;
17380 /* We include the /i status at the beginning of this so that we can
17381 * know it at runtime */
17382 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17383 initial_listsv_len = SvCUR(listsv);
17384 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17386 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17388 assert(RExC_parse <= RExC_end);
17390 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17393 allow_mutiple_chars = FALSE;
17395 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17398 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17399 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17400 int maybe_class = handle_possible_posix(pRExC_state,
17402 ¬_posix_region_end,
17404 TRUE /* checking only */);
17405 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17406 ckWARN4reg(not_posix_region_end,
17407 "POSIX syntax [%c %c] belongs inside character classes%s",
17408 *RExC_parse, *RExC_parse,
17409 (maybe_class == OOB_NAMEDCLASS)
17410 ? ((POSIXCC_NOTYET(*RExC_parse))
17411 ? " (but this one isn't implemented)"
17412 : " (but this one isn't fully valid)")
17418 /* If the caller wants us to just parse a single element, accomplish this
17419 * by faking the loop ending condition */
17420 if (stop_at_1 && RExC_end > RExC_parse) {
17421 stop_ptr = RExC_parse + 1;
17424 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17425 if (UCHARAT(RExC_parse) == ']')
17426 goto charclassloop;
17430 if ( posix_warnings
17431 && av_tindex_skip_len_mg(posix_warnings) >= 0
17432 && RExC_parse > not_posix_region_end)
17434 /* Warnings about posix class issues are considered tentative until
17435 * we are far enough along in the parse that we can no longer
17436 * change our mind, at which point we output them. This is done
17437 * each time through the loop so that a later class won't zap them
17438 * before they have been dealt with. */
17439 output_posix_warnings(pRExC_state, posix_warnings);
17442 if (RExC_parse >= stop_ptr) {
17446 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17448 if (UCHARAT(RExC_parse) == ']') {
17454 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17455 save_value = value;
17456 save_prevvalue = prevvalue;
17459 rangebegin = RExC_parse;
17461 non_portable_endpoint = 0;
17463 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17464 value = utf8n_to_uvchr((U8*)RExC_parse,
17465 RExC_end - RExC_parse,
17466 &numlen, UTF8_ALLOW_DEFAULT);
17467 RExC_parse += numlen;
17470 value = UCHARAT(RExC_parse++);
17472 if (value == '[') {
17473 char * posix_class_end;
17474 namedclass = handle_possible_posix(pRExC_state,
17477 do_posix_warnings ? &posix_warnings : NULL,
17478 FALSE /* die if error */);
17479 if (namedclass > OOB_NAMEDCLASS) {
17481 /* If there was an earlier attempt to parse this particular
17482 * posix class, and it failed, it was a false alarm, as this
17483 * successful one proves */
17484 if ( posix_warnings
17485 && av_tindex_skip_len_mg(posix_warnings) >= 0
17486 && not_posix_region_end >= RExC_parse
17487 && not_posix_region_end <= posix_class_end)
17489 av_undef(posix_warnings);
17492 RExC_parse = posix_class_end;
17494 else if (namedclass == OOB_NAMEDCLASS) {
17495 not_posix_region_end = posix_class_end;
17498 namedclass = OOB_NAMEDCLASS;
17501 else if ( RExC_parse - 1 > not_posix_region_end
17502 && MAYBE_POSIXCC(value))
17504 (void) handle_possible_posix(
17506 RExC_parse - 1, /* -1 because parse has already been
17508 ¬_posix_region_end,
17509 do_posix_warnings ? &posix_warnings : NULL,
17510 TRUE /* checking only */);
17512 else if ( strict && ! skip_white
17513 && ( _generic_isCC(value, _CC_VERTSPACE)
17514 || is_VERTWS_cp_high(value)))
17516 vFAIL("Literal vertical space in [] is illegal except under /x");
17518 else if (value == '\\') {
17519 /* Is a backslash; get the code point of the char after it */
17521 if (RExC_parse >= RExC_end) {
17522 vFAIL("Unmatched [");
17525 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17526 value = utf8n_to_uvchr((U8*)RExC_parse,
17527 RExC_end - RExC_parse,
17528 &numlen, UTF8_ALLOW_DEFAULT);
17529 RExC_parse += numlen;
17532 value = UCHARAT(RExC_parse++);
17534 /* Some compilers cannot handle switching on 64-bit integer
17535 * values, therefore value cannot be an UV. Yes, this will
17536 * be a problem later if we want switch on Unicode.
17537 * A similar issue a little bit later when switching on
17538 * namedclass. --jhi */
17540 /* If the \ is escaping white space when white space is being
17541 * skipped, it means that that white space is wanted literally, and
17542 * is already in 'value'. Otherwise, need to translate the escape
17543 * into what it signifies. */
17544 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17545 const char * message;
17549 case 'w': namedclass = ANYOF_WORDCHAR; break;
17550 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17551 case 's': namedclass = ANYOF_SPACE; break;
17552 case 'S': namedclass = ANYOF_NSPACE; break;
17553 case 'd': namedclass = ANYOF_DIGIT; break;
17554 case 'D': namedclass = ANYOF_NDIGIT; break;
17555 case 'v': namedclass = ANYOF_VERTWS; break;
17556 case 'V': namedclass = ANYOF_NVERTWS; break;
17557 case 'h': namedclass = ANYOF_HORIZWS; break;
17558 case 'H': namedclass = ANYOF_NHORIZWS; break;
17559 case 'N': /* Handle \N{NAME} in class */
17561 const char * const backslash_N_beg = RExC_parse - 2;
17564 if (! grok_bslash_N(pRExC_state,
17565 NULL, /* No regnode */
17566 &value, /* Yes single value */
17567 &cp_count, /* Multiple code pt count */
17573 if (*flagp & NEED_UTF8)
17574 FAIL("panic: grok_bslash_N set NEED_UTF8");
17576 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17578 if (cp_count < 0) {
17579 vFAIL("\\N in a character class must be a named character: \\N{...}");
17581 else if (cp_count == 0) {
17582 ckWARNreg(RExC_parse,
17583 "Ignoring zero length \\N{} in character class");
17585 else { /* cp_count > 1 */
17586 assert(cp_count > 1);
17587 if (! RExC_in_multi_char_class) {
17588 if ( ! allow_mutiple_chars
17591 || *RExC_parse == '-')
17595 vFAIL("\\N{} here is restricted to one character");
17597 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17598 break; /* <value> contains the first code
17599 point. Drop out of the switch to
17603 SV * multi_char_N = newSVpvn(backslash_N_beg,
17604 RExC_parse - backslash_N_beg);
17606 = add_multi_match(multi_char_matches,
17611 } /* End of cp_count != 1 */
17613 /* This element should not be processed further in this
17616 value = save_value;
17617 prevvalue = save_prevvalue;
17618 continue; /* Back to top of loop to get next char */
17621 /* Here, is a single code point, and <value> contains it */
17622 unicode_range = TRUE; /* \N{} are Unicode */
17630 if (RExC_pm_flags & PMf_WILDCARD) {
17632 /* diag_listed_as: Use of %s is not allowed in Unicode
17633 property wildcard subpatterns in regex; marked by <--
17635 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17636 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17639 /* \p means they want Unicode semantics */
17640 REQUIRE_UNI_RULES(flagp, 0);
17642 if (RExC_parse >= RExC_end)
17643 vFAIL2("Empty \\%c", (U8)value);
17644 if (*RExC_parse == '{') {
17645 const U8 c = (U8)value;
17646 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17649 vFAIL2("Missing right brace on \\%c{}", c);
17654 /* White space is allowed adjacent to the braces and after
17655 * any '^', even when not under /x */
17656 while (isSPACE(*RExC_parse)) {
17660 if (UCHARAT(RExC_parse) == '^') {
17662 /* toggle. (The rhs xor gets the single bit that
17663 * differs between P and p; the other xor inverts just
17665 value ^= 'P' ^ 'p';
17668 while (isSPACE(*RExC_parse)) {
17673 if (e == RExC_parse)
17674 vFAIL2("Empty \\%c{}", c);
17676 n = e - RExC_parse;
17677 while (isSPACE(*(RExC_parse + n - 1)))
17680 } /* The \p isn't immediately followed by a '{' */
17681 else if (! isALPHA(*RExC_parse)) {
17682 RExC_parse += (UTF)
17683 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17685 vFAIL2("Character following \\%c must be '{' or a "
17686 "single-character Unicode property name",
17694 char* name = RExC_parse;
17696 /* Any message returned about expanding the definition */
17697 SV* msg = newSVpvs_flags("", SVs_TEMP);
17699 /* If set TRUE, the property is user-defined as opposed to
17700 * official Unicode */
17701 bool user_defined = FALSE;
17702 AV * strings = NULL;
17704 SV * prop_definition = parse_uniprop_string(
17705 name, n, UTF, FOLD,
17706 FALSE, /* This is compile-time */
17708 /* We can't defer this defn when
17709 * the full result is required in
17711 ! cBOOL(ret_invlist),
17718 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17719 assert(prop_definition == NULL);
17720 RExC_parse = e + 1;
17721 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17722 thing so, or else the display is
17726 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17727 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17728 SvCUR(msg), SvPVX(msg)));
17731 assert(prop_definition || strings);
17734 if (! RExC_in_multi_char_class) {
17735 if (invert ^ (value == 'P')) {
17736 RExC_parse = e + 1;
17737 vFAIL("Inverting a character class which contains"
17738 " a multi-character sequence is illegal");
17741 /* For each multi-character string ... */
17742 while (av_tindex(strings) >= 0) {
17743 /* ... Each entry is itself an array of code
17745 AV * this_string = (AV *) av_shift( strings);
17746 STRLEN cp_count = av_tindex(this_string) + 1;
17747 SV * final = newSV(cp_count * 4);
17750 /* Create another string of sequences of \x{...} */
17751 while (av_tindex(this_string) >= 0) {
17752 SV * character = av_shift(this_string);
17753 UV cp = SvUV(character);
17756 REQUIRE_UTF8(flagp);
17758 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17760 SvREFCNT_dec_NN(character);
17762 SvREFCNT_dec_NN(this_string);
17764 /* And add that to the list of such things */
17766 = add_multi_match(multi_char_matches,
17771 SvREFCNT_dec_NN(strings);
17774 if (! prop_definition) { /* If we got only a string,
17775 this iteration didn't really
17776 find a character */
17779 else if (! is_invlist(prop_definition)) {
17781 /* Here, the definition isn't known, so we have gotten
17782 * returned a string that will be evaluated if and when
17783 * encountered at runtime. We add it to the list of
17784 * such properties, along with whether it should be
17785 * complemented or not */
17786 if (value == 'P') {
17787 sv_catpvs(listsv, "!");
17790 sv_catpvs(listsv, "+");
17792 sv_catsv(listsv, prop_definition);
17794 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17796 /* We don't know yet what this matches, so have to flag
17798 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17801 assert (prop_definition && is_invlist(prop_definition));
17803 /* Here we do have the complete property definition
17805 * Temporary workaround for [perl #133136]. For this
17806 * precise input that is in the .t that is failing,
17807 * load utf8.pm, which is what the test wants, so that
17808 * that .t passes */
17809 if ( memEQs(RExC_start, e + 1 - RExC_start,
17811 && ! hv_common(GvHVn(PL_incgv),
17813 "utf8.pm", sizeof("utf8.pm") - 1,
17814 0, HV_FETCH_ISEXISTS, NULL, 0))
17816 require_pv("utf8.pm");
17819 if (! user_defined &&
17820 /* We warn on matching an above-Unicode code point
17821 * if the match would return true, except don't
17822 * warn for \p{All}, which has exactly one element
17824 (_invlist_contains_cp(prop_definition, 0x110000)
17825 && (! (_invlist_len(prop_definition) == 1
17826 && *invlist_array(prop_definition) == 0))))
17831 /* Invert if asking for the complement */
17832 if (value == 'P') {
17833 _invlist_union_complement_2nd(properties,
17838 _invlist_union(properties, prop_definition, &properties);
17843 RExC_parse = e + 1;
17844 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17848 case 'n': value = '\n'; break;
17849 case 'r': value = '\r'; break;
17850 case 't': value = '\t'; break;
17851 case 'f': value = '\f'; break;
17852 case 'b': value = '\b'; break;
17853 case 'e': value = ESC_NATIVE; break;
17854 case 'a': value = '\a'; break;
17856 RExC_parse--; /* function expects to be pointed at the 'o' */
17857 if (! grok_bslash_o(&RExC_parse,
17863 cBOOL(range), /* MAX_UV allowed for range
17869 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17870 warn_non_literal_string(RExC_parse, packed_warn, message);
17874 non_portable_endpoint++;
17878 RExC_parse--; /* function expects to be pointed at the 'x' */
17879 if (! grok_bslash_x(&RExC_parse,
17885 cBOOL(range), /* MAX_UV allowed for range
17891 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17892 warn_non_literal_string(RExC_parse, packed_warn, message);
17896 non_portable_endpoint++;
17900 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17903 /* going to die anyway; point to exact spot of
17905 RExC_parse += (UTF)
17906 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17911 value = grok_c_char;
17913 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17914 warn_non_literal_string(RExC_parse, packed_warn, message);
17917 non_portable_endpoint++;
17919 case '0': case '1': case '2': case '3': case '4':
17920 case '5': case '6': case '7':
17922 /* Take 1-3 octal digits */
17923 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17924 | PERL_SCAN_NOTIFY_ILLDIGIT;
17925 numlen = (strict) ? 4 : 3;
17926 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17927 RExC_parse += numlen;
17930 RExC_parse += (UTF)
17931 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17933 vFAIL("Need exactly 3 octal digits");
17935 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17936 && RExC_parse < RExC_end
17937 && isDIGIT(*RExC_parse)
17938 && ckWARN(WARN_REGEXP))
17940 reg_warn_non_literal_string(
17942 form_alien_digit_msg(8, numlen, RExC_parse,
17943 RExC_end, UTF, FALSE));
17947 non_portable_endpoint++;
17952 /* Allow \_ to not give an error */
17953 if (isWORDCHAR(value) && value != '_') {
17955 vFAIL2("Unrecognized escape \\%c in character class",
17959 ckWARN2reg(RExC_parse,
17960 "Unrecognized escape \\%c in character class passed through",
17965 } /* End of switch on char following backslash */
17966 } /* end of handling backslash escape sequences */
17968 /* Here, we have the current token in 'value' */
17970 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17973 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17974 * literal, as is the character that began the false range, i.e.
17975 * the 'a' in the examples */
17977 const int w = (RExC_parse >= rangebegin)
17978 ? RExC_parse - rangebegin
17982 "False [] range \"%" UTF8f "\"",
17983 UTF8fARG(UTF, w, rangebegin));
17986 ckWARN2reg(RExC_parse,
17987 "False [] range \"%" UTF8f "\"",
17988 UTF8fARG(UTF, w, rangebegin));
17989 cp_list = add_cp_to_invlist(cp_list, '-');
17990 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17994 range = 0; /* this was not a true range */
17995 element_count += 2; /* So counts for three values */
17998 classnum = namedclass_to_classnum(namedclass);
18000 if (LOC && namedclass < ANYOF_POSIXL_MAX
18001 #ifndef HAS_ISASCII
18002 && classnum != _CC_ASCII
18005 SV* scratch_list = NULL;
18007 /* What the Posix classes (like \w, [:space:]) match isn't
18008 * generally knowable under locale until actual match time. A
18009 * special node is used for these which has extra space for a
18010 * bitmap, with a bit reserved for each named class that is to
18011 * be matched against. (This isn't needed for \p{} and
18012 * pseudo-classes, as they are not affected by locale, and
18013 * hence are dealt with separately.) However, if a named class
18014 * and its complement are both present, then it matches
18015 * everything, and there is no runtime dependency. Odd numbers
18016 * are the complements of the next lower number, so xor works.
18017 * (Note that something like [\w\D] should match everything,
18018 * because \d should be a proper subset of \w. But rather than
18019 * trust that the locale is well behaved, we leave this to
18020 * runtime to sort out) */
18021 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18022 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18023 POSIXL_ZERO(posixl);
18024 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18025 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18026 continue; /* We could ignore the rest of the class, but
18027 best to parse it for any errors */
18029 else { /* Here, isn't the complement of any already parsed
18031 POSIXL_SET(posixl, namedclass);
18032 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18033 anyof_flags |= ANYOF_MATCHES_POSIXL;
18035 /* The above-Latin1 characters are not subject to locale
18036 * rules. Just add them to the unconditionally-matched
18039 /* Get the list of the above-Latin1 code points this
18041 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18042 PL_XPosix_ptrs[classnum],
18044 /* Odd numbers are complements,
18045 * like NDIGIT, NASCII, ... */
18046 namedclass % 2 != 0,
18048 /* Checking if 'cp_list' is NULL first saves an extra
18049 * clone. Its reference count will be decremented at the
18050 * next union, etc, or if this is the only instance, at the
18051 * end of the routine */
18053 cp_list = scratch_list;
18056 _invlist_union(cp_list, scratch_list, &cp_list);
18057 SvREFCNT_dec_NN(scratch_list);
18059 continue; /* Go get next character */
18064 /* Here, is not /l, or is a POSIX class for which /l doesn't
18065 * matter (or is a Unicode property, which is skipped here). */
18066 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18067 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18069 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18070 * nor /l make a difference in what these match,
18071 * therefore we just add what they match to cp_list. */
18072 if (classnum != _CC_VERTSPACE) {
18073 assert( namedclass == ANYOF_HORIZWS
18074 || namedclass == ANYOF_NHORIZWS);
18076 /* It turns out that \h is just a synonym for
18078 classnum = _CC_BLANK;
18081 _invlist_union_maybe_complement_2nd(
18083 PL_XPosix_ptrs[classnum],
18084 namedclass % 2 != 0, /* Complement if odd
18085 (NHORIZWS, NVERTWS)
18090 else if ( AT_LEAST_UNI_SEMANTICS
18091 || classnum == _CC_ASCII
18092 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
18093 || classnum == _CC_XDIGIT)))
18095 /* We usually have to worry about /d affecting what POSIX
18096 * classes match, with special code needed because we won't
18097 * know until runtime what all matches. But there is no
18098 * extra work needed under /u and /a; and [:ascii:] is
18099 * unaffected by /d; and :digit: and :xdigit: don't have
18100 * runtime differences under /d. So we can special case
18101 * these, and avoid some extra work below, and at runtime.
18103 _invlist_union_maybe_complement_2nd(
18105 ((AT_LEAST_ASCII_RESTRICTED)
18106 ? PL_Posix_ptrs[classnum]
18107 : PL_XPosix_ptrs[classnum]),
18108 namedclass % 2 != 0,
18111 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18112 complement and use nposixes */
18113 SV** posixes_ptr = namedclass % 2 == 0
18116 _invlist_union_maybe_complement_2nd(
18118 PL_XPosix_ptrs[classnum],
18119 namedclass % 2 != 0,
18123 } /* end of namedclass \blah */
18125 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
18127 /* If 'range' is set, 'value' is the ending of a range--check its
18128 * validity. (If value isn't a single code point in the case of a
18129 * range, we should have figured that out above in the code that
18130 * catches false ranges). Later, we will handle each individual code
18131 * point in the range. If 'range' isn't set, this could be the
18132 * beginning of a range, so check for that by looking ahead to see if
18133 * the next real character to be processed is the range indicator--the
18138 /* For unicode ranges, we have to test that the Unicode as opposed
18139 * to the native values are not decreasing. (Above 255, there is
18140 * no difference between native and Unicode) */
18141 if (unicode_range && prevvalue < 255 && value < 255) {
18142 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18143 goto backwards_range;
18148 if (prevvalue > value) /* b-a */ {
18153 w = RExC_parse - rangebegin;
18155 "Invalid [] range \"%" UTF8f "\"",
18156 UTF8fARG(UTF, w, rangebegin));
18157 NOT_REACHED; /* NOTREACHED */
18161 prevvalue = value; /* save the beginning of the potential range */
18162 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18163 && *RExC_parse == '-')
18165 char* next_char_ptr = RExC_parse + 1;
18167 /* Get the next real char after the '-' */
18168 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
18170 /* If the '-' is at the end of the class (just before the ']',
18171 * it is a literal minus; otherwise it is a range */
18172 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18173 RExC_parse = next_char_ptr;
18175 /* a bad range like \w-, [:word:]- ? */
18176 if (namedclass > OOB_NAMEDCLASS) {
18177 if (strict || ckWARN(WARN_REGEXP)) {
18178 const int w = RExC_parse >= rangebegin
18179 ? RExC_parse - rangebegin
18182 vFAIL4("False [] range \"%*.*s\"",
18187 "False [] range \"%*.*s\"",
18191 cp_list = add_cp_to_invlist(cp_list, '-');
18194 range = 1; /* yeah, it's a range! */
18195 continue; /* but do it the next time */
18200 if (namedclass > OOB_NAMEDCLASS) {
18204 /* Here, we have a single value this time through the loop, and
18205 * <prevvalue> is the beginning of the range, if any; or <value> if
18208 /* non-Latin1 code point implies unicode semantics. */
18210 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18211 || prevvalue > MAX_LEGAL_CP))
18213 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18215 REQUIRE_UNI_RULES(flagp, 0);
18216 if ( ! silence_non_portable
18217 && UNICODE_IS_PERL_EXTENDED(value)
18218 && TO_OUTPUT_WARNINGS(RExC_parse))
18220 ckWARN2_non_literal_string(RExC_parse,
18221 packWARN(WARN_PORTABLE),
18222 PL_extended_cp_format,
18227 /* Ready to process either the single value, or the completed range.
18228 * For single-valued non-inverted ranges, we consider the possibility
18229 * of multi-char folds. (We made a conscious decision to not do this
18230 * for the other cases because it can often lead to non-intuitive
18231 * results. For example, you have the peculiar case that:
18232 * "s s" =~ /^[^\xDF]+$/i => Y
18233 * "ss" =~ /^[^\xDF]+$/i => N
18235 * See [perl #89750] */
18236 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18237 if ( value == LATIN_SMALL_LETTER_SHARP_S
18238 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18241 /* Here <value> is indeed a multi-char fold. Get what it is */
18243 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18246 UV folded = _to_uni_fold_flags(
18250 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18251 ? FOLD_FLAGS_NOMIX_ASCII
18255 /* Here, <folded> should be the first character of the
18256 * multi-char fold of <value>, with <foldbuf> containing the
18257 * whole thing. But, if this fold is not allowed (because of
18258 * the flags), <fold> will be the same as <value>, and should
18259 * be processed like any other character, so skip the special
18261 if (folded != value) {
18263 /* Skip if we are recursed, currently parsing the class
18264 * again. Otherwise add this character to the list of
18265 * multi-char folds. */
18266 if (! RExC_in_multi_char_class) {
18267 STRLEN cp_count = utf8_length(foldbuf,
18268 foldbuf + foldlen);
18269 SV* multi_fold = sv_2mortal(newSVpvs(""));
18271 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18274 = add_multi_match(multi_char_matches,
18280 /* This element should not be processed further in this
18283 value = save_value;
18284 prevvalue = save_prevvalue;
18290 if (strict && ckWARN(WARN_REGEXP)) {
18293 /* If the range starts above 255, everything is portable and
18294 * likely to be so for any forseeable character set, so don't
18296 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18297 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18299 else if (prevvalue != value) {
18301 /* Under strict, ranges that stop and/or end in an ASCII
18302 * printable should have each end point be a portable value
18303 * for it (preferably like 'A', but we don't warn if it is
18304 * a (portable) Unicode name or code point), and the range
18305 * must be be all digits or all letters of the same case.
18306 * Otherwise, the range is non-portable and unclear as to
18307 * what it contains */
18308 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18309 && ( non_portable_endpoint
18310 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18311 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18312 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18314 vWARN(RExC_parse, "Ranges of ASCII printables should"
18315 " be some subset of \"0-9\","
18316 " \"A-Z\", or \"a-z\"");
18318 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18319 SSize_t index_start;
18320 SSize_t index_final;
18322 /* But the nature of Unicode and languages mean we
18323 * can't do the same checks for above-ASCII ranges,
18324 * except in the case of digit ones. These should
18325 * contain only digits from the same group of 10. The
18326 * ASCII case is handled just above. Hence here, the
18327 * range could be a range of digits. First some
18328 * unlikely special cases. Grandfather in that a range
18329 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18330 * if its starting value is one of the 10 digits prior
18331 * to it. This is because it is an alternate way of
18332 * writing 19D1, and some people may expect it to be in
18333 * that group. But it is bad, because it won't give
18334 * the expected results. In Unicode 5.2 it was
18335 * considered to be in that group (of 11, hence), but
18336 * this was fixed in the next version */
18338 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18339 goto warn_bad_digit_range;
18341 else if (UNLIKELY( prevvalue >= 0x1D7CE
18342 && value <= 0x1D7FF))
18344 /* This is the only other case currently in Unicode
18345 * where the algorithm below fails. The code
18346 * points just above are the end points of a single
18347 * range containing only decimal digits. It is 5
18348 * different series of 0-9. All other ranges of
18349 * digits currently in Unicode are just a single
18350 * series. (And mktables will notify us if a later
18351 * Unicode version breaks this.)
18353 * If the range being checked is at most 9 long,
18354 * and the digit values represented are in
18355 * numerical order, they are from the same series.
18357 if ( value - prevvalue > 9
18358 || ((( value - 0x1D7CE) % 10)
18359 <= (prevvalue - 0x1D7CE) % 10))
18361 goto warn_bad_digit_range;
18366 /* For all other ranges of digits in Unicode, the
18367 * algorithm is just to check if both end points
18368 * are in the same series, which is the same range.
18370 index_start = _invlist_search(
18371 PL_XPosix_ptrs[_CC_DIGIT],
18374 /* Warn if the range starts and ends with a digit,
18375 * and they are not in the same group of 10. */
18376 if ( index_start >= 0
18377 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18379 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18380 value)) != index_start
18381 && index_final >= 0
18382 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18384 warn_bad_digit_range:
18385 vWARN(RExC_parse, "Ranges of digits should be"
18386 " from the same group of"
18393 if ((! range || prevvalue == value) && non_portable_endpoint) {
18394 if (isPRINT_A(value)) {
18397 if (isBACKSLASHED_PUNCT(value)) {
18398 literal[d++] = '\\';
18400 literal[d++] = (char) value;
18401 literal[d++] = '\0';
18404 "\"%.*s\" is more clearly written simply as \"%s\"",
18405 (int) (RExC_parse - rangebegin),
18410 else if (isMNEMONIC_CNTRL(value)) {
18412 "\"%.*s\" is more clearly written simply as \"%s\"",
18413 (int) (RExC_parse - rangebegin),
18415 cntrl_to_mnemonic((U8) value)
18421 /* Deal with this element of the class */
18424 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18427 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18428 * that don't require special handling, we can just add the range like
18429 * we do for ASCII platforms */
18430 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18431 || ! (prevvalue < 256
18433 || (! non_portable_endpoint
18434 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18435 || (isUPPER_A(prevvalue)
18436 && isUPPER_A(value)))))))
18438 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18442 /* Here, requires special handling. This can be because it is a
18443 * range whose code points are considered to be Unicode, and so
18444 * must be individually translated into native, or because its a
18445 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18446 * EBCDIC, but we have defined them to include only the "expected"
18447 * upper or lower case ASCII alphabetics. Subranges above 255 are
18448 * the same in native and Unicode, so can be added as a range */
18449 U8 start = NATIVE_TO_LATIN1(prevvalue);
18451 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18452 for (j = start; j <= end; j++) {
18453 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18456 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18462 range = 0; /* this range (if it was one) is done now */
18463 } /* End of loop through all the text within the brackets */
18465 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18466 output_posix_warnings(pRExC_state, posix_warnings);
18469 /* If anything in the class expands to more than one character, we have to
18470 * deal with them by building up a substitute parse string, and recursively
18471 * calling reg() on it, instead of proceeding */
18472 if (multi_char_matches) {
18473 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18476 char *save_end = RExC_end;
18477 char *save_parse = RExC_parse;
18478 char *save_start = RExC_start;
18479 Size_t constructed_prefix_len = 0; /* This gives the length of the
18480 constructed portion of the
18481 substitute parse. */
18482 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18487 /* Only one level of recursion allowed */
18488 assert(RExC_copy_start_in_constructed == RExC_precomp);
18490 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18491 because too confusing */
18493 sv_catpvs(substitute_parse, "(?:");
18497 /* Look at the longest strings first */
18498 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18503 if (av_exists(multi_char_matches, cp_count)) {
18504 AV** this_array_ptr;
18507 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18509 while ((this_sequence = av_pop(*this_array_ptr)) !=
18512 if (! first_time) {
18513 sv_catpvs(substitute_parse, "|");
18515 first_time = FALSE;
18517 sv_catpv(substitute_parse, SvPVX(this_sequence));
18522 /* If the character class contains anything else besides these
18523 * multi-character strings, have to include it in recursive parsing */
18524 if (element_count) {
18525 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18527 sv_catpvs(substitute_parse, "|");
18528 if (has_l_bracket) { /* Add an [ if the original had one */
18529 sv_catpvs(substitute_parse, "[");
18531 constructed_prefix_len = SvCUR(substitute_parse);
18532 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18534 /* Put in a closing ']' to match any opening one, but not if going
18535 * off the end, as otherwise we are adding something that really
18537 if (has_l_bracket && RExC_parse < RExC_end) {
18538 sv_catpvs(substitute_parse, "]");
18542 sv_catpvs(substitute_parse, ")");
18545 /* This is a way to get the parse to skip forward a whole named
18546 * sequence instead of matching the 2nd character when it fails the
18548 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18552 /* Set up the data structure so that any errors will be properly
18553 * reported. See the comments at the definition of
18554 * REPORT_LOCATION_ARGS for details */
18555 RExC_copy_start_in_input = (char *) orig_parse;
18556 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18557 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18558 RExC_end = RExC_parse + len;
18559 RExC_in_multi_char_class = 1;
18561 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18563 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18565 /* And restore so can parse the rest of the pattern */
18566 RExC_parse = save_parse;
18567 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18568 RExC_end = save_end;
18569 RExC_in_multi_char_class = 0;
18570 SvREFCNT_dec_NN(multi_char_matches);
18574 /* If folding, we calculate all characters that could fold to or from the
18575 * ones already on the list */
18576 if (cp_foldable_list) {
18578 UV start, end; /* End points of code point ranges */
18580 SV* fold_intersection = NULL;
18583 /* Our calculated list will be for Unicode rules. For locale
18584 * matching, we have to keep a separate list that is consulted at
18585 * runtime only when the locale indicates Unicode rules (and we
18586 * don't include potential matches in the ASCII/Latin1 range, as
18587 * any code point could fold to any other, based on the run-time
18588 * locale). For non-locale, we just use the general list */
18590 use_list = &only_utf8_locale_list;
18593 use_list = &cp_list;
18596 /* Only the characters in this class that participate in folds need
18597 * be checked. Get the intersection of this class and all the
18598 * possible characters that are foldable. This can quickly narrow
18599 * down a large class */
18600 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18601 &fold_intersection);
18603 /* Now look at the foldable characters in this class individually */
18604 invlist_iterinit(fold_intersection);
18605 while (invlist_iternext(fold_intersection, &start, &end)) {
18609 /* Look at every character in the range */
18610 for (j = start; j <= end; j++) {
18611 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18614 Size_t folds_count;
18616 const U32 * remaining_folds;
18620 /* Under /l, we don't know what code points below 256
18621 * fold to, except we do know the MICRO SIGN folds to
18622 * an above-255 character if the locale is UTF-8, so we
18623 * add it to the special list (in *use_list) Otherwise
18624 * we know now what things can match, though some folds
18625 * are valid under /d only if the target is UTF-8.
18626 * Those go in a separate list */
18627 if ( IS_IN_SOME_FOLD_L1(j)
18628 && ! (LOC && j != MICRO_SIGN))
18631 /* ASCII is always matched; non-ASCII is matched
18632 * only under Unicode rules (which could happen
18633 * under /l if the locale is a UTF-8 one */
18634 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18635 *use_list = add_cp_to_invlist(*use_list,
18636 PL_fold_latin1[j]);
18638 else if (j != PL_fold_latin1[j]) {
18639 upper_latin1_only_utf8_matches
18640 = add_cp_to_invlist(
18641 upper_latin1_only_utf8_matches,
18642 PL_fold_latin1[j]);
18646 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18647 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18649 add_above_Latin1_folds(pRExC_state,
18656 /* Here is an above Latin1 character. We don't have the
18657 * rules hard-coded for it. First, get its fold. This is
18658 * the simple fold, as the multi-character folds have been
18659 * handled earlier and separated out */
18660 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18661 (ASCII_FOLD_RESTRICTED)
18662 ? FOLD_FLAGS_NOMIX_ASCII
18665 /* Single character fold of above Latin1. Add everything
18666 * in its fold closure to the list that this node should
18668 folds_count = _inverse_folds(folded, &first_fold,
18670 for (k = 0; k <= folds_count; k++) {
18671 UV c = (k == 0) /* First time through use itself */
18673 : (k == 1) /* 2nd time use, the first fold */
18676 /* Then the remaining ones */
18677 : remaining_folds[k-2];
18679 /* /aa doesn't allow folds between ASCII and non- */
18680 if (( ASCII_FOLD_RESTRICTED
18681 && (isASCII(c) != isASCII(j))))
18686 /* Folds under /l which cross the 255/256 boundary are
18687 * added to a separate list. (These are valid only
18688 * when the locale is UTF-8.) */
18689 if (c < 256 && LOC) {
18690 *use_list = add_cp_to_invlist(*use_list, c);
18694 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18696 cp_list = add_cp_to_invlist(cp_list, c);
18699 /* Similarly folds involving non-ascii Latin1
18700 * characters under /d are added to their list */
18701 upper_latin1_only_utf8_matches
18702 = add_cp_to_invlist(
18703 upper_latin1_only_utf8_matches,
18709 SvREFCNT_dec_NN(fold_intersection);
18712 /* Now that we have finished adding all the folds, there is no reason
18713 * to keep the foldable list separate */
18714 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18715 SvREFCNT_dec_NN(cp_foldable_list);
18718 /* And combine the result (if any) with any inversion lists from posix
18719 * classes. The lists are kept separate up to now because we don't want to
18720 * fold the classes */
18721 if (simple_posixes) { /* These are the classes known to be unaffected by
18724 _invlist_union(cp_list, simple_posixes, &cp_list);
18725 SvREFCNT_dec_NN(simple_posixes);
18728 cp_list = simple_posixes;
18731 if (posixes || nposixes) {
18732 if (! DEPENDS_SEMANTICS) {
18734 /* For everything but /d, we can just add the current 'posixes' and
18735 * 'nposixes' to the main list */
18738 _invlist_union(cp_list, posixes, &cp_list);
18739 SvREFCNT_dec_NN(posixes);
18747 _invlist_union(cp_list, nposixes, &cp_list);
18748 SvREFCNT_dec_NN(nposixes);
18751 cp_list = nposixes;
18756 /* Under /d, things like \w match upper Latin1 characters only if
18757 * the target string is in UTF-8. But things like \W match all the
18758 * upper Latin1 characters if the target string is not in UTF-8.
18760 * Handle the case with something like \W separately */
18762 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18764 /* A complemented posix class matches all upper Latin1
18765 * characters if not in UTF-8. And it matches just certain
18766 * ones when in UTF-8. That means those certain ones are
18767 * matched regardless, so can just be added to the
18768 * unconditional list */
18770 _invlist_union(cp_list, nposixes, &cp_list);
18771 SvREFCNT_dec_NN(nposixes);
18775 cp_list = nposixes;
18778 /* Likewise for 'posixes' */
18779 _invlist_union(posixes, cp_list, &cp_list);
18780 SvREFCNT_dec(posixes);
18782 /* Likewise for anything else in the range that matched only
18784 if (upper_latin1_only_utf8_matches) {
18785 _invlist_union(cp_list,
18786 upper_latin1_only_utf8_matches,
18788 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18789 upper_latin1_only_utf8_matches = NULL;
18792 /* If we don't match all the upper Latin1 characters regardless
18793 * of UTF-8ness, we have to set a flag to match the rest when
18795 _invlist_subtract(only_non_utf8_list, cp_list,
18796 &only_non_utf8_list);
18797 if (_invlist_len(only_non_utf8_list) != 0) {
18798 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18800 SvREFCNT_dec_NN(only_non_utf8_list);
18803 /* Here there were no complemented posix classes. That means
18804 * the upper Latin1 characters in 'posixes' match only when the
18805 * target string is in UTF-8. So we have to add them to the
18806 * list of those types of code points, while adding the
18807 * remainder to the unconditional list.
18809 * First calculate what they are */
18810 SV* nonascii_but_latin1_properties = NULL;
18811 _invlist_intersection(posixes, PL_UpperLatin1,
18812 &nonascii_but_latin1_properties);
18814 /* And add them to the final list of such characters. */
18815 _invlist_union(upper_latin1_only_utf8_matches,
18816 nonascii_but_latin1_properties,
18817 &upper_latin1_only_utf8_matches);
18819 /* Remove them from what now becomes the unconditional list */
18820 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18823 /* And add those unconditional ones to the final list */
18825 _invlist_union(cp_list, posixes, &cp_list);
18826 SvREFCNT_dec_NN(posixes);
18833 SvREFCNT_dec(nonascii_but_latin1_properties);
18835 /* Get rid of any characters from the conditional list that we
18836 * now know are matched unconditionally, which may make that
18838 _invlist_subtract(upper_latin1_only_utf8_matches,
18840 &upper_latin1_only_utf8_matches);
18841 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18842 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18843 upper_latin1_only_utf8_matches = NULL;
18849 /* And combine the result (if any) with any inversion list from properties.
18850 * The lists are kept separate up to now so that we can distinguish the two
18851 * in regards to matching above-Unicode. A run-time warning is generated
18852 * if a Unicode property is matched against a non-Unicode code point. But,
18853 * we allow user-defined properties to match anything, without any warning,
18854 * and we also suppress the warning if there is a portion of the character
18855 * class that isn't a Unicode property, and which matches above Unicode, \W
18856 * or [\x{110000}] for example.
18857 * (Note that in this case, unlike the Posix one above, there is no
18858 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18859 * forces Unicode semantics */
18863 /* If it matters to the final outcome, see if a non-property
18864 * component of the class matches above Unicode. If so, the
18865 * warning gets suppressed. This is true even if just a single
18866 * such code point is specified, as, though not strictly correct if
18867 * another such code point is matched against, the fact that they
18868 * are using above-Unicode code points indicates they should know
18869 * the issues involved */
18871 warn_super = ! (invert
18872 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18875 _invlist_union(properties, cp_list, &cp_list);
18876 SvREFCNT_dec_NN(properties);
18879 cp_list = properties;
18884 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18886 /* Because an ANYOF node is the only one that warns, this node
18887 * can't be optimized into something else */
18888 optimizable = FALSE;
18892 /* Here, we have calculated what code points should be in the character
18895 * Now we can see about various optimizations. Fold calculation (which we
18896 * did above) needs to take place before inversion. Otherwise /[^k]/i
18897 * would invert to include K, which under /i would match k, which it
18898 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18899 * folded until runtime */
18901 /* If we didn't do folding, it's because some information isn't available
18902 * until runtime; set the run-time fold flag for these We know to set the
18903 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18904 * at least one 0-255 range code point */
18907 /* Some things on the list might be unconditionally included because of
18908 * other components. Remove them, and clean up the list if it goes to
18910 if (only_utf8_locale_list && cp_list) {
18911 _invlist_subtract(only_utf8_locale_list, cp_list,
18912 &only_utf8_locale_list);
18914 if (_invlist_len(only_utf8_locale_list) == 0) {
18915 SvREFCNT_dec_NN(only_utf8_locale_list);
18916 only_utf8_locale_list = NULL;
18919 if ( only_utf8_locale_list
18920 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18921 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18923 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18926 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18928 else if (cp_list && invlist_lowest(cp_list) < 256) {
18929 /* If nothing is below 256, has no locale dependency; otherwise it
18931 anyof_flags |= ANYOFL_FOLD;
18932 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18935 else if ( DEPENDS_SEMANTICS
18936 && ( upper_latin1_only_utf8_matches
18937 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18939 RExC_seen_d_op = TRUE;
18940 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18943 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18947 && ! has_runtime_dependency)
18949 _invlist_invert(cp_list);
18951 /* Clear the invert flag since have just done it here */
18955 /* All possible optimizations below still have these characteristics.
18956 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18958 *flagp |= HASWIDTH|SIMPLE;
18961 *ret_invlist = cp_list;
18966 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18967 RExC_contains_locale = 1;
18970 /* Some character classes are equivalent to other nodes. Such nodes take
18971 * up less room, and some nodes require fewer operations to execute, than
18972 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
18973 * improve efficiency. */
18976 PERL_UINT_FAST8_T i;
18977 UV partial_cp_count = 0;
18978 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18979 UV end[MAX_FOLD_FROMS+1] = { 0 };
18980 bool single_range = FALSE;
18982 if (cp_list) { /* Count the code points in enough ranges that we would
18983 see all the ones possible in any fold in this version
18986 invlist_iterinit(cp_list);
18987 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18988 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18991 partial_cp_count += end[i] - start[i] + 1;
18995 single_range = TRUE;
18997 invlist_iterfinish(cp_list);
19000 /* If we know at compile time that this matches every possible code
19001 * point, any run-time dependencies don't matter */
19002 if (start[0] == 0 && end[0] == UV_MAX) {
19004 ret = reganode(pRExC_state, OPFAIL, 0);
19007 ret = reg_node(pRExC_state, SANY);
19013 /* Similarly, for /l posix classes, if both a class and its
19014 * complement match, any run-time dependencies don't matter */
19016 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19019 if ( POSIXL_TEST(posixl, namedclass) /* class */
19020 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19023 ret = reganode(pRExC_state, OPFAIL, 0);
19026 ret = reg_node(pRExC_state, SANY);
19033 /* For well-behaved locales, some classes are subsets of others,
19034 * so complementing the subset and including the non-complemented
19035 * superset should match everything, like [\D[:alnum:]], and
19036 * [[:^alpha:][:alnum:]], but some implementations of locales are
19037 * buggy, and khw thinks its a bad idea to have optimization change
19038 * behavior, even if it avoids an OS bug in a given case */
19040 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19042 /* If is a single posix /l class, can optimize to just that op.
19043 * Such a node will not match anything in the Latin1 range, as that
19044 * is not determinable until runtime, but will match whatever the
19045 * class does outside that range. (Note that some classes won't
19046 * match anything outside the range, like [:ascii:]) */
19047 if ( isSINGLE_BIT_SET(posixl)
19048 && (partial_cp_count == 0 || start[0] > 255))
19051 SV * class_above_latin1 = NULL;
19052 bool already_inverted;
19053 bool are_equivalent;
19055 /* Compute which bit is set, which is the same thing as, e.g.,
19056 * ANYOF_CNTRL. From
19057 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19059 static const int MultiplyDeBruijnBitPosition2[32] =
19061 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19062 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19065 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19066 * 0x077CB531U) >> 27];
19067 classnum = namedclass_to_classnum(namedclass);
19069 /* The named classes are such that the inverted number is one
19070 * larger than the non-inverted one */
19071 already_inverted = namedclass
19072 - classnum_to_namedclass(classnum);
19074 /* Create an inversion list of the official property, inverted
19075 * if the constructed node list is inverted, and restricted to
19076 * only the above latin1 code points, which are the only ones
19077 * known at compile time */
19078 _invlist_intersection_maybe_complement_2nd(
19080 PL_XPosix_ptrs[classnum],
19082 &class_above_latin1);
19083 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19085 SvREFCNT_dec_NN(class_above_latin1);
19087 if (are_equivalent) {
19089 /* Resolve the run-time inversion flag with this possibly
19090 * inverted class */
19091 invert = invert ^ already_inverted;
19093 ret = reg_node(pRExC_state,
19094 POSIXL + invert * (NPOSIXL - POSIXL));
19095 FLAGS(REGNODE_p(ret)) = classnum;
19101 /* khw can't think of any other possible transformation involving
19103 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19107 if (! has_runtime_dependency) {
19109 /* If the list is empty, nothing matches. This happens, for
19110 * example, when a Unicode property that doesn't match anything is
19111 * the only element in the character class (perluniprops.pod notes
19112 * such properties). */
19113 if (partial_cp_count == 0) {
19115 ret = reg_node(pRExC_state, SANY);
19118 ret = reganode(pRExC_state, OPFAIL, 0);
19124 /* If matches everything but \n */
19125 if ( start[0] == 0 && end[0] == '\n' - 1
19126 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19129 ret = reg_node(pRExC_state, REG_ANY);
19135 /* Next see if can optimize classes that contain just a few code points
19136 * into an EXACTish node. The reason to do this is to let the
19137 * optimizer join this node with adjacent EXACTish ones, and ANYOF
19138 * nodes require conversion to code point from UTF-8.
19140 * An EXACTFish node can be generated even if not under /i, and vice
19141 * versa. But care must be taken. An EXACTFish node has to be such
19142 * that it only matches precisely the code points in the class, but we
19143 * want to generate the least restrictive one that does that, to
19144 * increase the odds of being able to join with an adjacent node. For
19145 * example, if the class contains [kK], we have to make it an EXACTFAA
19146 * node to prevent the KELVIN SIGN from matching. Whether we are under
19147 * /i or not is irrelevant in this case. Less obvious is the pattern
19148 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
19149 * supposed to match the single character U+0149 LATIN SMALL LETTER N
19150 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
19151 * that includes \X{02BC}, there is a multi-char fold that does, and so
19152 * the node generated for it must be an EXACTFish one. On the other
19153 * hand qr/:/i should generate a plain EXACT node since the colon
19154 * participates in no fold whatsoever, and having it EXACT tells the
19155 * optimizer the target string cannot match unless it has a colon in
19161 /* Only try if there are no more code points in the class than
19162 * in the max possible fold */
19163 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19165 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19167 /* We can always make a single code point class into an
19168 * EXACTish node. */
19172 /* Here is /l: Use EXACTL, except if there is a fold not
19173 * known until runtime so shows as only a single code point
19174 * here. For code points above 255, we know which can
19175 * cause problems by having a potential fold to the Latin1
19178 || ( start[0] > 255
19179 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19187 else if (! FOLD) { /* Not /l and not /i */
19188 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19190 else if (start[0] < 256) { /* /i, not /l, and the code point is
19193 /* Under /i, it gets a little tricky. A code point that
19194 * doesn't participate in a fold should be an EXACT node.
19195 * We know this one isn't the result of a simple fold, or
19196 * there'd be more than one code point in the list, but it
19197 * could be part of a multi- character fold. In that case
19198 * we better not create an EXACT node, as we would wrongly
19199 * be telling the optimizer that this code point must be in
19200 * the target string, and that is wrong. This is because
19201 * if the sequence around this code point forms a
19202 * multi-char fold, what needs to be in the string could be
19203 * the code point that folds to the sequence.
19205 * This handles the case of below-255 code points, as we
19206 * have an easy look up for those. The next clause handles
19207 * the above-256 one */
19208 op = IS_IN_SOME_FOLD_L1(start[0])
19212 else { /* /i, larger code point. Since we are under /i, and
19213 have just this code point, we know that it can't
19214 fold to something else, so PL_InMultiCharFold
19216 op = _invlist_contains_cp(PL_InMultiCharFold,
19224 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19225 && _invlist_contains_cp(PL_in_some_fold, start[0]))
19227 /* Here, the only runtime dependency, if any, is from /d, and
19228 * the class matches more than one code point, and the lowest
19229 * code point participates in some fold. It might be that the
19230 * other code points are /i equivalent to this one, and hence
19231 * they would representable by an EXACTFish node. Above, we
19232 * eliminated classes that contain too many code points to be
19233 * EXACTFish, with the test for MAX_FOLD_FROMS
19235 * First, special case the ASCII fold pairs, like 'B' and 'b'.
19236 * We do this because we have EXACTFAA at our disposal for the
19238 if (partial_cp_count == 2 && isASCII(start[0])) {
19240 /* The only ASCII characters that participate in folds are
19242 assert(isALPHA(start[0]));
19243 if ( end[0] == start[0] /* First range is a single
19244 character, so 2nd exists */
19245 && isALPHA_FOLD_EQ(start[0], start[1]))
19248 /* Here, is part of an ASCII fold pair */
19250 if ( ASCII_FOLD_RESTRICTED
19251 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19253 /* If the second clause just above was true, it
19254 * means we can't be under /i, or else the list
19255 * would have included more than this fold pair.
19256 * Therefore we have to exclude the possibility of
19257 * whatever else it is that folds to these, by
19258 * using EXACTFAA */
19261 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19263 /* Here, there's no simple fold that start[0] is part
19264 * of, but there is a multi-character one. If we
19265 * are not under /i, we want to exclude that
19266 * possibility; if under /i, we want to include it
19268 op = (FOLD) ? EXACTFU : EXACTFAA;
19272 /* Here, the only possible fold start[0] particpates in
19273 * is with start[1]. /i or not isn't relevant */
19277 value = toFOLD(start[0]);
19280 else if ( ! upper_latin1_only_utf8_matches
19281 || ( _invlist_len(upper_latin1_only_utf8_matches)
19284 invlist_highest(upper_latin1_only_utf8_matches)]
19287 /* Here, the smallest character is non-ascii or there are
19288 * more than 2 code points matched by this node. Also, we
19289 * either don't have /d UTF-8 dependent matches, or if we
19290 * do, they look like they could be a single character that
19291 * is the fold of the lowest one in the always-match list.
19292 * This test quickly excludes most of the false positives
19293 * when there are /d UTF-8 depdendent matches. These are
19294 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19295 * SMALL LETTER A WITH GRAVE iff the target string is
19296 * UTF-8. (We don't have to worry above about exceeding
19297 * the array bounds of PL_fold_latin1[] because any code
19298 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19300 * EXACTFAA would apply only to pairs (hence exactly 2 code
19301 * points) in the ASCII range, so we can't use it here to
19302 * artificially restrict the fold domain, so we check if
19303 * the class does or does not match some EXACTFish node.
19304 * Further, if we aren't under /i, and and the folded-to
19305 * character is part of a multi-character fold, we can't do
19306 * this optimization, as the sequence around it could be
19307 * that multi-character fold, and we don't here know the
19308 * context, so we have to assume it is that multi-char
19309 * fold, to prevent potential bugs.
19311 * To do the general case, we first find the fold of the
19312 * lowest code point (which may be higher than the lowest
19313 * one), then find everything that folds to it. (The data
19314 * structure we have only maps from the folded code points,
19315 * so we have to do the earlier step.) */
19318 U8 foldbuf[UTF8_MAXBYTES_CASE];
19319 UV folded = _to_uni_fold_flags(start[0],
19320 foldbuf, &foldlen, 0);
19322 const U32 * remaining_folds;
19323 Size_t folds_to_this_cp_count = _inverse_folds(
19327 Size_t folds_count = folds_to_this_cp_count + 1;
19328 SV * fold_list = _new_invlist(folds_count);
19331 /* If there are UTF-8 dependent matches, create a temporary
19332 * list of what this node matches, including them. */
19333 SV * all_cp_list = NULL;
19334 SV ** use_this_list = &cp_list;
19336 if (upper_latin1_only_utf8_matches) {
19337 all_cp_list = _new_invlist(0);
19338 use_this_list = &all_cp_list;
19339 _invlist_union(cp_list,
19340 upper_latin1_only_utf8_matches,
19344 /* Having gotten everything that participates in the fold
19345 * containing the lowest code point, we turn that into an
19346 * inversion list, making sure everything is included. */
19347 fold_list = add_cp_to_invlist(fold_list, start[0]);
19348 fold_list = add_cp_to_invlist(fold_list, folded);
19349 if (folds_to_this_cp_count > 0) {
19350 fold_list = add_cp_to_invlist(fold_list, first_fold);
19351 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19352 fold_list = add_cp_to_invlist(fold_list,
19353 remaining_folds[i]);
19357 /* If the fold list is identical to what's in this ANYOF
19358 * node, the node can be represented by an EXACTFish one
19360 if (_invlistEQ(*use_this_list, fold_list,
19361 0 /* Don't complement */ )
19364 /* But, we have to be careful, as mentioned above.
19365 * Just the right sequence of characters could match
19366 * this if it is part of a multi-character fold. That
19367 * IS what we want if we are under /i. But it ISN'T
19368 * what we want if not under /i, as it could match when
19369 * it shouldn't. So, when we aren't under /i and this
19370 * character participates in a multi-char fold, we
19371 * don't optimize into an EXACTFish node. So, for each
19372 * case below we have to check if we are folding
19373 * and if not, if it is not part of a multi-char fold.
19375 if (start[0] > 255) { /* Highish code point */
19376 if (FOLD || ! _invlist_contains_cp(
19377 PL_InMultiCharFold, folded))
19381 : (ASCII_FOLD_RESTRICTED)
19386 } /* Below, the lowest code point < 256 */
19389 && DEPENDS_SEMANTICS)
19390 { /* An EXACTF node containing a single character
19391 's', can be an EXACTFU if it doesn't get
19392 joined with an adjacent 's' */
19393 op = EXACTFU_S_EDGE;
19397 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19399 if (upper_latin1_only_utf8_matches) {
19402 /* We can't use the fold, as that only matches
19406 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19408 { /* EXACTFUP is a special node for this
19410 op = (ASCII_FOLD_RESTRICTED)
19413 value = MICRO_SIGN;
19415 else if ( ASCII_FOLD_RESTRICTED
19416 && ! isASCII(start[0]))
19417 { /* For ASCII under /iaa, we can use EXACTFU
19429 SvREFCNT_dec_NN(fold_list);
19430 SvREFCNT_dec(all_cp_list);
19437 /* Here, we have calculated what EXACTish node to use. Have to
19438 * convert to UTF-8 if not already there */
19441 SvREFCNT_dec(cp_list);;
19442 REQUIRE_UTF8(flagp);
19445 /* This is a kludge to the special casing issues with this
19446 * ligature under /aa. FB05 should fold to FB06, but the
19447 * call above to _to_uni_fold_flags() didn't find this, as
19448 * it didn't use the /aa restriction in order to not miss
19449 * other folds that would be affected. This is the only
19450 * instance likely to ever be a problem in all of Unicode.
19451 * So special case it. */
19452 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19453 && ASCII_FOLD_RESTRICTED)
19455 value = LATIN_SMALL_LIGATURE_ST;
19459 len = (UTF) ? UVCHR_SKIP(value) : 1;
19461 ret = regnode_guts(pRExC_state, op, len, "exact");
19462 FILL_NODE(ret, op);
19463 RExC_emit += 1 + STR_SZ(len);
19464 setSTR_LEN(REGNODE_p(ret), len);
19466 *STRINGs(REGNODE_p(ret)) = (U8) value;
19469 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19475 if (! has_runtime_dependency) {
19477 /* See if this can be turned into an ANYOFM node. Think about the
19478 * bit patterns in two different bytes. In some positions, the
19479 * bits in each will be 1; and in other positions both will be 0;
19480 * and in some positions the bit will be 1 in one byte, and 0 in
19481 * the other. Let 'n' be the number of positions where the bits
19482 * differ. We create a mask which has exactly 'n' 0 bits, each in
19483 * a position where the two bytes differ. Now take the set of all
19484 * bytes that when ANDed with the mask yield the same result. That
19485 * set has 2**n elements, and is representable by just two 8 bit
19486 * numbers: the result and the mask. Importantly, matching the set
19487 * can be vectorized by creating a word full of the result bytes,
19488 * and a word full of the mask bytes, yielding a significant speed
19489 * up. Here, see if this node matches such a set. As a concrete
19490 * example consider [01], and the byte representing '0' which is
19491 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19492 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19493 * 0x30. Any other bytes ANDed yield something else. So [01],
19494 * which is a common usage, is optimizable into ANYOFM, and can
19495 * benefit from the speed up. We can only do this on UTF-8
19496 * invariant bytes, because they have the same bit patterns under
19498 PERL_UINT_FAST8_T inverted = 0;
19500 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19502 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19504 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19505 * If that works we will instead later generate an NANYOFM, and
19506 * invert back when through */
19507 if (invlist_highest(cp_list) > max_permissible) {
19508 _invlist_invert(cp_list);
19512 if (invlist_highest(cp_list) <= max_permissible) {
19513 UV this_start, this_end;
19514 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19515 U8 bits_differing = 0;
19516 Size_t full_cp_count = 0;
19517 bool first_time = TRUE;
19519 /* Go through the bytes and find the bit positions that differ
19521 invlist_iterinit(cp_list);
19522 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19523 unsigned int i = this_start;
19526 if (! UVCHR_IS_INVARIANT(i)) {
19530 first_time = FALSE;
19531 lowest_cp = this_start;
19533 /* We have set up the code point to compare with.
19534 * Don't compare it with itself */
19538 /* Find the bit positions that differ from the lowest code
19539 * point in the node. Keep track of all such positions by
19541 for (; i <= this_end; i++) {
19542 if (! UVCHR_IS_INVARIANT(i)) {
19546 bits_differing |= i ^ lowest_cp;
19549 full_cp_count += this_end - this_start + 1;
19552 /* At the end of the loop, we count how many bits differ from
19553 * the bits in lowest code point, call the count 'd'. If the
19554 * set we found contains 2**d elements, it is the closure of
19555 * all code points that differ only in those bit positions. To
19556 * convince yourself of that, first note that the number in the
19557 * closure must be a power of 2, which we test for. The only
19558 * way we could have that count and it be some differing set,
19559 * is if we got some code points that don't differ from the
19560 * lowest code point in any position, but do differ from each
19561 * other in some other position. That means one code point has
19562 * a 1 in that position, and another has a 0. But that would
19563 * mean that one of them differs from the lowest code point in
19564 * that position, which possibility we've already excluded. */
19565 if ( (inverted || full_cp_count > 1)
19566 && full_cp_count == 1U << PL_bitcount[bits_differing])
19570 op = ANYOFM + inverted;;
19572 /* We need to make the bits that differ be 0's */
19573 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19575 /* The argument is the lowest code point */
19576 ret = reganode(pRExC_state, op, lowest_cp);
19577 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19581 invlist_iterfinish(cp_list);
19585 _invlist_invert(cp_list);
19592 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19593 * all were invariants, it wasn't inverted, and there is a single
19594 * range. This would be faster than some of the posix nodes we
19595 * create below like /\d/a, but would be twice the size. Without
19596 * having actually measured the gain, khw doesn't think the
19597 * tradeoff is really worth it */
19600 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19601 PERL_UINT_FAST8_T type;
19602 SV * intersection = NULL;
19603 SV* d_invlist = NULL;
19605 /* See if this matches any of the POSIX classes. The POSIXA and
19606 * POSIXD ones are about the same speed as ANYOF ops, but take less
19607 * room; the ones that have above-Latin1 code point matches are
19608 * somewhat faster than ANYOF. */
19610 for (type = POSIXA; type >= POSIXD; type--) {
19613 if (type == POSIXL) { /* But not /l posix classes */
19617 for (posix_class = 0;
19618 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19621 SV** our_code_points = &cp_list;
19622 SV** official_code_points;
19625 if (type == POSIXA) {
19626 official_code_points = &PL_Posix_ptrs[posix_class];
19629 official_code_points = &PL_XPosix_ptrs[posix_class];
19632 /* Skip non-existent classes of this type. e.g. \v only
19633 * has an entry in PL_XPosix_ptrs */
19634 if (! *official_code_points) {
19638 /* Try both the regular class, and its inversion */
19639 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19640 bool this_inverted = invert ^ try_inverted;
19642 if (type != POSIXD) {
19644 /* This class that isn't /d can't match if we have
19645 * /d dependencies */
19646 if (has_runtime_dependency
19647 & HAS_D_RUNTIME_DEPENDENCY)
19652 else /* is /d */ if (! this_inverted) {
19654 /* /d classes don't match anything non-ASCII below
19655 * 256 unconditionally (which cp_list contains) */
19656 _invlist_intersection(cp_list, PL_UpperLatin1,
19658 if (_invlist_len(intersection) != 0) {
19662 SvREFCNT_dec(d_invlist);
19663 d_invlist = invlist_clone(cp_list, NULL);
19665 /* But under UTF-8 it turns into using /u rules.
19666 * Add the things it matches under these conditions
19667 * so that we check below that these are identical
19668 * to what the tested class should match */
19669 if (upper_latin1_only_utf8_matches) {
19672 upper_latin1_only_utf8_matches,
19675 our_code_points = &d_invlist;
19677 else { /* POSIXD, inverted. If this doesn't have this
19678 flag set, it isn't /d. */
19679 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19683 our_code_points = &cp_list;
19686 /* Here, have weeded out some things. We want to see
19687 * if the list of characters this node contains
19688 * ('*our_code_points') precisely matches those of the
19689 * class we are currently checking against
19690 * ('*official_code_points'). */
19691 if (_invlistEQ(*our_code_points,
19692 *official_code_points,
19695 /* Here, they precisely match. Optimize this ANYOF
19696 * node into its equivalent POSIX one of the
19697 * correct type, possibly inverted */
19698 ret = reg_node(pRExC_state, (try_inverted)
19702 FLAGS(REGNODE_p(ret)) = posix_class;
19703 SvREFCNT_dec(d_invlist);
19704 SvREFCNT_dec(intersection);
19710 SvREFCNT_dec(d_invlist);
19711 SvREFCNT_dec(intersection);
19714 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19715 * both in size and speed. Currently, a 20 bit range base (smallest
19716 * code point in the range), and a 12 bit maximum delta are packed into
19717 * a 32 bit word. This allows for using it on all of the Unicode code
19718 * points except for the highest plane, which is only for private use
19719 * code points. khw doubts that a bigger delta is likely in real world
19722 && ! has_runtime_dependency
19723 && anyof_flags == 0
19724 && start[0] < (1 << ANYOFR_BASE_BITS)
19725 && end[0] - start[0]
19726 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19727 * CHARBITS - ANYOFR_BASE_BITS))))
19730 U8 low_utf8[UTF8_MAXBYTES+1];
19731 U8 high_utf8[UTF8_MAXBYTES+1];
19733 ret = reganode(pRExC_state, ANYOFR,
19734 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19736 /* Place the lowest UTF-8 start byte in the flags field, so as to
19737 * allow efficient ruling out at run time of many possible inputs.
19739 (void) uvchr_to_utf8(low_utf8, start[0]);
19740 (void) uvchr_to_utf8(high_utf8, end[0]);
19742 /* If all code points share the same first byte, this can be an
19743 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19744 * quickly rule out many inputs at run-time without having to
19745 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19746 * not doing that transformation would not rule out nearly so many
19748 if (low_utf8[0] == high_utf8[0]) {
19749 OP(REGNODE_p(ret)) = ANYOFRb;
19750 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19753 ANYOF_FLAGS(REGNODE_p(ret))
19754 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19760 /* If didn't find an optimization and there is no need for a bitmap,
19761 * optimize to indicate that */
19762 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19764 && ! upper_latin1_only_utf8_matches
19765 && anyof_flags == 0)
19767 U8 low_utf8[UTF8_MAXBYTES+1];
19768 UV highest_cp = invlist_highest(cp_list);
19770 /* Currently the maximum allowed code point by the system is
19771 * IV_MAX. Higher ones are reserved for future internal use. This
19772 * particular regnode can be used for higher ones, but we can't
19773 * calculate the code point of those. IV_MAX suffices though, as
19774 * it will be a large first byte */
19775 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19778 /* We store the lowest possible first byte of the UTF-8
19779 * representation, using the flags field. This allows for quick
19780 * ruling out of some inputs without having to convert from UTF-8
19781 * to code point. For EBCDIC, we use I8, as not doing that
19782 * transformation would not rule out nearly so many things */
19783 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19787 /* If the first UTF-8 start byte for the highest code point in the
19788 * range is suitably small, we may be able to get an upper bound as
19790 if (highest_cp <= IV_MAX) {
19791 U8 high_utf8[UTF8_MAXBYTES+1];
19792 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19795 /* If the lowest and highest are the same, we can get an exact
19796 * first byte instead of a just minimum or even a sequence of
19797 * exact leading bytes. We signal these with different
19799 if (low_utf8[0] == high_utf8[0]) {
19800 Size_t len = find_first_differing_byte_pos(low_utf8,
19802 MIN(low_len, high_len));
19806 /* No need to convert to I8 for EBCDIC as this is an
19808 anyof_flags = low_utf8[0];
19813 ret = regnode_guts(pRExC_state, op,
19814 regarglen[op] + STR_SZ(len),
19816 FILL_NODE(ret, op);
19817 ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19819 Copy(low_utf8, /* Add the common bytes */
19820 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19822 RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19823 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19824 NULL, only_utf8_locale_list);
19828 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19831 /* Here, the high byte is not the same as the low, but is
19832 * small enough that its reasonable to have a loose upper
19833 * bound, which is packed in with the strict lower bound.
19834 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19835 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19836 * is the same thing as UTF-8 */
19839 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19840 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19843 if (range_diff <= max_range_diff / 8) {
19846 else if (range_diff <= max_range_diff / 4) {
19849 else if (range_diff <= max_range_diff / 2) {
19852 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19857 goto done_finding_op;
19859 } /* End of seeing if can optimize it into a different node */
19861 is_anyof: /* It's going to be an ANYOF node. */
19862 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19872 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19873 FILL_NODE(ret, op); /* We set the argument later */
19874 RExC_emit += 1 + regarglen[op];
19875 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19877 /* Here, <cp_list> contains all the code points we can determine at
19878 * compile time that match under all conditions. Go through it, and
19879 * for things that belong in the bitmap, put them there, and delete from
19880 * <cp_list>. While we are at it, see if everything above 255 is in the
19881 * list, and if so, set a flag to speed up execution */
19883 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19886 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19890 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19893 /* Here, the bitmap has been populated with all the Latin1 code points that
19894 * always match. Can now add to the overall list those that match only
19895 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19897 if (upper_latin1_only_utf8_matches) {
19899 _invlist_union(cp_list,
19900 upper_latin1_only_utf8_matches,
19902 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19905 cp_list = upper_latin1_only_utf8_matches;
19907 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19910 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19911 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19914 only_utf8_locale_list);
19915 SvREFCNT_dec(cp_list);;
19916 SvREFCNT_dec(only_utf8_locale_list);
19921 /* Here, the node is getting optimized into something that's not an ANYOF
19922 * one. Finish up. */
19924 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19925 RExC_parse - orig_parse);;
19926 SvREFCNT_dec(cp_list);;
19927 SvREFCNT_dec(only_utf8_locale_list);
19931 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19934 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19935 regnode* const node,
19937 SV* const runtime_defns,
19938 SV* const only_utf8_locale_list)
19940 /* Sets the arg field of an ANYOF-type node 'node', using information about
19941 * the node passed-in. If there is nothing outside the node's bitmap, the
19942 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19943 * the count returned by add_data(), having allocated and stored an array,
19946 * av[0] stores the inversion list defining this class as far as known at
19947 * this time, or PL_sv_undef if nothing definite is now known.
19948 * av[1] stores the inversion list of code points that match only if the
19949 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19950 * av[2], or no entry otherwise.
19951 * av[2] stores the list of user-defined properties whose subroutine
19952 * definitions aren't known at this time, or no entry if none. */
19956 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19958 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19959 assert(! (ANYOF_FLAGS(node)
19960 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19961 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19964 AV * const av = newAV();
19968 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
19971 if (only_utf8_locale_list) {
19972 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
19973 SvREFCNT_inc_NN(only_utf8_locale_list));
19976 if (runtime_defns) {
19977 av_store(av, DEFERRED_USER_DEFINED_INDEX,
19978 SvREFCNT_inc_NN(runtime_defns));
19981 rv = newRV_noinc(MUTABLE_SV(av));
19982 n = add_data(pRExC_state, STR_WITH_LEN("s"));
19983 RExC_rxi->data->data[n] = (void*)rv;
19990 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19991 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
19993 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
19997 /* For internal core use only.
19998 * Returns the inversion list for the input 'node' in the regex 'prog'.
19999 * If <doinit> is 'true', will attempt to create the inversion list if not
20001 * If <listsvp> is non-null, will return the printable contents of the
20002 * property definition. This can be used to get debugging information
20003 * even before the inversion list exists, by calling this function with
20004 * 'doinit' set to false, in which case the components that will be used
20005 * to eventually create the inversion list are returned (in a printable
20007 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20008 * store an inversion list of code points that should match only if the
20009 * execution-time locale is a UTF-8 one.
20010 * If <output_invlist> is not NULL, it is where this routine is to store an
20011 * inversion list of the code points that would be instead returned in
20012 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20013 * when this parameter is used, is just the non-code point data that
20014 * will go into creating the inversion list. This currently should be just
20015 * user-defined properties whose definitions were not known at compile
20016 * time. Using this parameter allows for easier manipulation of the
20017 * inversion list's data by the caller. It is illegal to call this
20018 * function with this parameter set, but not <listsvp>
20020 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20021 * that, in spite of this function's name, the inversion list it returns
20022 * may include the bitmap data as well */
20024 SV *si = NULL; /* Input initialization string */
20025 SV* invlist = NULL;
20027 RXi_GET_DECL(prog, progi);
20028 const struct reg_data * const data = prog ? progi->data : NULL;
20030 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20031 PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20033 PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20035 assert(! output_invlist || listsvp);
20037 if (data && data->count) {
20038 const U32 n = ARG(node);
20040 if (data->what[n] == 's') {
20041 SV * const rv = MUTABLE_SV(data->data[n]);
20042 AV * const av = MUTABLE_AV(SvRV(rv));
20043 SV **const ary = AvARRAY(av);
20045 invlist = ary[INVLIST_INDEX];
20047 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20048 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20051 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20052 si = ary[DEFERRED_USER_DEFINED_INDEX];
20055 if (doinit && (si || invlist)) {
20058 SV * msg = newSVpvs_flags("", SVs_TEMP);
20060 SV * prop_definition = handle_user_defined_property(
20061 "", 0, FALSE, /* There is no \p{}, \P{} */
20062 SvPVX_const(si)[1] - '0', /* /i or not has been
20063 stored here for just
20065 TRUE, /* run time */
20066 FALSE, /* This call must find the defn */
20067 si, /* The property definition */
20070 0 /* base level call */
20074 assert(prop_definition == NULL);
20076 Perl_croak(aTHX_ "%" UTF8f,
20077 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20081 _invlist_union(invlist, prop_definition, &invlist);
20082 SvREFCNT_dec_NN(prop_definition);
20085 invlist = prop_definition;
20088 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20089 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20091 ary[INVLIST_INDEX] = invlist;
20092 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20093 ? ONLY_LOCALE_MATCHES_INDEX
20101 /* If requested, return a printable version of what this ANYOF node matches
20104 SV* matches_string = NULL;
20106 /* This function can be called at compile-time, before everything gets
20107 * resolved, in which case we return the currently best available
20108 * information, which is the string that will eventually be used to do
20109 * that resolving, 'si' */
20111 /* Here, we only have 'si' (and possibly some passed-in data in
20112 * 'invlist', which is handled below) If the caller only wants
20113 * 'si', use that. */
20114 if (! output_invlist) {
20115 matches_string = newSVsv(si);
20118 /* But if the caller wants an inversion list of the node, we
20119 * need to parse 'si' and place as much as possible in the
20120 * desired output inversion list, making 'matches_string' only
20121 * contain the currently unresolvable things */
20122 const char *si_string = SvPVX(si);
20123 STRLEN remaining = SvCUR(si);
20127 /* Ignore everything before and including the first new-line */
20128 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20129 assert (si_string != NULL);
20131 remaining = SvPVX(si) + SvCUR(si) - si_string;
20133 while (remaining > 0) {
20135 /* The data consists of just strings defining user-defined
20136 * property names, but in prior incarnations, and perhaps
20137 * somehow from pluggable regex engines, it could still
20138 * hold hex code point definitions, all of which should be
20139 * legal (or it wouldn't have gotten this far). Each
20140 * component of a range would be separated by a tab, and
20141 * each range by a new-line. If these are found, instead
20142 * add them to the inversion list */
20143 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
20144 |PERL_SCAN_SILENT_NON_PORTABLE;
20145 STRLEN len = remaining;
20146 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20148 /* If the hex decode routine found something, it should go
20149 * up to the next \n */
20150 if ( *(si_string + len) == '\n') {
20151 if (count) { /* 2nd code point on line */
20152 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20155 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20158 goto prepare_for_next_iteration;
20161 /* If the hex decode was instead for the lower range limit,
20162 * save it, and go parse the upper range limit */
20163 if (*(si_string + len) == '\t') {
20164 assert(count == 0);
20168 prepare_for_next_iteration:
20169 si_string += len + 1;
20170 remaining -= len + 1;
20174 /* Here, didn't find a legal hex number. Just add the text
20175 * from here up to the next \n, omitting any trailing
20179 len = strcspn(si_string,
20180 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20182 if (matches_string) {
20183 sv_catpvn(matches_string, si_string, len);
20186 matches_string = newSVpvn(si_string, len);
20188 sv_catpvs(matches_string, " ");
20192 && UCHARAT(si_string)
20193 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20198 if (remaining && UCHARAT(si_string) == '\n') {
20202 } /* end of loop through the text */
20204 assert(matches_string);
20205 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
20206 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20208 } /* end of has an 'si' */
20211 /* Add the stuff that's already known */
20214 /* Again, if the caller doesn't want the output inversion list, put
20215 * everything in 'matches-string' */
20216 if (! output_invlist) {
20217 if ( ! matches_string) {
20218 matches_string = newSVpvs("\n");
20220 sv_catsv(matches_string, invlist_contents(invlist,
20221 TRUE /* traditional style */
20224 else if (! *output_invlist) {
20225 *output_invlist = invlist_clone(invlist, NULL);
20228 _invlist_union(*output_invlist, invlist, output_invlist);
20232 *listsvp = matches_string;
20238 /* reg_skipcomment()
20240 Absorbs an /x style # comment from the input stream,
20241 returning a pointer to the first character beyond the comment, or if the
20242 comment terminates the pattern without anything following it, this returns
20243 one past the final character of the pattern (in other words, RExC_end) and
20244 sets the REG_RUN_ON_COMMENT_SEEN flag.
20246 Note it's the callers responsibility to ensure that we are
20247 actually in /x mode
20251 PERL_STATIC_INLINE char*
20252 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20254 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20258 while (p < RExC_end) {
20259 if (*(++p) == '\n') {
20264 /* we ran off the end of the pattern without ending the comment, so we have
20265 * to add an \n when wrapping */
20266 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20271 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20273 const bool force_to_xmod
20276 /* If the text at the current parse position '*p' is a '(?#...)' comment,
20277 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20278 * is /x whitespace, advance '*p' so that on exit it points to the first
20279 * byte past all such white space and comments */
20281 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20283 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20285 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20288 if (RExC_end - (*p) >= 3
20290 && *(*p + 1) == '?'
20291 && *(*p + 2) == '#')
20293 while (*(*p) != ')') {
20294 if ((*p) == RExC_end)
20295 FAIL("Sequence (?#... not terminated");
20303 const char * save_p = *p;
20304 while ((*p) < RExC_end) {
20306 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20309 else if (*(*p) == '#') {
20310 (*p) = reg_skipcomment(pRExC_state, (*p));
20316 if (*p != save_p) {
20329 Advances the parse position by one byte, unless that byte is the beginning
20330 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20331 those two cases, the parse position is advanced beyond all such comments and
20334 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20338 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20340 PERL_ARGS_ASSERT_NEXTCHAR;
20342 if (RExC_parse < RExC_end) {
20344 || UTF8_IS_INVARIANT(*RExC_parse)
20345 || UTF8_IS_START(*RExC_parse));
20347 RExC_parse += (UTF)
20348 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20351 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20352 FALSE /* Don't force /x */ );
20357 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20359 /* 'size' is the delta number of smallest regnode equivalents to add or
20360 * subtract from the current memory allocated to the regex engine being
20363 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20368 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20369 /* +1 for REG_MAGIC */
20372 if ( RExC_rxi == NULL )
20373 FAIL("Regexp out of space");
20374 RXi_SET(RExC_rx, RExC_rxi);
20376 RExC_emit_start = RExC_rxi->program;
20378 Zero(REGNODE_p(RExC_emit), size, regnode);
20381 #ifdef RE_TRACK_PATTERN_OFFSETS
20382 Renew(RExC_offsets, 2*RExC_size+1, U32);
20384 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20386 RExC_offsets[0] = RExC_size;
20390 STATIC regnode_offset
20391 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20393 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20394 * equivalents space. It aligns and increments RExC_size
20396 * It returns the regnode's offset into the regex engine program */
20398 const regnode_offset ret = RExC_emit;
20400 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20402 PERL_ARGS_ASSERT_REGNODE_GUTS;
20404 SIZE_ALIGN(RExC_size);
20405 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20406 NODE_ALIGN_FILL(REGNODE_p(ret));
20407 #ifndef RE_TRACK_PATTERN_OFFSETS
20408 PERL_UNUSED_ARG(name);
20409 PERL_UNUSED_ARG(op);
20411 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20413 if (RExC_offsets) { /* MJD */
20415 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20418 (UV)(RExC_emit) > RExC_offsets[0]
20419 ? "Overwriting end of array!\n" : "OK",
20421 (UV)(RExC_parse - RExC_start),
20422 (UV)RExC_offsets[0]));
20423 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20430 - reg_node - emit a node
20432 STATIC regnode_offset /* Location. */
20433 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20435 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20436 regnode_offset ptr = ret;
20438 PERL_ARGS_ASSERT_REG_NODE;
20440 assert(regarglen[op] == 0);
20442 FILL_ADVANCE_NODE(ptr, op);
20448 - reganode - emit a node with an argument
20450 STATIC regnode_offset /* Location. */
20451 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20453 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20454 regnode_offset ptr = ret;
20456 PERL_ARGS_ASSERT_REGANODE;
20458 /* ANYOF are special cased to allow non-length 1 args */
20459 assert(regarglen[op] == 1);
20461 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20467 - regpnode - emit a temporary node with a SV* argument
20469 STATIC regnode_offset /* Location. */
20470 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20472 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20473 regnode_offset ptr = ret;
20475 PERL_ARGS_ASSERT_REGPNODE;
20477 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20482 STATIC regnode_offset
20483 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20485 /* emit a node with U32 and I32 arguments */
20487 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20488 regnode_offset ptr = ret;
20490 PERL_ARGS_ASSERT_REG2LANODE;
20492 assert(regarglen[op] == 2);
20494 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20500 - reginsert - insert an operator in front of already-emitted operand
20502 * That means that on exit 'operand' is the offset of the newly inserted
20503 * operator, and the original operand has been relocated.
20505 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20506 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20508 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20509 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20511 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20514 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20515 const regnode_offset operand, const U32 depth)
20520 const int offset = regarglen[(U8)op];
20521 const int size = NODE_STEP_REGNODE + offset;
20522 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20524 PERL_ARGS_ASSERT_REGINSERT;
20525 PERL_UNUSED_CONTEXT;
20526 PERL_UNUSED_ARG(depth);
20527 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20528 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20529 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20530 studying. If this is wrong then we need to adjust RExC_recurse
20531 below like we do with RExC_open_parens/RExC_close_parens. */
20532 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20533 src = REGNODE_p(RExC_emit);
20535 dst = REGNODE_p(RExC_emit);
20537 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20538 * and [perl #133871] shows this can lead to problems, so skip this
20539 * realignment of parens until a later pass when they are reliable */
20540 if (! IN_PARENS_PASS && RExC_open_parens) {
20542 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20543 /* remember that RExC_npar is rex->nparens + 1,
20544 * iow it is 1 more than the number of parens seen in
20545 * the pattern so far. */
20546 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20547 /* note, RExC_open_parens[0] is the start of the
20548 * regex, it can't move. RExC_close_parens[0] is the end
20549 * of the regex, it *can* move. */
20550 if ( paren && RExC_open_parens[paren] >= operand ) {
20551 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20552 RExC_open_parens[paren] += size;
20554 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20556 if ( RExC_close_parens[paren] >= operand ) {
20557 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20558 RExC_close_parens[paren] += size;
20560 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20565 RExC_end_op += size;
20567 while (src > REGNODE_p(operand)) {
20568 StructCopy(--src, --dst, regnode);
20569 #ifdef RE_TRACK_PATTERN_OFFSETS
20570 if (RExC_offsets) { /* MJD 20010112 */
20572 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20576 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20577 ? "Overwriting end of array!\n" : "OK",
20578 (UV)REGNODE_OFFSET(src),
20579 (UV)REGNODE_OFFSET(dst),
20580 (UV)RExC_offsets[0]));
20581 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20582 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20587 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20588 #ifdef RE_TRACK_PATTERN_OFFSETS
20589 if (RExC_offsets) { /* MJD */
20591 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20595 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20596 ? "Overwriting end of array!\n" : "OK",
20597 (UV)REGNODE_OFFSET(place),
20598 (UV)(RExC_parse - RExC_start),
20599 (UV)RExC_offsets[0]));
20600 Set_Node_Offset(place, RExC_parse);
20601 Set_Node_Length(place, 1);
20604 src = NEXTOPER(place);
20606 FILL_NODE(operand, op);
20608 /* Zero out any arguments in the new node */
20609 Zero(src, offset, regnode);
20613 - regtail - set the next-pointer at the end of a node chain of p to val. If
20614 that value won't fit in the space available, instead returns FALSE.
20615 (Except asserts if we can't fit in the largest space the regex
20616 engine is designed for.)
20617 - SEE ALSO: regtail_study
20620 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20621 const regnode_offset p,
20622 const regnode_offset val,
20625 regnode_offset scan;
20626 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20628 PERL_ARGS_ASSERT_REGTAIL;
20630 PERL_UNUSED_ARG(depth);
20633 /* Find last node. */
20634 scan = (regnode_offset) p;
20636 regnode * const temp = regnext(REGNODE_p(scan));
20638 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20639 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20640 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
20641 SvPV_nolen_const(RExC_mysv), scan,
20642 (temp == NULL ? "->" : ""),
20643 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20648 scan = REGNODE_OFFSET(temp);
20651 assert(val >= scan);
20652 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20653 assert((UV) (val - scan) <= U32_MAX);
20654 ARG_SET(REGNODE_p(scan), val - scan);
20657 if (val - scan > U16_MAX) {
20658 /* Populate this with something that won't loop and will likely
20659 * lead to a crash if the caller ignores the failure return, and
20660 * execution continues */
20661 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20664 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20672 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20673 - Look for optimizable sequences at the same time.
20674 - currently only looks for EXACT chains.
20676 This is experimental code. The idea is to use this routine to perform
20677 in place optimizations on branches and groups as they are constructed,
20678 with the long term intention of removing optimization from study_chunk so
20679 that it is purely analytical.
20681 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20682 to control which is which.
20684 This used to return a value that was ignored. It was a problem that it is
20685 #ifdef'd to be another function that didn't return a value. khw has changed it
20686 so both currently return a pass/fail return.
20689 /* TODO: All four parms should be const */
20692 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20693 const regnode_offset val, U32 depth)
20695 regnode_offset scan;
20697 #ifdef EXPERIMENTAL_INPLACESCAN
20700 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20702 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20705 /* Find last node. */
20709 regnode * const temp = regnext(REGNODE_p(scan));
20710 #ifdef EXPERIMENTAL_INPLACESCAN
20711 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20712 bool unfolded_multi_char; /* Unexamined in this routine */
20713 if (join_exact(pRExC_state, scan, &min,
20714 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20715 return TRUE; /* Was return EXACT */
20719 switch (OP(REGNODE_p(scan))) {
20726 case EXACTFU_S_EDGE:
20727 case EXACTFAA_NO_TRIE:
20734 if( exact == PSEUDO )
20735 exact= OP(REGNODE_p(scan));
20736 else if ( exact != OP(REGNODE_p(scan)) )
20745 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20746 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20747 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
20748 SvPV_nolen_const(RExC_mysv),
20750 PL_reg_name[exact]);
20754 scan = REGNODE_OFFSET(temp);
20757 DEBUG_PARSE_MSG("");
20758 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20759 Perl_re_printf( aTHX_
20760 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20761 SvPV_nolen_const(RExC_mysv),
20766 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20767 assert((UV) (val - scan) <= U32_MAX);
20768 ARG_SET(REGNODE_p(scan), val - scan);
20771 if (val - scan > U16_MAX) {
20772 /* Populate this with something that won't loop and will likely
20773 * lead to a crash if the caller ignores the failure return, and
20774 * execution continues */
20775 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20778 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20781 return TRUE; /* Was 'return exact' */
20786 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20788 /* Returns an inversion list of all the code points matched by the
20789 * ANYOFM/NANYOFM node 'n' */
20791 SV * cp_list = _new_invlist(-1);
20792 const U8 lowest = (U8) ARG(n);
20795 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20797 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20799 /* Starting with the lowest code point, any code point that ANDed with the
20800 * mask yields the lowest code point is in the set */
20801 for (i = lowest; i <= 0xFF; i++) {
20802 if ((i & FLAGS(n)) == ARG(n)) {
20803 cp_list = add_cp_to_invlist(cp_list, i);
20806 /* We know how many code points (a power of two) that are in the
20807 * set. No use looking once we've got that number */
20808 if (count >= needed) break;
20812 if (OP(n) == NANYOFM) {
20813 _invlist_invert(cp_list);
20819 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20824 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20829 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20831 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20832 if (flags & (1<<bit)) {
20833 if (!set++ && lead)
20834 Perl_re_printf( aTHX_ "%s", lead);
20835 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20840 Perl_re_printf( aTHX_ "\n");
20842 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20847 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20853 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20855 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20856 if (flags & (1<<bit)) {
20857 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20860 if (!set++ && lead)
20861 Perl_re_printf( aTHX_ "%s", lead);
20862 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20865 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20866 if (!set++ && lead) {
20867 Perl_re_printf( aTHX_ "%s", lead);
20870 case REGEX_UNICODE_CHARSET:
20871 Perl_re_printf( aTHX_ "UNICODE");
20873 case REGEX_LOCALE_CHARSET:
20874 Perl_re_printf( aTHX_ "LOCALE");
20876 case REGEX_ASCII_RESTRICTED_CHARSET:
20877 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20879 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20880 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20883 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20889 Perl_re_printf( aTHX_ "\n");
20891 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20897 Perl_regdump(pTHX_ const regexp *r)
20901 SV * const sv = sv_newmortal();
20902 SV *dsv= sv_newmortal();
20903 RXi_GET_DECL(r, ri);
20904 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20906 PERL_ARGS_ASSERT_REGDUMP;
20908 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20910 /* Header fields of interest. */
20911 for (i = 0; i < 2; i++) {
20912 if (r->substrs->data[i].substr) {
20913 RE_PV_QUOTED_DECL(s, 0, dsv,
20914 SvPVX_const(r->substrs->data[i].substr),
20915 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20916 PL_dump_re_max_len);
20917 Perl_re_printf( aTHX_
20918 "%s %s%s at %" IVdf "..%" UVuf " ",
20919 i ? "floating" : "anchored",
20921 RE_SV_TAIL(r->substrs->data[i].substr),
20922 (IV)r->substrs->data[i].min_offset,
20923 (UV)r->substrs->data[i].max_offset);
20925 else if (r->substrs->data[i].utf8_substr) {
20926 RE_PV_QUOTED_DECL(s, 1, dsv,
20927 SvPVX_const(r->substrs->data[i].utf8_substr),
20928 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20930 Perl_re_printf( aTHX_
20931 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20932 i ? "floating" : "anchored",
20934 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20935 (IV)r->substrs->data[i].min_offset,
20936 (UV)r->substrs->data[i].max_offset);
20940 if (r->check_substr || r->check_utf8)
20941 Perl_re_printf( aTHX_
20943 ( r->check_substr == r->substrs->data[1].substr
20944 && r->check_utf8 == r->substrs->data[1].utf8_substr
20945 ? "(checking floating" : "(checking anchored"));
20946 if (r->intflags & PREGf_NOSCAN)
20947 Perl_re_printf( aTHX_ " noscan");
20948 if (r->extflags & RXf_CHECK_ALL)
20949 Perl_re_printf( aTHX_ " isall");
20950 if (r->check_substr || r->check_utf8)
20951 Perl_re_printf( aTHX_ ") ");
20953 if (ri->regstclass) {
20954 regprop(r, sv, ri->regstclass, NULL, NULL);
20955 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20957 if (r->intflags & PREGf_ANCH) {
20958 Perl_re_printf( aTHX_ "anchored");
20959 if (r->intflags & PREGf_ANCH_MBOL)
20960 Perl_re_printf( aTHX_ "(MBOL)");
20961 if (r->intflags & PREGf_ANCH_SBOL)
20962 Perl_re_printf( aTHX_ "(SBOL)");
20963 if (r->intflags & PREGf_ANCH_GPOS)
20964 Perl_re_printf( aTHX_ "(GPOS)");
20965 Perl_re_printf( aTHX_ " ");
20967 if (r->intflags & PREGf_GPOS_SEEN)
20968 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
20969 if (r->intflags & PREGf_SKIP)
20970 Perl_re_printf( aTHX_ "plus ");
20971 if (r->intflags & PREGf_IMPLICIT)
20972 Perl_re_printf( aTHX_ "implicit ");
20973 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
20974 if (r->extflags & RXf_EVAL_SEEN)
20975 Perl_re_printf( aTHX_ "with eval ");
20976 Perl_re_printf( aTHX_ "\n");
20978 regdump_extflags("r->extflags: ", r->extflags);
20979 regdump_intflags("r->intflags: ", r->intflags);
20982 PERL_ARGS_ASSERT_REGDUMP;
20983 PERL_UNUSED_CONTEXT;
20984 PERL_UNUSED_ARG(r);
20985 #endif /* DEBUGGING */
20988 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20991 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
20992 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
20993 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
20994 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
20995 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
20996 || _CC_VERTSPACE != 15
20997 # error Need to adjust order of anyofs[]
20999 static const char * const anyofs[] = {
21036 - regprop - printable representation of opcode, with run time support
21040 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21045 RXi_GET_DECL(prog, progi);
21046 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21048 PERL_ARGS_ASSERT_REGPROP;
21052 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
21053 if (pRExC_state) { /* This gives more info, if we have it */
21054 FAIL3("panic: corrupted regexp opcode %d > %d",
21055 (int)OP(o), (int)REGNODE_MAX);
21058 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21059 (int)OP(o), (int)REGNODE_MAX);
21062 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21064 k = PL_regkind[OP(o)];
21067 sv_catpvs(sv, " ");
21068 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21069 * is a crude hack but it may be the best for now since
21070 * we have no flag "this EXACTish node was UTF-8"
21072 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21073 PL_colors[0], PL_colors[1],
21074 PERL_PV_ESCAPE_UNI_DETECT |
21075 PERL_PV_ESCAPE_NONASCII |
21076 PERL_PV_PRETTY_ELLIPSES |
21077 PERL_PV_PRETTY_LTGT |
21078 PERL_PV_PRETTY_NOCLEAR
21080 } else if (k == TRIE) {
21081 /* print the details of the trie in dumpuntil instead, as
21082 * progi->data isn't available here */
21083 const char op = OP(o);
21084 const U32 n = ARG(o);
21085 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21086 (reg_ac_data *)progi->data->data[n] :
21088 const reg_trie_data * const trie
21089 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21091 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21092 DEBUG_TRIE_COMPILE_r({
21094 sv_catpvs(sv, "(JUMP)");
21095 Perl_sv_catpvf(aTHX_ sv,
21096 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21097 (UV)trie->startstate,
21098 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21099 (UV)trie->wordcount,
21102 (UV)TRIE_CHARCOUNT(trie),
21103 (UV)trie->uniquecharcount
21106 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21107 sv_catpvs(sv, "[");
21108 (void) put_charclass_bitmap_innards(sv,
21109 ((IS_ANYOF_TRIE(op))
21111 : TRIE_BITMAP(trie)),
21118 sv_catpvs(sv, "]");
21120 } else if (k == CURLY) {
21121 U32 lo = ARG1(o), hi = ARG2(o);
21122 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21123 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21124 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21125 if (hi == REG_INFTY)
21126 sv_catpvs(sv, "INFTY");
21128 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21129 sv_catpvs(sv, "}");
21131 else if (k == WHILEM && o->flags) /* Ordinal/of */
21132 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21133 else if (k == REF || k == OPEN || k == CLOSE
21134 || k == GROUPP || OP(o)==ACCEPT)
21136 AV *name_list= NULL;
21137 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21138 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21139 if ( RXp_PAREN_NAMES(prog) ) {
21140 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21141 } else if ( pRExC_state ) {
21142 name_list= RExC_paren_name_list;
21145 if ( k != REF || (OP(o) < REFN)) {
21146 SV **name= av_fetch(name_list, parno, 0 );
21148 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21151 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21152 I32 *nums=(I32*)SvPVX(sv_dat);
21153 SV **name= av_fetch(name_list, nums[0], 0 );
21156 for ( n=0; n<SvIVX(sv_dat); n++ ) {
21157 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21158 (n ? "," : ""), (IV)nums[n]);
21160 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21164 if ( k == REF && reginfo) {
21165 U32 n = ARG(o); /* which paren pair */
21166 I32 ln = prog->offs[n].start;
21167 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21168 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21169 else if (ln == prog->offs[n].end)
21170 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21172 const char *s = reginfo->strbeg + ln;
21173 Perl_sv_catpvf(aTHX_ sv, ": ");
21174 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21175 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21178 } else if (k == GOSUB) {
21179 AV *name_list= NULL;
21180 if ( RXp_PAREN_NAMES(prog) ) {
21181 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21182 } else if ( pRExC_state ) {
21183 name_list= RExC_paren_name_list;
21186 /* Paren and offset */
21187 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21188 (int)((o + (int)ARG2L(o)) - progi->program) );
21190 SV **name= av_fetch(name_list, ARG(o), 0 );
21192 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21195 else if (k == LOGICAL)
21196 /* 2: embedded, otherwise 1 */
21197 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21198 else if (k == ANYOF || k == ANYOFR) {
21202 bool do_sep = FALSE; /* Do we need to separate various components of
21204 /* Set if there is still an unresolved user-defined property */
21205 SV *unresolved = NULL;
21207 /* Things that are ignored except when the runtime locale is UTF-8 */
21208 SV *only_utf8_locale_invlist = NULL;
21210 /* Code points that don't fit in the bitmap */
21211 SV *nonbitmap_invlist = NULL;
21213 /* And things that aren't in the bitmap, but are small enough to be */
21214 SV* bitmap_range_not_in_bitmap = NULL;
21218 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21224 flags = ANYOF_FLAGS(o);
21225 bitmap = ANYOF_BITMAP(o);
21229 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21230 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21231 sv_catpvs(sv, "{utf8-locale-reqd}");
21233 if (flags & ANYOFL_FOLD) {
21234 sv_catpvs(sv, "{i}");
21238 inverted = flags & ANYOF_INVERT;
21240 /* If there is stuff outside the bitmap, get it */
21241 if (arg != ANYOF_ONLY_HAS_BITMAP) {
21242 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21243 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21245 ANYOFRbase(o) + ANYOFRdelta(o));
21248 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21249 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21251 &only_utf8_locale_invlist,
21252 &nonbitmap_invlist);
21254 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21256 &only_utf8_locale_invlist,
21257 &nonbitmap_invlist);
21261 /* The non-bitmap data may contain stuff that could fit in the
21262 * bitmap. This could come from a user-defined property being
21263 * finally resolved when this call was done; or much more likely
21264 * because there are matches that require UTF-8 to be valid, and so
21265 * aren't in the bitmap (or ANYOFR). This is teased apart later */
21266 _invlist_intersection(nonbitmap_invlist,
21268 &bitmap_range_not_in_bitmap);
21269 /* Leave just the things that don't fit into the bitmap */
21270 _invlist_subtract(nonbitmap_invlist,
21272 &nonbitmap_invlist);
21275 /* Obey this flag to add all above-the-bitmap code points */
21276 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21277 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21278 NUM_ANYOF_CODE_POINTS,
21282 /* Ready to start outputting. First, the initial left bracket */
21283 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21285 /* ANYOFH by definition doesn't have anything that will fit inside the
21286 * bitmap; ANYOFR may or may not. */
21287 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21288 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21289 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21291 /* Then all the things that could fit in the bitmap */
21292 do_sep = put_charclass_bitmap_innards(sv,
21294 bitmap_range_not_in_bitmap,
21295 only_utf8_locale_invlist,
21299 /* Can't try inverting for a
21300 * better display if there
21301 * are things that haven't
21304 || inRANGE(OP(o), ANYOFR, ANYOFRb));
21305 SvREFCNT_dec(bitmap_range_not_in_bitmap);
21307 /* If there are user-defined properties which haven't been defined
21308 * yet, output them. If the result is not to be inverted, it is
21309 * clearest to output them in a separate [] from the bitmap range
21310 * stuff. If the result is to be complemented, we have to show
21311 * everything in one [], as the inversion applies to the whole
21312 * thing. Use {braces} to separate them from anything in the
21313 * bitmap and anything above the bitmap. */
21316 if (! do_sep) { /* If didn't output anything in the bitmap
21318 sv_catpvs(sv, "^");
21320 sv_catpvs(sv, "{");
21323 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21326 sv_catsv(sv, unresolved);
21328 sv_catpvs(sv, "}");
21330 do_sep = ! inverted;
21334 /* And, finally, add the above-the-bitmap stuff */
21335 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21338 /* See if truncation size is overridden */
21339 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21340 ? PL_dump_re_max_len
21343 /* This is output in a separate [] */
21345 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21348 /* And, for easy of understanding, it is shown in the
21349 * uncomplemented form if possible. The one exception being if
21350 * there are unresolved items, where the inversion has to be
21351 * delayed until runtime */
21352 if (inverted && ! unresolved) {
21353 _invlist_invert(nonbitmap_invlist);
21354 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21357 contents = invlist_contents(nonbitmap_invlist,
21358 FALSE /* output suitable for catsv */
21361 /* If the output is shorter than the permissible maximum, just do it. */
21362 if (SvCUR(contents) <= dump_len) {
21363 sv_catsv(sv, contents);
21366 const char * contents_string = SvPVX(contents);
21367 STRLEN i = dump_len;
21369 /* Otherwise, start at the permissible max and work back to the
21370 * first break possibility */
21371 while (i > 0 && contents_string[i] != ' ') {
21374 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21375 find a legal break */
21379 sv_catpvn(sv, contents_string, i);
21380 sv_catpvs(sv, "...");
21383 SvREFCNT_dec_NN(contents);
21384 SvREFCNT_dec_NN(nonbitmap_invlist);
21387 /* And finally the matching, closing ']' */
21388 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21390 if (OP(o) == ANYOFHs) {
21391 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21393 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21394 U8 lowest = (OP(o) != ANYOFHr)
21396 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21397 U8 highest = (OP(o) == ANYOFHr)
21398 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21399 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21402 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21403 if (lowest != highest) {
21404 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21406 Perl_sv_catpvf(aTHX_ sv, ")");
21409 SvREFCNT_dec(unresolved);
21411 else if (k == ANYOFM) {
21412 SV * cp_list = get_ANYOFM_contents(o);
21414 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21415 if (OP(o) == NANYOFM) {
21416 _invlist_invert(cp_list);
21419 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21420 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21422 SvREFCNT_dec(cp_list);
21424 else if (k == POSIXD || k == NPOSIXD) {
21425 U8 index = FLAGS(o) * 2;
21426 if (index < C_ARRAY_LENGTH(anyofs)) {
21427 if (*anyofs[index] != '[') {
21428 sv_catpvs(sv, "[");
21430 sv_catpv(sv, anyofs[index]);
21431 if (*anyofs[index] != '[') {
21432 sv_catpvs(sv, "]");
21436 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21439 else if (k == BOUND || k == NBOUND) {
21440 /* Must be synced with order of 'bound_type' in regcomp.h */
21441 const char * const bounds[] = {
21442 "", /* Traditional */
21448 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21449 sv_catpv(sv, bounds[FLAGS(o)]);
21451 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21452 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21454 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21456 Perl_sv_catpvf(aTHX_ sv, "]");
21458 else if (OP(o) == SBOL)
21459 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21461 /* add on the verb argument if there is one */
21462 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21464 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21465 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21467 sv_catpvs(sv, ":NULL");
21470 PERL_UNUSED_CONTEXT;
21471 PERL_UNUSED_ARG(sv);
21472 PERL_UNUSED_ARG(o);
21473 PERL_UNUSED_ARG(prog);
21474 PERL_UNUSED_ARG(reginfo);
21475 PERL_UNUSED_ARG(pRExC_state);
21476 #endif /* DEBUGGING */
21482 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21483 { /* Assume that RE_INTUIT is set */
21484 /* Returns an SV containing a string that must appear in the target for it
21487 struct regexp *const prog = ReANY(r);
21488 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21490 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21491 PERL_UNUSED_CONTEXT;
21495 if (prog->maxlen > 0) {
21496 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21497 ? prog->check_utf8 : prog->check_substr);
21499 if (!PL_colorset) reginitcolors();
21500 Perl_re_printf( aTHX_
21501 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21503 RX_UTF8(r) ? "utf8 " : "",
21504 PL_colors[5], PL_colors[0],
21507 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21511 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21512 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21518 handles refcounting and freeing the perl core regexp structure. When
21519 it is necessary to actually free the structure the first thing it
21520 does is call the 'free' method of the regexp_engine associated to
21521 the regexp, allowing the handling of the void *pprivate; member
21522 first. (This routine is not overridable by extensions, which is why
21523 the extensions free is called first.)
21525 See regdupe and regdupe_internal if you change anything here.
21527 #ifndef PERL_IN_XSUB_RE
21529 Perl_pregfree(pTHX_ REGEXP *r)
21535 Perl_pregfree2(pTHX_ REGEXP *rx)
21537 struct regexp *const r = ReANY(rx);
21538 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21540 PERL_ARGS_ASSERT_PREGFREE2;
21545 if (r->mother_re) {
21546 ReREFCNT_dec(r->mother_re);
21548 CALLREGFREE_PVT(rx); /* free the private data */
21549 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21553 for (i = 0; i < 2; i++) {
21554 SvREFCNT_dec(r->substrs->data[i].substr);
21555 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21557 Safefree(r->substrs);
21559 RX_MATCH_COPY_FREE(rx);
21560 #ifdef PERL_ANY_COW
21561 SvREFCNT_dec(r->saved_copy);
21564 SvREFCNT_dec(r->qr_anoncv);
21565 if (r->recurse_locinput)
21566 Safefree(r->recurse_locinput);
21572 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21573 except that dsv will be created if NULL.
21575 This function is used in two main ways. First to implement
21576 $r = qr/....; $s = $$r;
21578 Secondly, it is used as a hacky workaround to the structural issue of
21580 being stored in the regexp structure which is in turn stored in
21581 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21582 could be PL_curpm in multiple contexts, and could require multiple
21583 result sets being associated with the pattern simultaneously, such
21584 as when doing a recursive match with (??{$qr})
21586 The solution is to make a lightweight copy of the regexp structure
21587 when a qr// is returned from the code executed by (??{$qr}) this
21588 lightweight copy doesn't actually own any of its data except for
21589 the starp/end and the actual regexp structure itself.
21595 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21597 struct regexp *drx;
21598 struct regexp *const srx = ReANY(ssv);
21599 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21601 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21604 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21606 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21608 /* our only valid caller, sv_setsv_flags(), should have done
21609 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21610 assert(!SvOOK(dsv));
21611 assert(!SvIsCOW(dsv));
21612 assert(!SvROK(dsv));
21614 if (SvPVX_const(dsv)) {
21616 Safefree(SvPVX(dsv));
21621 SvOK_off((SV *)dsv);
21624 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21625 * the LV's xpvlenu_rx will point to a regexp body, which
21626 * we allocate here */
21627 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21628 assert(!SvPVX(dsv));
21629 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21630 temp->sv_any = NULL;
21631 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21632 SvREFCNT_dec_NN(temp);
21633 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21634 ing below will not set it. */
21635 SvCUR_set(dsv, SvCUR(ssv));
21638 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21639 sv_force_normal(sv) is called. */
21643 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21644 SvPV_set(dsv, RX_WRAPPED(ssv));
21645 /* We share the same string buffer as the original regexp, on which we
21646 hold a reference count, incremented when mother_re is set below.
21647 The string pointer is copied here, being part of the regexp struct.
21649 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21650 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21654 const I32 npar = srx->nparens+1;
21655 Newx(drx->offs, npar, regexp_paren_pair);
21656 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21658 if (srx->substrs) {
21660 Newx(drx->substrs, 1, struct reg_substr_data);
21661 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21663 for (i = 0; i < 2; i++) {
21664 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21665 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21668 /* check_substr and check_utf8, if non-NULL, point to either their
21669 anchored or float namesakes, and don't hold a second reference. */
21671 RX_MATCH_COPIED_off(dsv);
21672 #ifdef PERL_ANY_COW
21673 drx->saved_copy = NULL;
21675 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21676 SvREFCNT_inc_void(drx->qr_anoncv);
21677 if (srx->recurse_locinput)
21678 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21685 /* regfree_internal()
21687 Free the private data in a regexp. This is overloadable by
21688 extensions. Perl takes care of the regexp structure in pregfree(),
21689 this covers the *pprivate pointer which technically perl doesn't
21690 know about, however of course we have to handle the
21691 regexp_internal structure when no extension is in use.
21693 Note this is called before freeing anything in the regexp
21698 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21700 struct regexp *const r = ReANY(rx);
21701 RXi_GET_DECL(r, ri);
21702 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21704 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21714 SV *dsv= sv_newmortal();
21715 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21716 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21717 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21718 PL_colors[4], PL_colors[5], s);
21722 #ifdef RE_TRACK_PATTERN_OFFSETS
21724 Safefree(ri->u.offsets); /* 20010421 MJD */
21726 if (ri->code_blocks)
21727 S_free_codeblocks(aTHX_ ri->code_blocks);
21730 int n = ri->data->count;
21733 /* If you add a ->what type here, update the comment in regcomp.h */
21734 switch (ri->data->what[n]) {
21740 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21743 Safefree(ri->data->data[n]);
21749 { /* Aho Corasick add-on structure for a trie node.
21750 Used in stclass optimization only */
21752 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21753 #ifdef USE_ITHREADS
21757 refcount = --aho->refcount;
21760 PerlMemShared_free(aho->states);
21761 PerlMemShared_free(aho->fail);
21762 /* do this last!!!! */
21763 PerlMemShared_free(ri->data->data[n]);
21764 /* we should only ever get called once, so
21765 * assert as much, and also guard the free
21766 * which /might/ happen twice. At the least
21767 * it will make code anlyzers happy and it
21768 * doesn't cost much. - Yves */
21769 assert(ri->regstclass);
21770 if (ri->regstclass) {
21771 PerlMemShared_free(ri->regstclass);
21772 ri->regstclass = 0;
21779 /* trie structure. */
21781 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21782 #ifdef USE_ITHREADS
21786 refcount = --trie->refcount;
21789 PerlMemShared_free(trie->charmap);
21790 PerlMemShared_free(trie->states);
21791 PerlMemShared_free(trie->trans);
21793 PerlMemShared_free(trie->bitmap);
21795 PerlMemShared_free(trie->jump);
21796 PerlMemShared_free(trie->wordinfo);
21797 /* do this last!!!! */
21798 PerlMemShared_free(ri->data->data[n]);
21803 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21804 ri->data->what[n]);
21807 Safefree(ri->data->what);
21808 Safefree(ri->data);
21814 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21815 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21816 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21819 re_dup_guts - duplicate a regexp.
21821 This routine is expected to clone a given regexp structure. It is only
21822 compiled under USE_ITHREADS.
21824 After all of the core data stored in struct regexp is duplicated
21825 the regexp_engine.dupe method is used to copy any private data
21826 stored in the *pprivate pointer. This allows extensions to handle
21827 any duplication it needs to do.
21829 See pregfree() and regfree_internal() if you change anything here.
21831 #if defined(USE_ITHREADS)
21832 #ifndef PERL_IN_XSUB_RE
21834 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21838 const struct regexp *r = ReANY(sstr);
21839 struct regexp *ret = ReANY(dstr);
21841 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21843 npar = r->nparens+1;
21844 Newx(ret->offs, npar, regexp_paren_pair);
21845 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21847 if (ret->substrs) {
21848 /* Do it this way to avoid reading from *r after the StructCopy().
21849 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21850 cache, it doesn't matter. */
21852 const bool anchored = r->check_substr
21853 ? r->check_substr == r->substrs->data[0].substr
21854 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21855 Newx(ret->substrs, 1, struct reg_substr_data);
21856 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21858 for (i = 0; i < 2; i++) {
21859 ret->substrs->data[i].substr =
21860 sv_dup_inc(ret->substrs->data[i].substr, param);
21861 ret->substrs->data[i].utf8_substr =
21862 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21865 /* check_substr and check_utf8, if non-NULL, point to either their
21866 anchored or float namesakes, and don't hold a second reference. */
21868 if (ret->check_substr) {
21870 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21872 ret->check_substr = ret->substrs->data[0].substr;
21873 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21875 assert(r->check_substr == r->substrs->data[1].substr);
21876 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21878 ret->check_substr = ret->substrs->data[1].substr;
21879 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21881 } else if (ret->check_utf8) {
21883 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21885 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21890 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21891 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21892 if (r->recurse_locinput)
21893 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21896 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21898 if (RX_MATCH_COPIED(dstr))
21899 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21901 ret->subbeg = NULL;
21902 #ifdef PERL_ANY_COW
21903 ret->saved_copy = NULL;
21906 /* Whether mother_re be set or no, we need to copy the string. We
21907 cannot refrain from copying it when the storage points directly to
21908 our mother regexp, because that's
21909 1: a buffer in a different thread
21910 2: something we no longer hold a reference on
21911 so we need to copy it locally. */
21912 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21913 /* set malloced length to a non-zero value so it will be freed
21914 * (otherwise in combination with SVf_FAKE it looks like an alien
21915 * buffer). It doesn't have to be the actual malloced size, since it
21916 * should never be grown */
21917 SvLEN_set(dstr, SvCUR(sstr)+1);
21918 ret->mother_re = NULL;
21920 #endif /* PERL_IN_XSUB_RE */
21925 This is the internal complement to regdupe() which is used to copy
21926 the structure pointed to by the *pprivate pointer in the regexp.
21927 This is the core version of the extension overridable cloning hook.
21928 The regexp structure being duplicated will be copied by perl prior
21929 to this and will be provided as the regexp *r argument, however
21930 with the /old/ structures pprivate pointer value. Thus this routine
21931 may override any copying normally done by perl.
21933 It returns a pointer to the new regexp_internal structure.
21937 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21940 struct regexp *const r = ReANY(rx);
21941 regexp_internal *reti;
21943 RXi_GET_DECL(r, ri);
21945 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21949 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21950 char, regexp_internal);
21951 Copy(ri->program, reti->program, len+1, regnode);
21954 if (ri->code_blocks) {
21956 Newx(reti->code_blocks, 1, struct reg_code_blocks);
21957 Newx(reti->code_blocks->cb, ri->code_blocks->count,
21958 struct reg_code_block);
21959 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21960 ri->code_blocks->count, struct reg_code_block);
21961 for (n = 0; n < ri->code_blocks->count; n++)
21962 reti->code_blocks->cb[n].src_regex = (REGEXP*)
21963 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21964 reti->code_blocks->count = ri->code_blocks->count;
21965 reti->code_blocks->refcnt = 1;
21968 reti->code_blocks = NULL;
21970 reti->regstclass = NULL;
21973 struct reg_data *d;
21974 const int count = ri->data->count;
21977 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21978 char, struct reg_data);
21979 Newx(d->what, count, U8);
21982 for (i = 0; i < count; i++) {
21983 d->what[i] = ri->data->what[i];
21984 switch (d->what[i]) {
21985 /* see also regcomp.h and regfree_internal() */
21986 case 'a': /* actually an AV, but the dup function is identical.
21987 values seem to be "plain sv's" generally. */
21988 case 'r': /* a compiled regex (but still just another SV) */
21989 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21990 this use case should go away, the code could have used
21991 'a' instead - see S_set_ANYOF_arg() for array contents. */
21992 case 'S': /* actually an SV, but the dup function is identical. */
21993 case 'u': /* actually an HV, but the dup function is identical.
21994 values are "plain sv's" */
21995 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21998 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21999 * patterns which could start with several different things. Pre-TRIE
22000 * this was more important than it is now, however this still helps
22001 * in some places, for instance /x?a+/ might produce a SSC equivalent
22002 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22005 /* This is cheating. */
22006 Newx(d->data[i], 1, regnode_ssc);
22007 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22008 reti->regstclass = (regnode*)d->data[i];
22011 /* AHO-CORASICK fail table */
22012 /* Trie stclasses are readonly and can thus be shared
22013 * without duplication. We free the stclass in pregfree
22014 * when the corresponding reg_ac_data struct is freed.
22016 reti->regstclass= ri->regstclass;
22019 /* TRIE transition table */
22021 ((reg_trie_data*)ri->data->data[i])->refcount++;
22024 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22025 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22026 is not from another regexp */
22027 d->data[i] = ri->data->data[i];
22030 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22031 ri->data->what[i]);
22040 reti->name_list_idx = ri->name_list_idx;
22042 #ifdef RE_TRACK_PATTERN_OFFSETS
22043 if (ri->u.offsets) {
22044 Newx(reti->u.offsets, 2*len+1, U32);
22045 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22048 SetProgLen(reti, len);
22051 return (void*)reti;
22054 #endif /* USE_ITHREADS */
22056 #ifndef PERL_IN_XSUB_RE
22059 - regnext - dig the "next" pointer out of a node
22062 Perl_regnext(pTHX_ regnode *p)
22069 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
22070 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22071 (int)OP(p), (int)REGNODE_MAX);
22074 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22084 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22087 STRLEN len = strlen(pat);
22090 const char *message;
22092 PERL_ARGS_ASSERT_RE_CROAK;
22096 Copy(pat, buf, len , char);
22098 buf[len + 1] = '\0';
22099 va_start(args, pat);
22100 msv = vmess(buf, &args);
22102 message = SvPV_const(msv, len);
22105 Copy(message, buf, len , char);
22106 /* len-1 to avoid \n */
22107 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22110 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22112 #ifndef PERL_IN_XSUB_RE
22114 Perl_save_re_context(pTHX)
22119 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22122 const REGEXP * const rx = PM_GETRE(PL_curpm);
22124 nparens = RX_NPARENS(rx);
22127 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22128 * that PL_curpm will be null, but that utf8.pm and the modules it
22129 * loads will only use $1..$3.
22130 * The t/porting/re_context.t test file checks this assumption.
22135 for (i = 1; i <= nparens; i++) {
22136 char digits[TYPE_CHARS(long)];
22137 const STRLEN len = my_snprintf(digits, sizeof(digits),
22139 GV *const *const gvp
22140 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22143 GV * const gv = *gvp;
22144 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22154 S_put_code_point(pTHX_ SV *sv, UV c)
22156 PERL_ARGS_ASSERT_PUT_CODE_POINT;
22159 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22161 else if (isPRINT(c)) {
22162 const char string = (char) c;
22164 /* We use {phrase} as metanotation in the class, so also escape literal
22166 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22167 sv_catpvs(sv, "\\");
22168 sv_catpvn(sv, &string, 1);
22170 else if (isMNEMONIC_CNTRL(c)) {
22171 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22174 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22178 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22181 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22183 /* Appends to 'sv' a displayable version of the range of code points from
22184 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
22185 * that have them, when they occur at the beginning or end of the range.
22186 * It uses hex to output the remaining code points, unless 'allow_literals'
22187 * is true, in which case the printable ASCII ones are output as-is (though
22188 * some of these will be escaped by put_code_point()).
22190 * NOTE: This is designed only for printing ranges of code points that fit
22191 * inside an ANYOF bitmap. Higher code points are simply suppressed
22194 const unsigned int min_range_count = 3;
22196 assert(start <= end);
22198 PERL_ARGS_ASSERT_PUT_RANGE;
22200 while (start <= end) {
22202 const char * format;
22204 if (end - start < min_range_count) {
22206 /* Output chars individually when they occur in short ranges */
22207 for (; start <= end; start++) {
22208 put_code_point(sv, start);
22213 /* If permitted by the input options, and there is a possibility that
22214 * this range contains a printable literal, look to see if there is
22216 if (allow_literals && start <= MAX_PRINT_A) {
22218 /* If the character at the beginning of the range isn't an ASCII
22219 * printable, effectively split the range into two parts:
22220 * 1) the portion before the first such printable,
22222 * and output them separately. */
22223 if (! isPRINT_A(start)) {
22224 UV temp_end = start + 1;
22226 /* There is no point looking beyond the final possible
22227 * printable, in MAX_PRINT_A */
22228 UV max = MIN(end, MAX_PRINT_A);
22230 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22234 /* Here, temp_end points to one beyond the first printable if
22235 * found, or to one beyond 'max' if not. If none found, make
22236 * sure that we use the entire range */
22237 if (temp_end > MAX_PRINT_A) {
22238 temp_end = end + 1;
22241 /* Output the first part of the split range: the part that
22242 * doesn't have printables, with the parameter set to not look
22243 * for literals (otherwise we would infinitely recurse) */
22244 put_range(sv, start, temp_end - 1, FALSE);
22246 /* The 2nd part of the range (if any) starts here. */
22249 /* We do a continue, instead of dropping down, because even if
22250 * the 2nd part is non-empty, it could be so short that we want
22251 * to output it as individual characters, as tested for at the
22252 * top of this loop. */
22256 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
22257 * output a sub-range of just the digits or letters, then process
22258 * the remaining portion as usual. */
22259 if (isALPHANUMERIC_A(start)) {
22260 UV mask = (isDIGIT_A(start))
22265 UV temp_end = start + 1;
22267 /* Find the end of the sub-range that includes just the
22268 * characters in the same class as the first character in it */
22269 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22274 /* For short ranges, don't duplicate the code above to output
22275 * them; just call recursively */
22276 if (temp_end - start < min_range_count) {
22277 put_range(sv, start, temp_end, FALSE);
22279 else { /* Output as a range */
22280 put_code_point(sv, start);
22281 sv_catpvs(sv, "-");
22282 put_code_point(sv, temp_end);
22284 start = temp_end + 1;
22288 /* We output any other printables as individual characters */
22289 if (isPUNCT_A(start) || isSPACE_A(start)) {
22290 while (start <= end && (isPUNCT_A(start)
22291 || isSPACE_A(start)))
22293 put_code_point(sv, start);
22298 } /* End of looking for literals */
22300 /* Here is not to output as a literal. Some control characters have
22301 * mnemonic names. Split off any of those at the beginning and end of
22302 * the range to print mnemonically. It isn't possible for many of
22303 * these to be in a row, so this won't overwhelm with output */
22305 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22307 while (isMNEMONIC_CNTRL(start) && start <= end) {
22308 put_code_point(sv, start);
22312 /* If this didn't take care of the whole range ... */
22313 if (start <= end) {
22315 /* Look backwards from the end to find the final non-mnemonic
22318 while (isMNEMONIC_CNTRL(temp_end)) {
22322 /* And separately output the interior range that doesn't start
22323 * or end with mnemonics */
22324 put_range(sv, start, temp_end, FALSE);
22326 /* Then output the mnemonic trailing controls */
22327 start = temp_end + 1;
22328 while (start <= end) {
22329 put_code_point(sv, start);
22336 /* As a final resort, output the range or subrange as hex. */
22338 if (start >= NUM_ANYOF_CODE_POINTS) {
22341 else { /* Have to split range at the bitmap boundary */
22342 this_end = (end < NUM_ANYOF_CODE_POINTS)
22344 : NUM_ANYOF_CODE_POINTS - 1;
22346 #if NUM_ANYOF_CODE_POINTS > 256
22347 format = (this_end < 256)
22348 ? "\\x%02" UVXf "-\\x%02" UVXf
22349 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22351 format = "\\x%02" UVXf "-\\x%02" UVXf;
22353 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22354 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22355 GCC_DIAG_RESTORE_STMT;
22361 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22363 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22367 bool allow_literals = TRUE;
22369 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22371 /* Generally, it is more readable if printable characters are output as
22372 * literals, but if a range (nearly) spans all of them, it's best to output
22373 * it as a single range. This code will use a single range if all but 2
22374 * ASCII printables are in it */
22375 invlist_iterinit(invlist);
22376 while (invlist_iternext(invlist, &start, &end)) {
22378 /* If the range starts beyond the final printable, it doesn't have any
22380 if (start > MAX_PRINT_A) {
22384 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22385 * all but two, the range must start and end no later than 2 from
22387 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22388 if (end > MAX_PRINT_A) {
22394 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22395 allow_literals = FALSE;
22400 invlist_iterfinish(invlist);
22402 /* Here we have figured things out. Output each range */
22403 invlist_iterinit(invlist);
22404 while (invlist_iternext(invlist, &start, &end)) {
22405 if (start >= NUM_ANYOF_CODE_POINTS) {
22408 put_range(sv, start, end, allow_literals);
22410 invlist_iterfinish(invlist);
22416 S_put_charclass_bitmap_innards_common(pTHX_
22417 SV* invlist, /* The bitmap */
22418 SV* posixes, /* Under /l, things like [:word:], \S */
22419 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22420 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22421 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22422 const bool invert /* Is the result to be inverted? */
22425 /* Create and return an SV containing a displayable version of the bitmap
22426 * and associated information determined by the input parameters. If the
22427 * output would have been only the inversion indicator '^', NULL is instead
22433 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22436 output = newSVpvs("^");
22439 output = newSVpvs("");
22442 /* First, the code points in the bitmap that are unconditionally there */
22443 put_charclass_bitmap_innards_invlist(output, invlist);
22445 /* Traditionally, these have been placed after the main code points */
22447 sv_catsv(output, posixes);
22450 if (only_utf8 && _invlist_len(only_utf8)) {
22451 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22452 put_charclass_bitmap_innards_invlist(output, only_utf8);
22455 if (not_utf8 && _invlist_len(not_utf8)) {
22456 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22457 put_charclass_bitmap_innards_invlist(output, not_utf8);
22460 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22461 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22462 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22464 /* This is the only list in this routine that can legally contain code
22465 * points outside the bitmap range. The call just above to
22466 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22467 * output them here. There's about a half-dozen possible, and none in
22468 * contiguous ranges longer than 2 */
22469 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22471 SV* above_bitmap = NULL;
22473 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22475 invlist_iterinit(above_bitmap);
22476 while (invlist_iternext(above_bitmap, &start, &end)) {
22479 for (i = start; i <= end; i++) {
22480 put_code_point(output, i);
22483 invlist_iterfinish(above_bitmap);
22484 SvREFCNT_dec_NN(above_bitmap);
22488 if (invert && SvCUR(output) == 1) {
22496 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22498 SV *nonbitmap_invlist,
22499 SV *only_utf8_locale_invlist,
22500 const regnode * const node,
22502 const bool force_as_is_display)
22504 /* Appends to 'sv' a displayable version of the innards of the bracketed
22505 * character class defined by the other arguments:
22506 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22507 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22508 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22509 * none. The reasons for this could be that they require some
22510 * condition such as the target string being or not being in UTF-8
22511 * (under /d), or because they came from a user-defined property that
22512 * was not resolved at the time of the regex compilation (under /u)
22513 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22514 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22515 * 'node' is the regex pattern ANYOF node. It is needed only when the
22516 * above two parameters are not null, and is passed so that this
22517 * routine can tease apart the various reasons for them.
22518 * 'flags' is the flags field of 'node'
22519 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22520 * to invert things to see if that leads to a cleaner display. If
22521 * FALSE, this routine is free to use its judgment about doing this.
22523 * It returns TRUE if there was actually something output. (It may be that
22524 * the bitmap, etc is empty.)
22526 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22527 * bitmap, with the succeeding parameters set to NULL, and the final one to
22531 /* In general, it tries to display the 'cleanest' representation of the
22532 * innards, choosing whether to display them inverted or not, regardless of
22533 * whether the class itself is to be inverted. However, there are some
22534 * cases where it can't try inverting, as what actually matches isn't known
22535 * until runtime, and hence the inversion isn't either. */
22538 bool inverting_allowed = ! force_as_is_display;
22541 STRLEN orig_sv_cur = SvCUR(sv);
22543 SV* invlist; /* Inversion list we accumulate of code points that
22544 are unconditionally matched */
22545 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22547 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22549 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22550 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22553 SV* as_is_display; /* The output string when we take the inputs
22555 SV* inverted_display; /* The output string when we invert the inputs */
22557 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22559 /* We are biased in favor of displaying things without them being inverted,
22560 * as that is generally easier to understand */
22561 const int bias = 5;
22563 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22565 /* Start off with whatever code points are passed in. (We clone, so we
22566 * don't change the caller's list) */
22567 if (nonbitmap_invlist) {
22568 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22569 invlist = invlist_clone(nonbitmap_invlist, NULL);
22571 else { /* Worst case size is every other code point is matched */
22572 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22576 if (OP(node) == ANYOFD) {
22578 /* This flag indicates that the code points below 0x100 in the
22579 * nonbitmap list are precisely the ones that match only when the
22580 * target is UTF-8 (they should all be non-ASCII). */
22581 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22583 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22584 _invlist_subtract(invlist, only_utf8, &invlist);
22587 /* And this flag for matching all non-ASCII 0xFF and below */
22588 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22590 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22593 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22595 /* If either of these flags are set, what matches isn't
22596 * determinable except during execution, so don't know enough here
22598 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22599 inverting_allowed = FALSE;
22602 /* What the posix classes match also varies at runtime, so these
22603 * will be output symbolically. */
22604 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22607 posixes = newSVpvs("");
22608 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22609 if (ANYOF_POSIXL_TEST(node, i)) {
22610 sv_catpv(posixes, anyofs[i]);
22617 /* Accumulate the bit map into the unconditional match list */
22619 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22620 if (BITMAP_TEST(bitmap, i)) {
22623 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22626 invlist = _add_range_to_invlist(invlist, start, i-1);
22631 /* Make sure that the conditional match lists don't have anything in them
22632 * that match unconditionally; otherwise the output is quite confusing.
22633 * This could happen if the code that populates these misses some
22636 _invlist_subtract(only_utf8, invlist, &only_utf8);
22639 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22642 if (only_utf8_locale_invlist) {
22644 /* Since this list is passed in, we have to make a copy before
22646 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22648 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22650 /* And, it can get really weird for us to try outputting an inverted
22651 * form of this list when it has things above the bitmap, so don't even
22653 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22654 inverting_allowed = FALSE;
22658 /* Calculate what the output would be if we take the input as-is */
22659 as_is_display = put_charclass_bitmap_innards_common(invlist,
22666 /* If have to take the output as-is, just do that */
22667 if (! inverting_allowed) {
22668 if (as_is_display) {
22669 sv_catsv(sv, as_is_display);
22670 SvREFCNT_dec_NN(as_is_display);
22673 else { /* But otherwise, create the output again on the inverted input, and
22674 use whichever version is shorter */
22676 int inverted_bias, as_is_bias;
22678 /* We will apply our bias to whichever of the the results doesn't have
22688 inverted_bias = bias;
22691 /* Now invert each of the lists that contribute to the output,
22692 * excluding from the result things outside the possible range */
22694 /* For the unconditional inversion list, we have to add in all the
22695 * conditional code points, so that when inverted, they will be gone
22697 _invlist_union(only_utf8, invlist, &invlist);
22698 _invlist_union(not_utf8, invlist, &invlist);
22699 _invlist_union(only_utf8_locale, invlist, &invlist);
22700 _invlist_invert(invlist);
22701 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22704 _invlist_invert(only_utf8);
22705 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22707 else if (not_utf8) {
22709 /* If a code point matches iff the target string is not in UTF-8,
22710 * then complementing the result has it not match iff not in UTF-8,
22711 * which is the same thing as matching iff it is UTF-8. */
22712 only_utf8 = not_utf8;
22716 if (only_utf8_locale) {
22717 _invlist_invert(only_utf8_locale);
22718 _invlist_intersection(only_utf8_locale,
22720 &only_utf8_locale);
22723 inverted_display = put_charclass_bitmap_innards_common(
22728 only_utf8_locale, invert);
22730 /* Use the shortest representation, taking into account our bias
22731 * against showing it inverted */
22732 if ( inverted_display
22733 && ( ! as_is_display
22734 || ( SvCUR(inverted_display) + inverted_bias
22735 < SvCUR(as_is_display) + as_is_bias)))
22737 sv_catsv(sv, inverted_display);
22739 else if (as_is_display) {
22740 sv_catsv(sv, as_is_display);
22743 SvREFCNT_dec(as_is_display);
22744 SvREFCNT_dec(inverted_display);
22747 SvREFCNT_dec_NN(invlist);
22748 SvREFCNT_dec(only_utf8);
22749 SvREFCNT_dec(not_utf8);
22750 SvREFCNT_dec(posixes);
22751 SvREFCNT_dec(only_utf8_locale);
22753 return SvCUR(sv) > orig_sv_cur;
22756 #define CLEAR_OPTSTART \
22757 if (optstart) STMT_START { \
22758 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22759 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22763 #define DUMPUNTIL(b,e) \
22765 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22767 STATIC const regnode *
22768 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22769 const regnode *last, const regnode *plast,
22770 SV* sv, I32 indent, U32 depth)
22772 U8 op = PSEUDO; /* Arbitrary non-END op. */
22773 const regnode *next;
22774 const regnode *optstart= NULL;
22776 RXi_GET_DECL(r, ri);
22777 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22779 PERL_ARGS_ASSERT_DUMPUNTIL;
22781 #ifdef DEBUG_DUMPUNTIL
22782 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22783 last ? last-start : 0, plast ? plast-start : 0);
22786 if (plast && plast < last)
22789 while (PL_regkind[op] != END && (!last || node < last)) {
22791 /* While that wasn't END last time... */
22794 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22796 next = regnext((regnode *)node);
22799 if (OP(node) == OPTIMIZED) {
22800 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22807 regprop(r, sv, node, NULL, NULL);
22808 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22809 (int)(2*indent + 1), "", SvPVX_const(sv));
22811 if (OP(node) != OPTIMIZED) {
22812 if (next == NULL) /* Next ptr. */
22813 Perl_re_printf( aTHX_ " (0)");
22814 else if (PL_regkind[(U8)op] == BRANCH
22815 && PL_regkind[OP(next)] != BRANCH )
22816 Perl_re_printf( aTHX_ " (FAIL)");
22818 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22819 Perl_re_printf( aTHX_ "\n");
22823 if (PL_regkind[(U8)op] == BRANCHJ) {
22826 const regnode *nnode = (OP(next) == LONGJMP
22827 ? regnext((regnode *)next)
22829 if (last && nnode > last)
22831 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22834 else if (PL_regkind[(U8)op] == BRANCH) {
22836 DUMPUNTIL(NEXTOPER(node), next);
22838 else if ( PL_regkind[(U8)op] == TRIE ) {
22839 const regnode *this_trie = node;
22840 const char op = OP(node);
22841 const U32 n = ARG(node);
22842 const reg_ac_data * const ac = op>=AHOCORASICK ?
22843 (reg_ac_data *)ri->data->data[n] :
22845 const reg_trie_data * const trie =
22846 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22848 AV *const trie_words
22849 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22851 const regnode *nextbranch= NULL;
22854 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22855 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22857 Perl_re_indentf( aTHX_ "%s ",
22860 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22861 SvCUR(*elem_ptr), PL_dump_re_max_len,
22862 PL_colors[0], PL_colors[1],
22864 ? PERL_PV_ESCAPE_UNI
22866 | PERL_PV_PRETTY_ELLIPSES
22867 | PERL_PV_PRETTY_LTGT
22872 U16 dist= trie->jump[word_idx+1];
22873 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22874 (UV)((dist ? this_trie + dist : next) - start));
22877 nextbranch= this_trie + trie->jump[0];
22878 DUMPUNTIL(this_trie + dist, nextbranch);
22880 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22881 nextbranch= regnext((regnode *)nextbranch);
22883 Perl_re_printf( aTHX_ "\n");
22886 if (last && next > last)
22891 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22892 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22893 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22895 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22897 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22899 else if ( op == PLUS || op == STAR) {
22900 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22902 else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22903 /* Literal string, where present. */
22904 node += NODE_SZ_STR(node) - 1;
22905 node = NEXTOPER(node);
22908 node = NEXTOPER(node);
22909 node += regarglen[(U8)op];
22911 if (op == CURLYX || op == OPEN || op == SROPEN)
22915 #ifdef DEBUG_DUMPUNTIL
22916 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22921 #endif /* DEBUGGING */
22923 #ifndef PERL_IN_XSUB_RE
22925 # include "uni_keywords.h"
22928 Perl_init_uniprops(pTHX)
22933 char * dump_len_string;
22935 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22936 if ( ! dump_len_string
22937 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22939 PL_dump_re_max_len = 60; /* A reasonable default */
22943 PL_user_def_props = newHV();
22945 # ifdef USE_ITHREADS
22947 HvSHAREKEYS_off(PL_user_def_props);
22948 PL_user_def_props_aTHX = aTHX;
22952 /* Set up the inversion list interpreter-level variables */
22954 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22955 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22956 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22957 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22958 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22959 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22960 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22961 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22962 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22963 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22964 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22965 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22966 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22967 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22968 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22969 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22971 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22972 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22973 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22974 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22975 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22976 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22977 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22978 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22979 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22980 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22981 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22982 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22983 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22984 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22985 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22986 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22988 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22989 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22990 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22991 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22992 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22994 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
22995 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22996 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22997 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22999 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23001 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23002 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23004 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23005 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23007 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23008 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23009 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23010 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23011 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23012 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23013 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23014 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23015 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23016 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23017 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23018 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23019 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23020 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23023 /* The below are used only by deprecated functions. They could be removed */
23024 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23025 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23026 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23030 /* These four functions are compiled only in regcomp.c, where they have access
23031 * to the data they return. They are a way for re_comp.c to get access to that
23032 * data without having to compile the whole data structures. */
23035 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23037 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23039 return match_uniprop((U8 *) key, key_len);
23043 Perl_get_prop_definition(pTHX_ const int table_index)
23045 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23047 /* Create and return the inversion list */
23048 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23051 const char * const *
23052 Perl_get_prop_values(const int table_index)
23054 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23056 return UNI_prop_value_ptrs[table_index];
23060 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23062 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23064 return deprecated_property_msgs[warning_offset];
23069 This code was mainly added for backcompat to give a warning for non-portable
23070 code points in user-defined properties. But experiments showed that the
23071 warning in earlier perls were only omitted on overflow, which should be an
23072 error, so there really isnt a backcompat issue, and actually adding the
23073 warning when none was present before might cause breakage, for little gain. So
23074 khw left this code in, but not enabled. Tests were never added.
23077 Ei |const char *|get_extended_utf8_msg|const UV cp
23079 PERL_STATIC_INLINE const char *
23080 S_get_extended_utf8_msg(pTHX_ const UV cp)
23082 U8 dummy[UTF8_MAXBYTES + 1];
23086 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23089 msg = hv_fetchs(msgs, "text", 0);
23092 (void) sv_2mortal((SV *) msgs);
23094 return SvPVX(*msg);
23098 #endif /* end of ! PERL_IN_XSUB_RE */
23101 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23102 const bool ignore_case)
23104 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23105 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23106 * because nothing outside of ASCII will match. Use /m because the input
23107 * string may be a bunch of lines strung together.
23109 * Also sets up the debugging info */
23111 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23113 SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23114 REGEXP * subpattern_re;
23115 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23117 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23122 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23124 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23125 rx_flags = flags & RXf_PMf_COMPILETIME;
23127 #ifndef PERL_IN_XSUB_RE
23128 /* Use the core engine if this file is regcomp.c. That means no
23129 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23130 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23131 &PL_core_reg_engine,
23135 if (isDEBUG_WILDCARD) {
23136 /* Use the special debugging engine if this file is re_comp.c and wants
23137 * to output the wildcard matching. This uses whatever
23138 * 'use re "Debug ..." is in effect */
23139 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23145 /* Use the special wildcard engine if this file is re_comp.c and
23146 * doesn't want to output the wildcard matching. This uses whatever
23147 * 'use re "Debug ..." is in effect for compilation, but this engine
23148 * structure has been set up so that it uses the core engine for
23149 * execution, so no execution debugging as a result of re.pm will be
23151 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23155 /* XXX The above has the effect that any user-supplied regex engine
23156 * won't be called for matching wildcards. That might be good, or bad.
23157 * It could be changed in several ways. The reason it is done the
23158 * current way is to avoid having to save and restore
23159 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
23160 * could be used. Another suggestion is to keep the authoritative
23161 * value of the debug flags in a thread-local variable and add set/get
23162 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23163 * Still another is to pass a flag, say in the engine's intflags that
23164 * would be checked each time before doing the debug output */
23168 assert(subpattern_re); /* Should have died if didn't compile successfully */
23169 return subpattern_re;
23173 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23174 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23177 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23179 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23183 /* The compilation has set things up so that if the program doesn't want to
23184 * see the wildcard matching procedure, it will get the core execution
23185 * engine, which is subject only to -Dr. So we have to turn that off
23186 * around this procedure */
23187 if (! isDEBUG_WILDCARD) {
23188 /* Note! Casts away 'volatile' */
23190 PL_debug &= ~ DEBUG_r_FLAG;
23193 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23201 S_handle_user_defined_property(pTHX_
23203 /* Parses the contents of a user-defined property definition; returning the
23204 * expanded definition if possible. If so, the return is an inversion
23207 * If there are subroutines that are part of the expansion and which aren't
23208 * known at the time of the call to this function, this returns what
23209 * parse_uniprop_string() returned for the first one encountered.
23211 * If an error was found, NULL is returned, and 'msg' gets a suitable
23212 * message appended to it. (Appending allows the back trace of how we got
23213 * to the faulty definition to be displayed through nested calls of
23214 * user-defined subs.)
23216 * The caller IS responsible for freeing any returned SV.
23218 * The syntax of the contents is pretty much described in perlunicode.pod,
23219 * but we also allow comments on each line */
23221 const char * name, /* Name of property */
23222 const STRLEN name_len, /* The name's length in bytes */
23223 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23224 const bool to_fold, /* ? Is this under /i */
23225 const bool runtime, /* ? Are we in compile- or run-time */
23226 const bool deferrable, /* Is it ok for this property's full definition
23227 to be deferred until later? */
23228 SV* contents, /* The property's definition */
23229 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
23230 getting called unless this is thought to be
23231 a user-defined property */
23232 SV * msg, /* Any error or warning msg(s) are appended to
23234 const STRLEN level) /* Recursion level of this call */
23237 const char * string = SvPV_const(contents, len);
23238 const char * const e = string + len;
23239 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23240 const STRLEN msgs_length_on_entry = SvCUR(msg);
23242 const char * s0 = string; /* Points to first byte in the current line
23243 being parsed in 'string' */
23244 const char overflow_msg[] = "Code point too large in \"";
23245 SV* running_definition = NULL;
23247 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23249 *user_defined_ptr = TRUE;
23251 /* Look at each line */
23253 const char * s; /* Current byte */
23254 char op = '+'; /* Default operation is 'union' */
23255 IV min = 0; /* range begin code point */
23256 IV max = -1; /* and range end */
23257 SV* this_definition;
23259 /* Skip comment lines */
23261 s0 = strchr(s0, '\n');
23269 /* For backcompat, allow an empty first line */
23275 /* First character in the line may optionally be the operation */
23284 /* If the line is one or two hex digits separated by blank space, its
23285 * a range; otherwise it is either another user-defined property or an
23290 if (! isXDIGIT(*s)) {
23291 goto check_if_property;
23294 do { /* Each new hex digit will add 4 bits. */
23295 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23296 s = strchr(s, '\n');
23300 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23301 sv_catpv(msg, overflow_msg);
23302 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23303 UTF8fARG(is_contents_utf8, s - s0, s0));
23304 sv_catpvs(msg, "\"");
23305 goto return_failure;
23308 /* Accumulate this digit into the value */
23309 min = (min << 4) + READ_XDIGIT(s);
23310 } while (isXDIGIT(*s));
23312 while (isBLANK(*s)) { s++; }
23314 /* We allow comments at the end of the line */
23316 s = strchr(s, '\n');
23322 else if (s < e && *s != '\n') {
23323 if (! isXDIGIT(*s)) {
23324 goto check_if_property;
23327 /* Look for the high point of the range */
23330 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23331 s = strchr(s, '\n');
23335 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23336 sv_catpv(msg, overflow_msg);
23337 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23338 UTF8fARG(is_contents_utf8, s - s0, s0));
23339 sv_catpvs(msg, "\"");
23340 goto return_failure;
23343 max = (max << 4) + READ_XDIGIT(s);
23344 } while (isXDIGIT(*s));
23346 while (isBLANK(*s)) { s++; }
23349 s = strchr(s, '\n');
23354 else if (s < e && *s != '\n') {
23355 goto check_if_property;
23359 if (max == -1) { /* The line only had one entry */
23362 else if (max < min) {
23363 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23364 sv_catpvs(msg, "Illegal range in \"");
23365 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23366 UTF8fARG(is_contents_utf8, s - s0, s0));
23367 sv_catpvs(msg, "\"");
23368 goto return_failure;
23371 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
23373 if ( UNICODE_IS_PERL_EXTENDED(min)
23374 || UNICODE_IS_PERL_EXTENDED(max))
23376 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23378 /* If both code points are non-portable, warn only on the lower
23380 sv_catpv(msg, get_extended_utf8_msg(
23381 (UNICODE_IS_PERL_EXTENDED(min))
23383 sv_catpvs(msg, " in \"");
23384 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23385 UTF8fARG(is_contents_utf8, s - s0, s0));
23386 sv_catpvs(msg, "\"");
23391 /* Here, this line contains a legal range */
23392 this_definition = sv_2mortal(_new_invlist(2));
23393 this_definition = _add_range_to_invlist(this_definition, min, max);
23398 /* Here it isn't a legal range line. See if it is a legal property
23399 * line. First find the end of the meat of the line */
23400 s = strpbrk(s, "#\n");
23405 /* Ignore trailing blanks in keeping with the requirements of
23406 * parse_uniprop_string() */
23408 while (s > s0 && isBLANK_A(*s)) {
23413 this_definition = parse_uniprop_string(s0, s - s0,
23414 is_utf8, to_fold, runtime,
23417 user_defined_ptr, msg,
23419 ? level /* Don't increase level
23420 if input is empty */
23423 if (this_definition == NULL) {
23424 goto return_failure; /* 'msg' should have had the reason
23425 appended to it by the above call */
23428 if (! is_invlist(this_definition)) { /* Unknown at this time */
23429 return newSVsv(this_definition);
23433 s = strchr(s, '\n');
23443 _invlist_union(running_definition, this_definition,
23444 &running_definition);
23447 _invlist_subtract(running_definition, this_definition,
23448 &running_definition);
23451 _invlist_intersection(running_definition, this_definition,
23452 &running_definition);
23455 _invlist_union_complement_2nd(running_definition,
23456 this_definition, &running_definition);
23459 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23460 __FILE__, __LINE__, op);
23464 /* Position past the '\n' */
23466 } /* End of loop through the lines of 'contents' */
23468 /* Here, we processed all the lines in 'contents' without error. If we
23469 * didn't add any warnings, simply return success */
23470 if (msgs_length_on_entry == SvCUR(msg)) {
23472 /* If the expansion was empty, the answer isn't nothing: its an empty
23473 * inversion list */
23474 if (running_definition == NULL) {
23475 running_definition = _new_invlist(1);
23478 return running_definition;
23481 /* Otherwise, add some explanatory text, but we will return success */
23485 running_definition = NULL;
23489 if (name_len > 0) {
23490 sv_catpvs(msg, " in expansion of ");
23491 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23494 return running_definition;
23497 /* As explained below, certain operations need to take place in the first
23498 * thread created. These macros switch contexts */
23499 # ifdef USE_ITHREADS
23500 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23501 PerlInterpreter * save_aTHX = aTHX;
23502 # define SWITCH_TO_GLOBAL_CONTEXT \
23503 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23504 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23505 # define CUR_CONTEXT aTHX
23506 # define ORIGINAL_CONTEXT save_aTHX
23508 # define DECLARATION_FOR_GLOBAL_CONTEXT
23509 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23510 # define RESTORE_CONTEXT NOOP
23511 # define CUR_CONTEXT NULL
23512 # define ORIGINAL_CONTEXT NULL
23516 S_delete_recursion_entry(pTHX_ void *key)
23518 /* Deletes the entry used to detect recursion when expanding user-defined
23519 * properties. This is a function so it can be set up to be called even if
23520 * the program unexpectedly quits */
23523 SV ** current_entry;
23524 const STRLEN key_len = strlen((const char *) key);
23525 DECLARATION_FOR_GLOBAL_CONTEXT;
23527 SWITCH_TO_GLOBAL_CONTEXT;
23529 /* If the entry is one of these types, it is a permanent entry, and not the
23530 * one used to detect recursions. This function should delete only the
23531 * recursion entry */
23532 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23534 && ! is_invlist(*current_entry)
23535 && ! SvPOK(*current_entry))
23537 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23545 S_get_fq_name(pTHX_
23546 const char * const name, /* The first non-blank in the \p{}, \P{} */
23547 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23548 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23549 const bool has_colon_colon
23552 /* Returns a mortal SV containing the fully qualified version of the input
23557 fq_name = newSVpvs_flags("", SVs_TEMP);
23559 /* Use the current package if it wasn't included in our input */
23560 if (! has_colon_colon) {
23561 const HV * pkg = (IN_PERL_COMPILETIME)
23563 : CopSTASH(PL_curcop);
23564 const char* pkgname = HvNAME(pkg);
23566 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23567 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23568 sv_catpvs(fq_name, "::");
23571 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23572 UTF8fARG(is_utf8, name_len, name));
23577 S_parse_uniprop_string(pTHX_
23579 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23580 * now. If so, the return is an inversion list.
23582 * If the property is user-defined, it is a subroutine, which in turn
23583 * may call other subroutines. This function will call the whole nest of
23584 * them to get the definition they return; if some aren't known at the time
23585 * of the call to this function, the fully qualified name of the highest
23586 * level sub is returned. It is an error to call this function at runtime
23587 * without every sub defined.
23589 * If an error was found, NULL is returned, and 'msg' gets a suitable
23590 * message appended to it. (Appending allows the back trace of how we got
23591 * to the faulty definition to be displayed through nested calls of
23592 * user-defined subs.)
23594 * The caller should NOT try to free any returned inversion list.
23596 * Other parameters will be set on return as described below */
23598 const char * const name, /* The first non-blank in the \p{}, \P{} */
23599 Size_t name_len, /* Its length in bytes, not including any
23601 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23602 const bool to_fold, /* ? Is this under /i */
23603 const bool runtime, /* TRUE if this is being called at run time */
23604 const bool deferrable, /* TRUE if it's ok for the definition to not be
23605 known at this call */
23606 AV ** strings, /* To return string property values, like named
23608 bool *user_defined_ptr, /* Upon return from this function it will be
23609 set to TRUE if any component is a
23610 user-defined property */
23611 SV * msg, /* Any error or warning msg(s) are appended to
23613 const STRLEN level) /* Recursion level of this call */
23616 char* lookup_name; /* normalized name for lookup in our tables */
23617 unsigned lookup_len; /* Its length */
23618 enum { Not_Strict = 0, /* Some properties have stricter name */
23619 Strict, /* normalization rules, which we decide */
23620 As_Is /* upon based on parsing */
23621 } stricter = Not_Strict;
23623 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23624 * (though it requires extra effort to download them from Unicode and
23625 * compile perl to know about them) */
23626 bool is_nv_type = FALSE;
23628 unsigned int i, j = 0;
23629 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23630 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23631 int table_index = 0; /* The entry number for this property in the table
23632 of all Unicode property names */
23633 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23634 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23635 the normalized name in certain situations */
23636 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23637 part of a package name */
23638 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
23639 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23640 property rather than a Unicode
23642 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23643 if an error. If it is an inversion list,
23644 it is the definition. Otherwise it is a
23645 string containing the fully qualified sub
23647 SV * fq_name = NULL; /* For user-defined properties, the fully
23649 bool invert_return = FALSE; /* ? Do we need to complement the result before
23651 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23652 explicit utf8:: package that we strip
23654 /* The expansion of properties that could be either user-defined or
23655 * official unicode ones is deferred until runtime, including a marker for
23656 * those that might be in the latter category. This boolean indicates if
23657 * we've seen that marker. If not, what we're parsing can't be such an
23658 * official Unicode property whose expansion was deferred */
23659 bool could_be_deferred_official = FALSE;
23661 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23663 /* The input will be normalized into 'lookup_name' */
23664 Newx(lookup_name, name_len, char);
23665 SAVEFREEPV(lookup_name);
23667 /* Parse the input. */
23668 for (i = 0; i < name_len; i++) {
23669 char cur = name[i];
23671 /* Most of the characters in the input will be of this ilk, being parts
23673 if (isIDCONT_A(cur)) {
23675 /* Case differences are ignored. Our lookup routine assumes
23676 * everything is lowercase, so normalize to that */
23677 if (isUPPER_A(cur)) {
23678 lookup_name[j++] = toLOWER_A(cur);
23682 if (cur == '_') { /* Don't include these in the normalized name */
23686 lookup_name[j++] = cur;
23688 /* The first character in a user-defined name must be of this type.
23690 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23691 could_be_user_defined = FALSE;
23697 /* Here, the character is not something typically in a name, But these
23698 * two types of characters (and the '_' above) can be freely ignored in
23699 * most situations. Later it may turn out we shouldn't have ignored
23700 * them, and we have to reparse, but we don't have enough information
23701 * yet to make that decision */
23702 if (cur == '-' || isSPACE_A(cur)) {
23703 could_be_user_defined = FALSE;
23707 /* An equals sign or single colon mark the end of the first part of
23708 * the property name */
23710 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23712 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23713 equals_pos = j; /* Note where it occurred in the input */
23714 could_be_user_defined = FALSE;
23718 /* If this looks like it is a marker we inserted at compile time,
23719 * set a flag and otherwise ignore it. If it isn't in the final
23720 * position, keep it as it would have been user input. */
23721 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23723 && could_be_user_defined
23724 && i == name_len - 1)
23727 could_be_deferred_official = TRUE;
23731 /* Otherwise, this character is part of the name. */
23732 lookup_name[j++] = cur;
23734 /* Here it isn't a single colon, so if it is a colon, it must be a
23738 /* A double colon should be a package qualifier. We note its
23739 * position and continue. Note that one could have
23740 * pkg1::pkg2::...::foo
23741 * so that the position at the end of the loop will be just after
23742 * the final qualifier */
23745 non_pkg_begin = i + 1;
23746 lookup_name[j++] = ':';
23747 lun_non_pkg_begin = j;
23749 else { /* Only word chars (and '::') can be in a user-defined name */
23750 could_be_user_defined = FALSE;
23752 } /* End of parsing through the lhs of the property name (or all of it if
23755 # define STRLENs(s) (sizeof("" s "") - 1)
23757 /* If there is a single package name 'utf8::', it is ambiguous. It could
23758 * be for a user-defined property, or it could be a Unicode property, as
23759 * all of them are considered to be for that package. For the purposes of
23760 * parsing the rest of the property, strip it off */
23761 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23762 lookup_name += STRLENs("utf8::");
23763 j -= STRLENs("utf8::");
23764 equals_pos -= STRLENs("utf8::");
23765 stripped_utf8_pkg = TRUE;
23768 /* Here, we are either done with the whole property name, if it was simple;
23769 * or are positioned just after the '=' if it is compound. */
23771 if (equals_pos >= 0) {
23772 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23774 /* Space immediately after the '=' is ignored */
23776 for (; i < name_len; i++) {
23777 if (! isSPACE_A(name[i])) {
23782 /* Most punctuation after the equals indicates a subpattern, like
23784 if ( isPUNCT_A(name[i])
23789 /* A backslash means the real delimitter is the next character,
23790 * but it must be punctuation */
23791 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23793 bool special_property = memEQs(lookup_name, j - 1, "name")
23794 || memEQs(lookup_name, j - 1, "na");
23795 if (! special_property) {
23796 /* Find the property. The table includes the equals sign, so
23797 * we use 'j' as-is */
23798 table_index = do_uniprop_match(lookup_name, j);
23800 if (special_property || table_index) {
23801 REGEXP * subpattern_re;
23802 char open = name[i++];
23804 const char * pos_in_brackets;
23805 const char * const * prop_values;
23808 /* Backslash => delimitter is the character following. We
23809 * already checked that it is punctuation */
23810 if (open == '\\') {
23815 /* This data structure is constructed so that the matching
23816 * closing bracket is 3 past its matching opening. The second
23817 * set of closing is so that if the opening is something like
23818 * ']', the closing will be that as well. Something similar is
23819 * done in toke.c */
23820 pos_in_brackets = memCHRs("([<)]>)]>", open);
23821 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23824 || name[name_len-1] != close
23825 || (escaped && name[name_len-2] != '\\')
23826 /* Also make sure that there are enough characters.
23827 * e.g., '\\\' would show up incorrectly as legal even
23828 * though it is too short */
23829 || (SSize_t) (name_len - i - 1 - escaped) < 0)
23831 sv_catpvs(msg, "Unicode property wildcard not terminated");
23832 goto append_name_to_msg;
23835 Perl_ck_warner_d(aTHX_
23836 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23837 "The Unicode property wildcards feature is experimental");
23839 if (special_property) {
23840 const char * error_msg;
23841 const char * revised_name = name + i;
23842 Size_t revised_name_len = name_len - (i + 1 + escaped);
23844 /* Currently, the only 'special_property' is name, which we
23845 * lookup in _charnames.pm */
23847 if (! load_charnames(newSVpvs("placeholder"),
23848 revised_name, revised_name_len,
23851 sv_catpv(msg, error_msg);
23852 goto append_name_to_msg;
23855 /* Farm this out to a function just to make the current
23856 * function less unwieldy */
23857 if (handle_names_wildcard(revised_name, revised_name_len,
23861 return prop_definition;
23867 prop_values = get_prop_values(table_index);
23869 /* Now create and compile the wildcard subpattern. Use /i
23870 * because the property values are supposed to match with case
23872 subpattern_re = compile_wildcard(name + i,
23873 name_len - i - 1 - escaped,
23877 /* For each legal property value, see if the supplied pattern
23879 while (*prop_values) {
23880 const char * const entry = *prop_values;
23881 const Size_t len = strlen(entry);
23882 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23884 if (execute_wildcard(subpattern_re,
23886 (char *) entry + len,
23890 { /* Here, matched. Add to the returned list */
23891 Size_t total_len = j + len;
23892 SV * sub_invlist = NULL;
23893 char * this_string;
23895 /* We know this is a legal \p{property=value}. Call
23896 * the function to return the list of code points that
23898 Newxz(this_string, total_len + 1, char);
23899 Copy(lookup_name, this_string, j, char);
23900 my_strlcat(this_string, entry, total_len + 1);
23901 SAVEFREEPV(this_string);
23902 sub_invlist = parse_uniprop_string(this_string,
23912 _invlist_union(prop_definition, sub_invlist,
23916 prop_values++; /* Next iteration, look at next propvalue */
23917 } /* End of looking through property values; (the data
23918 structure is terminated by a NULL ptr) */
23920 SvREFCNT_dec_NN(subpattern_re);
23922 if (prop_definition) {
23923 return prop_definition;
23926 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23927 goto append_name_to_msg;
23930 /* Here's how khw thinks we should proceed to handle the properties
23931 * not yet done: Bidi Mirroring Glyph can map to ""
23932 Bidi Paired Bracket can map to ""
23933 Case Folding (both full and simple)
23934 Shouldn't /i be good enough for Full
23935 Decomposition Mapping
23936 Equivalent Unified Ideograph can map to ""
23937 Lowercase Mapping (both full and simple)
23938 NFKC Case Fold can map to ""
23939 Titlecase Mapping (both full and simple)
23940 Uppercase Mapping (both full and simple)
23941 * Handle these the same way Name is done, using say, _wild.pm, but
23942 * having both loose and full, like in charclass_invlists.h.
23943 * Perhaps move block and script to that as they are somewhat large
23944 * in charclass_invlists.h.
23945 * For properties where the default is the code point itself, such
23946 * as any of the case changing mappings, the string would otherwise
23947 * consist of all Unicode code points in UTF-8 strung together.
23948 * This would be impractical. So instead, examine their compiled
23949 * pattern, looking at the ssc. If none, reject the pattern as an
23950 * error. Otherwise run the pattern against every code point in
23951 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23952 * And it might be good to create an API to return the ssc.
23953 * Or handle them like the algorithmic names are done
23955 } /* End of is a wildcard subppattern */
23957 /* \p{name=...} is handled specially. Instead of using the normal
23958 * mechanism involving charclass_invlists.h, it uses _charnames.pm
23959 * which has the necessary (huge) data accessible to it, and which
23960 * doesn't get loaded unless necessary. The legal syntax for names is
23961 * somewhat different than other properties due both to the vagaries of
23962 * a few outlier official names, and the fact that only a few ASCII
23963 * characters are permitted in them */
23964 if ( memEQs(lookup_name, j - 1, "name")
23965 || memEQs(lookup_name, j - 1, "na"))
23970 const char * error_msg;
23972 SV * character_name;
23973 STRLEN character_len;
23978 /* Since the RHS (after skipping initial space) is passed unchanged
23979 * to charnames, and there are different criteria for what are
23980 * legal characters in the name, just parse it here. A character
23981 * name must begin with an ASCII alphabetic */
23982 if (! isALPHA(name[i])) {
23985 lookup_name[j++] = name[i];
23987 for (++i; i < name_len; i++) {
23988 /* Official names can only be in the ASCII range, and only
23989 * certain characters */
23990 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
23993 lookup_name[j++] = name[i];
23996 /* Finished parsing, save the name into an SV */
23997 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
23999 /* Make sure _charnames is loaded. (The parameters give context
24000 * for any errors generated */
24001 table = load_charnames(character_name, name, name_len, &error_msg);
24002 if (table == NULL) {
24003 sv_catpv(msg, error_msg);
24004 goto append_name_to_msg;
24007 lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
24008 if (! lookup_loose) {
24010 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24013 PUSHSTACKi(PERLSI_REGCOMP);
24019 XPUSHs(character_name);
24021 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24026 SvREFCNT_inc_simple_void_NN(character);
24033 if (! SvOK(character)) {
24037 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24038 if (character_len == SvCUR(character)) {
24039 prop_definition = add_cp_to_invlist(NULL, cp);
24044 /* First of the remaining characters in the string. */
24045 char * remaining = SvPVX(character) + character_len;
24047 if (strings == NULL) {
24048 goto failed; /* XXX Perhaps a specific msg instead, like
24049 'not available here' */
24052 if (*strings == NULL) {
24053 *strings = newAV();
24056 this_string = newAV();
24057 av_push(this_string, newSVuv(cp));
24060 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24061 av_push(this_string, newSVuv(cp));
24062 remaining += character_len;
24063 } while (remaining < SvEND(character));
24065 av_push(*strings, (SV *) this_string);
24068 return prop_definition;
24071 /* Certain properties whose values are numeric need special handling.
24072 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24073 * purposes of checking if this is one of those properties */
24074 if (memBEGINPs(lookup_name, j, "is")) {
24078 /* Then check if it is one of these specially-handled properties. The
24079 * possibilities are hard-coded because easier this way, and the list
24080 * is unlikely to change.
24082 * All numeric value type properties are of this ilk, and are also
24083 * special in a different way later on. So find those first. There
24084 * are several numeric value type properties in the Unihan DB (which is
24085 * unlikely to be compiled with perl, but we handle it here in case it
24086 * does get compiled). They all end with 'numeric'. The interiors
24087 * aren't checked for the precise property. This would stop working if
24088 * a cjk property were to be created that ended with 'numeric' and
24089 * wasn't a numeric type */
24090 is_nv_type = memEQs(lookup_name + lookup_offset,
24091 j - 1 - lookup_offset, "numericvalue")
24092 || memEQs(lookup_name + lookup_offset,
24093 j - 1 - lookup_offset, "nv")
24094 || ( memENDPs(lookup_name + lookup_offset,
24095 j - 1 - lookup_offset, "numeric")
24096 && ( memBEGINPs(lookup_name + lookup_offset,
24097 j - 1 - lookup_offset, "cjk")
24098 || memBEGINPs(lookup_name + lookup_offset,
24099 j - 1 - lookup_offset, "k")));
24101 || memEQs(lookup_name + lookup_offset,
24102 j - 1 - lookup_offset, "canonicalcombiningclass")
24103 || memEQs(lookup_name + lookup_offset,
24104 j - 1 - lookup_offset, "ccc")
24105 || memEQs(lookup_name + lookup_offset,
24106 j - 1 - lookup_offset, "age")
24107 || memEQs(lookup_name + lookup_offset,
24108 j - 1 - lookup_offset, "in")
24109 || memEQs(lookup_name + lookup_offset,
24110 j - 1 - lookup_offset, "presentin"))
24114 /* Since the stuff after the '=' is a number, we can't throw away
24115 * '-' willy-nilly, as those could be a minus sign. Other stricter
24116 * rules also apply. However, these properties all can have the
24117 * rhs not be a number, in which case they contain at least one
24118 * alphabetic. In those cases, the stricter rules don't apply.
24119 * But the numeric type properties can have the alphas [Ee] to
24120 * signify an exponent, and it is still a number with stricter
24121 * rules. So look for an alpha that signifies not-strict */
24123 for (k = i; k < name_len; k++) {
24124 if ( isALPHA_A(name[k])
24125 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24127 stricter = Not_Strict;
24135 /* A number may have a leading '+' or '-'. The latter is retained
24137 if (name[i] == '+') {
24140 else if (name[i] == '-') {
24141 lookup_name[j++] = '-';
24145 /* Skip leading zeros including single underscores separating the
24146 * zeros, or between the final leading zero and the first other
24148 for (; i < name_len - 1; i++) {
24149 if ( name[i] != '0'
24150 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24157 else { /* No '=' */
24159 /* Only a few properties without an '=' should be parsed with stricter
24160 * rules. The list is unlikely to change. */
24161 if ( memBEGINPs(lookup_name, j, "perl")
24162 && memNEs(lookup_name + 4, j - 4, "space")
24163 && memNEs(lookup_name + 4, j - 4, "word"))
24167 /* We set the inputs back to 0 and the code below will reparse,
24173 /* Here, we have either finished the property, or are positioned to parse
24174 * the remainder, and we know if stricter rules apply. Finish out, if not
24176 for (; i < name_len; i++) {
24177 char cur = name[i];
24179 /* In all instances, case differences are ignored, and we normalize to
24181 if (isUPPER_A(cur)) {
24182 lookup_name[j++] = toLOWER(cur);
24186 /* An underscore is skipped, but not under strict rules unless it
24187 * separates two digits */
24190 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
24191 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24193 lookup_name[j++] = '_';
24198 /* Hyphens are skipped except under strict */
24199 if (cur == '-' && ! stricter) {
24203 /* XXX Bug in documentation. It says white space skipped adjacent to
24204 * non-word char. Maybe we should, but shouldn't skip it next to a dot
24206 if (isSPACE_A(cur) && ! stricter) {
24210 lookup_name[j++] = cur;
24212 /* Unless this is a non-trailing slash, we are done with it */
24213 if (i >= name_len - 1 || cur != '/') {
24219 /* A slash in the 'numeric value' property indicates that what follows
24220 * is a denominator. It can have a leading '+' and '0's that should be
24221 * skipped. But we have never allowed a negative denominator, so treat
24222 * a minus like every other character. (No need to rule out a second
24223 * '/', as that won't match anything anyway */
24226 if (i < name_len && name[i] == '+') {
24230 /* Skip leading zeros including underscores separating digits */
24231 for (; i < name_len - 1; i++) {
24232 if ( name[i] != '0'
24233 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24239 /* Store the first real character in the denominator */
24240 if (i < name_len) {
24241 lookup_name[j++] = name[i];
24246 /* Here are completely done parsing the input 'name', and 'lookup_name'
24247 * contains a copy, normalized.
24249 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24250 * different from without the underscores. */
24251 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
24252 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24253 && UNLIKELY(name[name_len-1] == '_'))
24255 lookup_name[j++] = '&';
24258 /* If the original input began with 'In' or 'Is', it could be a subroutine
24259 * call to a user-defined property instead of a Unicode property name. */
24260 if ( name_len - non_pkg_begin > 2
24261 && name[non_pkg_begin+0] == 'I'
24262 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24264 /* Names that start with In have different characterstics than those
24265 * that start with Is */
24266 if (name[non_pkg_begin+1] == 's') {
24267 starts_with_Is = TRUE;
24271 could_be_user_defined = FALSE;
24274 if (could_be_user_defined) {
24277 /* If the user defined property returns the empty string, it could
24278 * easily be because the pattern is being compiled before the data it
24279 * actually needs to compile is available. This could be argued to be
24280 * a bug in the perl code, but this is a change of behavior for Perl,
24281 * so we handle it. This means that intentionally returning nothing
24282 * will not be resolved until runtime */
24283 bool empty_return = FALSE;
24285 /* Here, the name could be for a user defined property, which are
24286 * implemented as subs. */
24287 user_sub = get_cvn_flags(name, name_len, 0);
24290 /* Here, the property name could be a user-defined one, but there
24291 * is no subroutine to handle it (as of now). Defer handling it
24292 * until runtime. Otherwise, a block defined by Unicode in a later
24293 * release would get the synonym InFoo added for it, and existing
24294 * code that used that name would suddenly break if it referred to
24295 * the property before the sub was declared. See [perl #134146] */
24297 goto definition_deferred;
24300 /* Here, we are at runtime, and didn't find the user property. It
24301 * could be an official property, but only if no package was
24302 * specified, or just the utf8:: package. */
24303 if (could_be_deferred_official) {
24304 lookup_name += lun_non_pkg_begin;
24305 j -= lun_non_pkg_begin;
24307 else if (! stripped_utf8_pkg) {
24308 goto unknown_user_defined;
24311 /* Drop down to look up in the official properties */
24314 const char insecure[] = "Insecure user-defined property";
24316 /* Here, there is a sub by the correct name. Normally we call it
24317 * to get the property definition */
24319 SV * user_sub_sv = MUTABLE_SV(user_sub);
24320 SV * error; /* Any error returned by calling 'user_sub' */
24321 SV * key; /* The key into the hash of user defined sub names
24324 SV ** saved_user_prop_ptr; /* Hash entry for this property */
24326 /* How many times to retry when another thread is in the middle of
24327 * expanding the same definition we want */
24328 PERL_INT_FAST8_T retry_countdown = 10;
24330 DECLARATION_FOR_GLOBAL_CONTEXT;
24332 /* If we get here, we know this property is user-defined */
24333 *user_defined_ptr = TRUE;
24335 /* We refuse to call a potentially tainted subroutine; returning an
24338 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24339 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24340 goto append_name_to_msg;
24343 /* In principal, we only call each subroutine property definition
24344 * once during the life of the program. This guarantees that the
24345 * property definition never changes. The results of the single
24346 * sub call are stored in a hash, which is used instead for future
24347 * references to this property. The property definition is thus
24348 * immutable. But, to allow the user to have a /i-dependent
24349 * definition, we call the sub once for non-/i, and once for /i,
24350 * should the need arise, passing the /i status as a parameter.
24352 * We start by constructing the hash key name, consisting of the
24353 * fully qualified subroutine name, preceded by the /i status, so
24354 * that there is a key for /i and a different key for non-/i */
24355 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24356 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24357 non_pkg_begin != 0);
24358 sv_catsv(key, fq_name);
24361 /* We only call the sub once throughout the life of the program
24362 * (with the /i, non-/i exception noted above). That means the
24363 * hash must be global and accessible to all threads. It is
24364 * created at program start-up, before any threads are created, so
24365 * is accessible to all children. But this creates some
24368 * 1) The keys can't be shared, or else problems arise; sharing is
24369 * turned off at hash creation time
24370 * 2) All SVs in it are there for the remainder of the life of the
24371 * program, and must be created in the same interpreter context
24372 * as the hash, or else they will be freed from the wrong pool
24373 * at global destruction time. This is handled by switching to
24374 * the hash's context to create each SV going into it, and then
24375 * immediately switching back
24376 * 3) All accesses to the hash must be controlled by a mutex, to
24377 * prevent two threads from getting an unstable state should
24378 * they simultaneously be accessing it. The code below is
24379 * crafted so that the mutex is locked whenever there is an
24380 * access and unlocked only when the next stable state is
24383 * The hash stores either the definition of the property if it was
24384 * valid, or, if invalid, the error message that was raised. We
24385 * use the type of SV to distinguish.
24387 * There's also the need to guard against the definition expansion
24388 * from infinitely recursing. This is handled by storing the aTHX
24389 * of the expanding thread during the expansion. Again the SV type
24390 * is used to distinguish this from the other two cases. If we
24391 * come to here and the hash entry for this property is our aTHX,
24392 * it means we have recursed, and the code assumes that we would
24393 * infinitely recurse, so instead stops and raises an error.
24394 * (Any recursion has always been treated as infinite recursion in
24397 * If instead, the entry is for a different aTHX, it means that
24398 * that thread has gotten here first, and hasn't finished expanding
24399 * the definition yet. We just have to wait until it is done. We
24400 * sleep and retry a few times, returning an error if the other
24401 * thread doesn't complete. */
24404 USER_PROP_MUTEX_LOCK;
24406 /* If we have an entry for this key, the subroutine has already
24407 * been called once with this /i status. */
24408 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24409 SvPVX(key), SvCUR(key), 0);
24410 if (saved_user_prop_ptr) {
24412 /* If the saved result is an inversion list, it is the valid
24413 * definition of this property */
24414 if (is_invlist(*saved_user_prop_ptr)) {
24415 prop_definition = *saved_user_prop_ptr;
24417 /* The SV in the hash won't be removed until global
24418 * destruction, so it is stable and we can unlock */
24419 USER_PROP_MUTEX_UNLOCK;
24421 /* The caller shouldn't try to free this SV */
24422 return prop_definition;
24425 /* Otherwise, if it is a string, it is the error message
24426 * that was returned when we first tried to evaluate this
24427 * property. Fail, and append the message */
24428 if (SvPOK(*saved_user_prop_ptr)) {
24429 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24430 sv_catsv(msg, *saved_user_prop_ptr);
24432 /* The SV in the hash won't be removed until global
24433 * destruction, so it is stable and we can unlock */
24434 USER_PROP_MUTEX_UNLOCK;
24439 assert(SvIOK(*saved_user_prop_ptr));
24441 /* Here, we have an unstable entry in the hash. Either another
24442 * thread is in the middle of expanding the property's
24443 * definition, or we are ourselves recursing. We use the aTHX
24444 * in it to distinguish */
24445 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24447 /* Here, it's another thread doing the expanding. We've
24448 * looked as much as we are going to at the contents of the
24449 * hash entry. It's safe to unlock. */
24450 USER_PROP_MUTEX_UNLOCK;
24452 /* Retry a few times */
24453 if (retry_countdown-- > 0) {
24458 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24459 sv_catpvs(msg, "Timeout waiting for another thread to "
24461 goto append_name_to_msg;
24464 /* Here, we are recursing; don't dig any deeper */
24465 USER_PROP_MUTEX_UNLOCK;
24467 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24469 "Infinite recursion in user-defined property");
24470 goto append_name_to_msg;
24473 /* Here, this thread has exclusive control, and there is no entry
24474 * for this property in the hash. So we have the go ahead to
24475 * expand the definition ourselves. */
24477 PUSHSTACKi(PERLSI_REGCOMP);
24480 /* Create a temporary placeholder in the hash to detect recursion
24482 SWITCH_TO_GLOBAL_CONTEXT;
24483 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24484 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24487 /* Now that we have a placeholder, we can let other threads
24489 USER_PROP_MUTEX_UNLOCK;
24491 /* Make sure the placeholder always gets destroyed */
24492 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24497 /* Call the user's function, with the /i status as a parameter.
24498 * Note that we have gone to a lot of trouble to keep this call
24499 * from being within the locked mutex region. */
24500 XPUSHs(boolSV(to_fold));
24503 /* The following block was taken from swash_init(). Presumably
24504 * they apply to here as well, though we no longer use a swash --
24508 /* We might get here via a subroutine signature which uses a utf8
24509 * parameter name, at which point PL_subname will have been set
24510 * but not yet used. */
24511 save_item(PL_subname);
24513 /* G_SCALAR guarantees a single return value */
24514 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24519 if (TAINT_get || SvTRUE(error)) {
24520 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24521 if (SvTRUE(error)) {
24522 sv_catpvs(msg, "Error \"");
24523 sv_catsv(msg, error);
24524 sv_catpvs(msg, "\"");
24527 if (SvTRUE(error)) sv_catpvs(msg, "; ");
24528 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24531 if (name_len > 0) {
24532 sv_catpvs(msg, " in expansion of ");
24533 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24539 prop_definition = NULL;
24542 SV * contents = POPs;
24544 /* The contents is supposed to be the expansion of the property
24545 * definition. If the definition is deferrable, and we got an
24546 * empty string back, set a flag to later defer it (after clean
24549 && (! SvPOK(contents) || SvCUR(contents) == 0))
24551 empty_return = TRUE;
24553 else { /* Otherwise, call a function to check for valid syntax,
24556 prop_definition = handle_user_defined_property(
24558 is_utf8, to_fold, runtime,
24560 contents, user_defined_ptr,
24566 /* Here, we have the results of the expansion. Delete the
24567 * placeholder, and if the definition is now known, replace it with
24568 * that definition. We need exclusive access to the hash, and we
24569 * can't let anyone else in, between when we delete the placeholder
24570 * and add the permanent entry */
24571 USER_PROP_MUTEX_LOCK;
24573 S_delete_recursion_entry(aTHX_ SvPVX(key));
24575 if ( ! empty_return
24576 && (! prop_definition || is_invlist(prop_definition)))
24578 /* If we got success we use the inversion list defining the
24579 * property; otherwise use the error message */
24580 SWITCH_TO_GLOBAL_CONTEXT;
24581 (void) hv_store_ent(PL_user_def_props,
24584 ? newSVsv(prop_definition)
24590 /* All done, and the hash now has a permanent entry for this
24591 * property. Give up exclusive control */
24592 USER_PROP_MUTEX_UNLOCK;
24598 if (empty_return) {
24599 goto definition_deferred;
24602 if (prop_definition) {
24604 /* If the definition is for something not known at this time,
24605 * we toss it, and go return the main property name, as that's
24606 * the one the user will be aware of */
24607 if (! is_invlist(prop_definition)) {
24608 SvREFCNT_dec_NN(prop_definition);
24609 goto definition_deferred;
24612 sv_2mortal(prop_definition);
24616 return prop_definition;
24618 } /* End of calling the subroutine for the user-defined property */
24619 } /* End of it could be a user-defined property */
24621 /* Here it wasn't a user-defined property that is known at this time. See
24622 * if it is a Unicode property */
24624 lookup_len = j; /* This is a more mnemonic name than 'j' */
24626 /* Get the index into our pointer table of the inversion list corresponding
24627 * to the property */
24628 table_index = do_uniprop_match(lookup_name, lookup_len);
24630 /* If it didn't find the property ... */
24631 if (table_index == 0) {
24633 /* Try again stripping off any initial 'Is'. This is because we
24634 * promise that an initial Is is optional. The same isn't true of
24635 * names that start with 'In'. Those can match only blocks, and the
24636 * lookup table already has those accounted for. */
24637 if (starts_with_Is) {
24643 table_index = do_uniprop_match(lookup_name, lookup_len);
24646 if (table_index == 0) {
24649 /* Here, we didn't find it. If not a numeric type property, and
24650 * can't be a user-defined one, it isn't a legal property */
24651 if (! is_nv_type) {
24652 if (! could_be_user_defined) {
24656 /* Here, the property name is legal as a user-defined one. At
24657 * compile time, it might just be that the subroutine for that
24658 * property hasn't been encountered yet, but at runtime, it's
24659 * an error to try to use an undefined one */
24660 if (! deferrable) {
24661 goto unknown_user_defined;;
24664 goto definition_deferred;
24665 } /* End of isn't a numeric type property */
24667 /* The numeric type properties need more work to decide. What we
24668 * do is make sure we have the number in canonical form and look
24671 if (slash_pos < 0) { /* No slash */
24673 /* When it isn't a rational, take the input, convert it to a
24674 * NV, then create a canonical string representation of that
24678 SSize_t value_len = lookup_len - equals_pos;
24680 /* Get the value */
24681 if ( value_len <= 0
24682 || my_atof3(lookup_name + equals_pos, &value,
24684 != lookup_name + lookup_len)
24689 /* If the value is an integer, the canonical value is integral
24691 if (Perl_ceil(value) == value) {
24692 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24693 equals_pos, lookup_name, value);
24695 else { /* Otherwise, it is %e with a known precision */
24698 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24699 equals_pos, lookup_name,
24700 PL_E_FORMAT_PRECISION, value);
24702 /* The exponent generated is expecting two digits, whereas
24703 * %e on some systems will generate three. Remove leading
24704 * zeros in excess of 2 from the exponent. We start
24705 * looking for them after the '=' */
24706 exp_ptr = strchr(canonical + equals_pos, 'e');
24708 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24709 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24711 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24713 if (excess_exponent_len > 0) {
24714 SSize_t leading_zeros = strspn(cur_ptr, "0");
24715 SSize_t excess_leading_zeros
24716 = MIN(leading_zeros, excess_exponent_len);
24717 if (excess_leading_zeros > 0) {
24718 Move(cur_ptr + excess_leading_zeros,
24720 strlen(cur_ptr) - excess_leading_zeros
24721 + 1, /* Copy the NUL as well */
24728 else { /* Has a slash. Create a rational in canonical form */
24729 UV numerator, denominator, gcd, trial;
24730 const char * end_ptr;
24731 const char * sign = "";
24733 /* We can't just find the numerator, denominator, and do the
24734 * division, then use the method above, because that is
24735 * inexact. And the input could be a rational that is within
24736 * epsilon (given our precision) of a valid rational, and would
24737 * then incorrectly compare valid.
24739 * We're only interested in the part after the '=' */
24740 const char * this_lookup_name = lookup_name + equals_pos;
24741 lookup_len -= equals_pos;
24742 slash_pos -= equals_pos;
24744 /* Handle any leading minus */
24745 if (this_lookup_name[0] == '-') {
24747 this_lookup_name++;
24752 /* Convert the numerator to numeric */
24753 end_ptr = this_lookup_name + slash_pos;
24754 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24758 /* It better have included all characters before the slash */
24759 if (*end_ptr != '/') {
24763 /* Set to look at just the denominator */
24764 this_lookup_name += slash_pos;
24765 lookup_len -= slash_pos;
24766 end_ptr = this_lookup_name + lookup_len;
24768 /* Convert the denominator to numeric */
24769 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24773 /* It better be the rest of the characters, and don't divide by
24775 if ( end_ptr != this_lookup_name + lookup_len
24776 || denominator == 0)
24781 /* Get the greatest common denominator using
24782 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24784 trial = denominator;
24785 while (trial != 0) {
24787 trial = gcd % trial;
24791 /* If already in lowest possible terms, we have already tried
24792 * looking this up */
24797 /* Reduce the rational, which should put it in canonical form
24800 denominator /= gcd;
24802 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24803 equals_pos, lookup_name, sign, numerator, denominator);
24806 /* Here, we have the number in canonical form. Try that */
24807 table_index = do_uniprop_match(canonical, strlen(canonical));
24808 if (table_index == 0) {
24811 } /* End of still didn't find the property in our table */
24812 } /* End of didn't find the property in our table */
24814 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24815 * A negative return signifies that the real index is the absolute value,
24816 * but the result needs to be inverted */
24817 if (table_index < 0) {
24818 invert_return = TRUE;
24819 table_index = -table_index;
24822 /* Out-of band indices indicate a deprecated property. The proper index is
24823 * modulo it with the table size. And dividing by the table size yields
24824 * an offset into a table constructed by regen/mk_invlists.pl to contain
24825 * the corresponding warning message */
24826 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24827 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24828 table_index %= MAX_UNI_KEYWORD_INDEX;
24829 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24830 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24831 (int) name_len, name,
24832 get_deprecated_property_msg(warning_offset));
24835 /* In a few properties, a different property is used under /i. These are
24836 * unlikely to change, so are hard-coded here. */
24838 if ( table_index == UNI_XPOSIXUPPER
24839 || table_index == UNI_XPOSIXLOWER
24840 || table_index == UNI_TITLE)
24842 table_index = UNI_CASED;
24844 else if ( table_index == UNI_UPPERCASELETTER
24845 || table_index == UNI_LOWERCASELETTER
24846 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24847 || table_index == UNI_TITLECASELETTER
24850 table_index = UNI_CASEDLETTER;
24852 else if ( table_index == UNI_POSIXUPPER
24853 || table_index == UNI_POSIXLOWER)
24855 table_index = UNI_POSIXALPHA;
24859 /* Create and return the inversion list */
24860 prop_definition = get_prop_definition(table_index);
24861 sv_2mortal(prop_definition);
24863 /* See if there is a private use override to add to this definition */
24865 COPHH * hinthash = (IN_PERL_COMPILETIME)
24866 ? CopHINTHASH_get(&PL_compiling)
24867 : CopHINTHASH_get(PL_curcop);
24868 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24870 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24872 /* See if there is an element in the hints hash for this table */
24873 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24874 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24878 SV * pu_definition;
24880 SV * expanded_prop_definition =
24881 sv_2mortal(invlist_clone(prop_definition, NULL));
24883 /* If so, it's definition is the string from here to the next
24884 * \a character. And its format is the same as a user-defined
24886 pos += SvCUR(pu_lookup);
24887 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24888 pu_invlist = handle_user_defined_property(lookup_name,
24891 0, /* Not folded */
24899 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24900 sv_catpvs(msg, "Insecure private-use override");
24901 goto append_name_to_msg;
24904 /* For now, as a safety measure, make sure that it doesn't
24905 * override non-private use code points */
24906 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24908 /* Add it to the list to be returned */
24909 _invlist_union(prop_definition, pu_invlist,
24910 &expanded_prop_definition);
24911 prop_definition = expanded_prop_definition;
24912 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24917 if (invert_return) {
24918 _invlist_invert(prop_definition);
24920 return prop_definition;
24922 unknown_user_defined:
24923 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24924 sv_catpvs(msg, "Unknown user-defined property name");
24925 goto append_name_to_msg;
24928 if (non_pkg_begin != 0) {
24929 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24930 sv_catpvs(msg, "Illegal user-defined property name");
24933 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24934 sv_catpvs(msg, "Can't find Unicode property definition");
24938 append_name_to_msg:
24940 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24941 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24943 sv_catpv(msg, prefix);
24944 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24945 sv_catpv(msg, suffix);
24950 definition_deferred:
24953 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
24955 /* Here it could yet to be defined, so defer evaluation of this until
24956 * its needed at runtime. We need the fully qualified property name to
24957 * avoid ambiguity */
24959 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24963 /* If it didn't come with a package, or the package is utf8::, this
24964 * actually could be an official Unicode property whose inclusion we
24965 * are deferring until runtime to make sure that it isn't overridden by
24966 * a user-defined property of the same name (which we haven't
24967 * encountered yet). Add a marker to indicate this possibility, for
24968 * use at such time when we first need the definition during pattern
24969 * matching execution */
24970 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
24971 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
24974 /* We also need a trailing newline */
24975 sv_catpvs(fq_name, "\n");
24977 *user_defined_ptr = TRUE;
24983 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
24984 const STRLEN wname_len, /* Its length */
24985 SV ** prop_definition,
24988 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
24989 * any matches, adding them to prop_definition */
24993 CV * get_names_info; /* entry to charnames.pm to get info we need */
24994 SV * names_string; /* Contains all character names, except algo */
24995 SV * algorithmic_names; /* Contains info about algorithmically
24996 generated character names */
24997 REGEXP * subpattern_re; /* The user's pattern to match with */
24998 struct regexp * prog; /* The compiled pattern */
24999 char * all_names_start; /* lib/unicore/Name.pl string of every
25000 (non-algorithmic) character name */
25001 char * cur_pos; /* We match, effectively using /gc; this is
25002 where we are now */
25003 bool found_matches = FALSE; /* Did any name match so far? */
25004 SV * empty; /* For matching zero length names */
25005 SV * must; /* What substring, if any, must be in a name
25006 for the subpattern to match */
25007 SV * syllable_name = NULL; /* For Hangul syllables */
25008 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25009 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25011 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25012 * syllable name, and these are immutable and guaranteed by the Unicode
25013 * standard to never be extended */
25014 const STRLEN syl_max_len = hangul_prefix_len + 7;
25018 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25020 /* Make sure _charnames is loaded. (The parameters give context
25021 * for any errors generated */
25022 get_names_info = get_cv("_charnames::_get_names_info", 0);
25023 if (! get_names_info) {
25024 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25027 /* Get the charnames data */
25028 PUSHSTACKi(PERLSI_REGCOMP);
25036 /* Special _charnames entry point that returns the info this routine
25038 call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25042 /* Data structure for names which end in their very own code points */
25043 algorithmic_names = POPs;
25044 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25046 /* The lib/unicore/Name.pl string */
25047 names_string = POPs;
25048 SvREFCNT_inc_simple_void_NN(names_string);
25055 if ( ! SvROK(names_string)
25056 || ! SvROK(algorithmic_names))
25057 { /* Perhaps should panic instead XXX */
25058 SvREFCNT_dec(names_string);
25059 SvREFCNT_dec(algorithmic_names);
25063 names_string = sv_2mortal(SvRV(names_string));
25064 all_names_start = SvPVX(names_string);
25065 cur_pos = all_names_start;
25067 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25069 /* Compile the subpattern consisting of the name being looked for */
25070 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25071 must = re_intuit_string(subpattern_re);
25072 prog = ReANY(subpattern_re);
25074 /* If only nothing is matched, skip to where empty names are looked for */
25075 if (prog->maxlen == 0) {
25079 /* And match against the string of all names /gc. Don't even try if it
25080 * must match a character not found in any name. */
25082 || SvCUR(must) == 0
25083 || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()")
25086 while (execute_wildcard(subpattern_re,
25088 SvEND(names_string),
25089 all_names_start, 0,
25092 { /* Here, matched. */
25094 /* Note the string entries look like
25095 * 00001\nSTART OF HEADING\n\n
25096 * so we could match anywhere in that string. We have to rule out
25097 * matching a code point line */
25098 char * this_name_start = all_names_start
25099 + RX_OFFS(subpattern_re)->start;
25100 char * this_name_end = all_names_start
25101 + RX_OFFS(subpattern_re)->end;
25104 UV cp = 0; /* Silences some compilers */
25105 AV * this_string = NULL;
25106 bool is_multi = FALSE;
25108 /* If matched nothing, advance to next possible match */
25109 if (this_name_start == this_name_end) {
25110 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25111 SvEND(names_string) - this_name_end);
25112 if (cur_pos == NULL) {
25117 /* Position the next match to start beyond the current returned
25119 cur_pos = (char *) memchr(this_name_end, '\n',
25120 SvEND(names_string) - this_name_end);
25123 /* Back up to the \n just before the beginning of the character. */
25124 cp_end = (char *) my_memrchr(all_names_start,
25126 this_name_start - all_names_start);
25128 /* If we didn't find a \n, it means it matched somewhere in the
25129 * initial '00000' in the string, so isn't a real match */
25130 if (cp_end == NULL) {
25134 this_name_start = cp_end + 1; /* The name starts just after */
25135 cp_end--; /* the \n, and the code point */
25136 /* ends just before it */
25138 /* All code points are 5 digits long */
25139 cp_start = cp_end - 4;
25141 /* This shouldn't happen, as we found a \n, and the first \n is
25142 * further along than what we subtracted */
25143 assert(cp_start >= all_names_start);
25145 if (cp_start == all_names_start) {
25146 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25150 /* If the character is a blank, we either have a named sequence, or
25151 * something is wrong */
25152 if (*(cp_start - 1) == ' ') {
25153 cp_start = (char *) my_memrchr(all_names_start,
25155 cp_start - all_names_start);
25159 assert(cp_start != NULL && cp_start >= all_names_start + 2);
25161 /* Except for the first line in the string, the sequence before the
25162 * code point is \n\n. If that isn't the case here, we didn't
25163 * match the name of a character. (We could have matched a named
25164 * sequence, not currently handled */
25165 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25169 /* We matched! Add this to the list */
25170 found_matches = TRUE;
25172 /* Loop through all the code points in the sequence */
25173 while (cp_start < cp_end) {
25175 /* Calculate this code point from its 5 digits */
25176 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25177 + (XDIGIT_VALUE(cp_start[1]) << 12)
25178 + (XDIGIT_VALUE(cp_start[2]) << 8)
25179 + (XDIGIT_VALUE(cp_start[3]) << 4)
25180 + XDIGIT_VALUE(cp_start[4]);
25182 cp_start += 6; /* Go past any blank */
25184 if (cp_start < cp_end || is_multi) {
25185 if (this_string == NULL) {
25186 this_string = newAV();
25190 av_push(this_string, newSVuv(cp));
25194 if (is_multi) { /* Was more than one code point */
25195 if (*strings == NULL) {
25196 *strings = newAV();
25199 av_push(*strings, (SV *) this_string);
25201 else { /* Only a single code point */
25202 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25204 } /* End of loop through the non-algorithmic names string */
25207 /* There are also character names not in 'names_string'. These are
25208 * algorithmically generatable. Try this pattern on each possible one.
25209 * (khw originally planned to leave this out given the large number of
25210 * matches attempted; but the speed turned out to be quite acceptable
25212 * There are plenty of opportunities to optimize to skip many of the tests.
25213 * beyond the rudimentary ones already here */
25215 /* First see if the subpattern matches any of the algorithmic generatable
25216 * Hangul syllable names.
25218 * We know none of these syllable names will match if the input pattern
25219 * requires more bytes than any syllable has, or if the input pattern only
25220 * matches an empty name, or if the pattern has something it must match and
25221 * one of the characters in that isn't in any Hangul syllable. */
25222 if ( prog->minlen <= (SSize_t) syl_max_len
25223 && prog->maxlen > 0
25225 || SvCUR(must) == 0
25226 || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must)))
25228 /* These constants, names, values, and algorithm are adapted from the
25229 * Unicode standard, version 5.1, section 3.12, and should never
25231 const char * JamoL[] = {
25232 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25233 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25235 const int LCount = C_ARRAY_LENGTH(JamoL);
25237 const char * JamoV[] = {
25238 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25239 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25242 const int VCount = C_ARRAY_LENGTH(JamoV);
25244 const char * JamoT[] = {
25245 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25246 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25247 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25249 const int TCount = C_ARRAY_LENGTH(JamoT);
25253 /* This is the initial Hangul syllable code point; each time through the
25254 * inner loop, it maps to the next higher code point. For more info,
25255 * see the Hangul syllable section of the Unicode standard. */
25258 syllable_name = sv_2mortal(newSV(syl_max_len));
25259 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25261 for (L = 0; L < LCount; L++) {
25262 for (V = 0; V < VCount; V++) {
25263 for (T = 0; T < TCount; T++) {
25265 /* Truncate back to the prefix, which is unvarying */
25266 SvCUR_set(syllable_name, hangul_prefix_len);
25268 sv_catpv(syllable_name, JamoL[L]);
25269 sv_catpv(syllable_name, JamoV[V]);
25270 sv_catpv(syllable_name, JamoT[T]);
25272 if (execute_wildcard(subpattern_re,
25273 SvPVX(syllable_name),
25274 SvEND(syllable_name),
25275 SvPVX(syllable_name), 0,
25279 *prop_definition = add_cp_to_invlist(*prop_definition,
25281 found_matches = TRUE;
25290 /* The rest of the algorithmically generatable names are of the form
25291 * "PREFIX-code_point". The prefixes and the code point limits of each
25292 * were returned to us in the array 'algorithmic_names' from data in
25293 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
25294 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25297 /* Each element of the array is a hash, giving the details for the
25298 * series of names it covers. There is the base name of the characters
25299 * in the series, and the low and high code points in the series. And,
25300 * for optimization purposes a string containing all the legal
25301 * characters that could possibly be in a name in this series. */
25302 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25303 SV * prefix = * hv_fetchs(this_series, "name", 0);
25304 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25305 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25306 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25308 /* Pre-allocate an SV with enough space */
25309 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25311 if (high >= 0x10000) {
25312 sv_catpvs(algo_name, "0");
25315 /* This series can be skipped entirely if the pattern requires
25316 * something longer than any name in the series, or can only match an
25317 * empty name, or contains a character not found in any name in the
25319 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
25320 && prog->maxlen > 0
25322 || SvCUR(must) == 0
25323 || strspn(SvPVX(must), legal) == SvCUR(must)))
25325 for (j = low; j <= high; j++) { /* For each code point in the series */
25327 /* Get its name, and see if it matches the subpattern */
25328 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25331 if (execute_wildcard(subpattern_re,
25334 SvPVX(algo_name), 0,
25338 *prop_definition = add_cp_to_invlist(*prop_definition, j);
25339 found_matches = TRUE;
25346 /* Finally, see if the subpattern matches an empty string */
25347 empty = newSVpvs("");
25348 if (execute_wildcard(subpattern_re,
25355 /* Many code points have empty names. Currently these are the \p{GC=C}
25356 * ones, minus CC and CF */
25358 SV * empty_names_ref = get_prop_definition(UNI_C);
25359 SV * empty_names = invlist_clone(empty_names_ref, NULL);
25361 SV * subtract = get_prop_definition(UNI_CC);
25363 _invlist_subtract(empty_names, subtract, &empty_names);
25364 SvREFCNT_dec_NN(empty_names_ref);
25365 SvREFCNT_dec_NN(subtract);
25367 subtract = get_prop_definition(UNI_CF);
25368 _invlist_subtract(empty_names, subtract, &empty_names);
25369 SvREFCNT_dec_NN(subtract);
25371 _invlist_union(*prop_definition, empty_names, prop_definition);
25372 found_matches = TRUE;
25373 SvREFCNT_dec_NN(empty_names);
25375 SvREFCNT_dec_NN(empty);
25378 /* If we ever were to accept aliases for, say private use names, we would
25379 * need to do something fancier to find empty names. The code below works
25380 * (at the time it was written), and is slower than the above */
25381 const char empties_pat[] = "^.";
25382 if (strNE(name, empties_pat)) {
25383 SV * empty = newSVpvs("");
25384 if (execute_wildcard(subpattern_re,
25391 SV * empties = NULL;
25393 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25395 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25396 SvREFCNT_dec_NN(empties);
25398 found_matches = TRUE;
25400 SvREFCNT_dec_NN(empty);
25404 SvREFCNT_dec_NN(subpattern_re);
25405 return found_matches;
25409 * ex: set ts=8 sts=4 sw=4 et: