5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
147 #define STATIC static
150 /* this is a chain of data about sub patterns we are processing that
151 need to be handled separately/specially in study_chunk. Its so
152 we can simulate recursion without losing state. */
154 typedef struct scan_frame {
155 regnode *last_regnode; /* last node to process in this frame */
156 regnode *next_regnode; /* next node to process when last is reached */
157 U32 prev_recursed_depth;
158 I32 stopparen; /* what stopparen do we use */
159 bool in_gosub; /* this or an outer frame is for GOSUB */
161 struct scan_frame *this_prev_frame; /* this previous frame */
162 struct scan_frame *prev_frame; /* previous frame */
163 struct scan_frame *next_frame; /* next frame */
166 /* Certain characters are output as a sequence with the first being a
168 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
171 struct RExC_state_t {
172 U32 flags; /* RXf_* are we folding, multilining? */
173 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
174 char *precomp; /* uncompiled string. */
175 char *precomp_end; /* pointer to end of uncompiled string. */
176 REGEXP *rx_sv; /* The SV that is the regexp. */
177 regexp *rx; /* perl core regexp structure */
178 regexp_internal *rxi; /* internal data for regexp object
180 char *start; /* Start of input for compile */
181 char *end; /* End of input for compile */
182 char *parse; /* Input-scan pointer. */
183 char *copy_start; /* start of copy of input within
184 constructed parse string */
185 char *save_copy_start; /* Provides one level of saving
186 and restoring 'copy_start' */
187 char *copy_start_in_input; /* Position in input string
188 corresponding to copy_start */
189 SSize_t whilem_seen; /* number of WHILEM in this expr */
190 regnode *emit_start; /* Start of emitted-code area */
191 regnode_offset emit; /* Code-emit pointer */
192 I32 naughty; /* How bad is this pattern? */
193 I32 sawback; /* Did we see \1, ...? */
194 SSize_t size; /* Number of regnode equivalents in
196 Size_t sets_depth; /* Counts recursion depth of already-
197 compiled regex set patterns */
200 I32 parens_buf_size; /* #slots malloced open/close_parens */
201 regnode_offset *open_parens; /* offsets to open parens */
202 regnode_offset *close_parens; /* offsets to close parens */
203 HV *paren_names; /* Paren names */
205 /* position beyond 'precomp' of the warning message furthest away from
206 * 'precomp'. During the parse, no warnings are raised for any problems
207 * earlier in the parse than this position. This works if warnings are
208 * raised the first time a given spot is parsed, and if only one
209 * independent warning is raised for any given spot */
210 Size_t latest_warn_offset;
212 I32 npar; /* Capture buffer count so far in the
213 parse, (OPEN) plus one. ("par" 0 is
215 I32 total_par; /* During initial parse, is either 0,
216 or -1; the latter indicating a
217 reparse is needed. After that pass,
218 it is what 'npar' became after the
219 pass. Hence, it being > 0 indicates
220 we are in a reparse situation */
221 I32 nestroot; /* root parens we are in - used by
224 regnode *end_op; /* END node in program */
225 I32 utf8; /* whether the pattern is utf8 or not */
226 I32 orig_utf8; /* whether the pattern was originally in utf8 */
227 /* XXX use this for future optimisation of case
228 * where pattern must be upgraded to utf8. */
229 I32 uni_semantics; /* If a d charset modifier should use unicode
230 rules, even if the pattern is not in
233 I32 recurse_count; /* Number of recurse regops we have generated */
234 regnode **recurse; /* Recurse regops */
235 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
237 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
240 I32 override_recoding;
241 I32 recode_x_to_native;
242 I32 in_multi_char_class;
243 int code_index; /* next code_blocks[] slot */
244 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
246 SSize_t maxlen; /* mininum possible number of chars in string to match */
247 scan_frame *frame_head;
248 scan_frame *frame_last;
252 SV *runtime_code_qr; /* qr with the runtime code blocks */
254 const char *lastparse;
256 U32 study_chunk_recursed_count;
257 AV *paren_name_list; /* idx -> name */
261 #define RExC_lastparse (pRExC_state->lastparse)
262 #define RExC_lastnum (pRExC_state->lastnum)
263 #define RExC_paren_name_list (pRExC_state->paren_name_list)
264 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
265 #define RExC_mysv (pRExC_state->mysv1)
266 #define RExC_mysv1 (pRExC_state->mysv1)
267 #define RExC_mysv2 (pRExC_state->mysv2)
275 bool sWARN_EXPERIMENTAL__VLB;
276 bool sWARN_EXPERIMENTAL__REGEX_SETS;
279 #define RExC_flags (pRExC_state->flags)
280 #define RExC_pm_flags (pRExC_state->pm_flags)
281 #define RExC_precomp (pRExC_state->precomp)
282 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
283 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
284 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
285 #define RExC_precomp_end (pRExC_state->precomp_end)
286 #define RExC_rx_sv (pRExC_state->rx_sv)
287 #define RExC_rx (pRExC_state->rx)
288 #define RExC_rxi (pRExC_state->rxi)
289 #define RExC_start (pRExC_state->start)
290 #define RExC_end (pRExC_state->end)
291 #define RExC_parse (pRExC_state->parse)
292 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
293 #define RExC_whilem_seen (pRExC_state->whilem_seen)
294 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
295 under /d from /u ? */
297 #define RExC_emit (pRExC_state->emit)
298 #define RExC_emit_start (pRExC_state->emit_start)
299 #define RExC_sawback (pRExC_state->sawback)
300 #define RExC_seen (pRExC_state->seen)
301 #define RExC_size (pRExC_state->size)
302 #define RExC_maxlen (pRExC_state->maxlen)
303 #define RExC_npar (pRExC_state->npar)
304 #define RExC_total_parens (pRExC_state->total_par)
305 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
306 #define RExC_nestroot (pRExC_state->nestroot)
307 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
308 #define RExC_utf8 (pRExC_state->utf8)
309 #define RExC_uni_semantics (pRExC_state->uni_semantics)
310 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
311 #define RExC_open_parens (pRExC_state->open_parens)
312 #define RExC_close_parens (pRExC_state->close_parens)
313 #define RExC_end_op (pRExC_state->end_op)
314 #define RExC_paren_names (pRExC_state->paren_names)
315 #define RExC_recurse (pRExC_state->recurse)
316 #define RExC_recurse_count (pRExC_state->recurse_count)
317 #define RExC_sets_depth (pRExC_state->sets_depth)
318 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
319 #define RExC_study_chunk_recursed_bytes \
320 (pRExC_state->study_chunk_recursed_bytes)
321 #define RExC_in_lookaround (pRExC_state->in_lookaround)
322 #define RExC_contains_locale (pRExC_state->contains_locale)
323 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
326 # define SET_recode_x_to_native(x) \
327 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
329 # define SET_recode_x_to_native(x) NOOP
332 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
333 #define RExC_frame_head (pRExC_state->frame_head)
334 #define RExC_frame_last (pRExC_state->frame_last)
335 #define RExC_frame_count (pRExC_state->frame_count)
336 #define RExC_strict (pRExC_state->strict)
337 #define RExC_study_started (pRExC_state->study_started)
338 #define RExC_warn_text (pRExC_state->warn_text)
339 #define RExC_in_script_run (pRExC_state->in_script_run)
340 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
341 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
342 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
343 #define RExC_unlexed_names (pRExC_state->unlexed_names)
346 /***********************************************************************/
347 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
349 * All of these macros depend on the above RExC_ accessor macros, which
350 * in turns depend on a variable pRExC_state being in scope where they
351 * are used. This is the standard regexp parser context variable which is
352 * passed into every non-trivial parse function in this file.
354 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
355 * of the macros which do not take an argument will operate on the
356 * pRExC_state structure *only*.
358 * Please do NOT modify RExC_parse without using these macros. In the
359 * future these macros will be extended for enhanced debugging and trace
360 * output during the parse process.
363 /* RExC_parse_incf(flag)
365 * Increment RExC_parse to point at the next codepoint, while doing
366 * the right thing depending on whether we are parsing UTF-8 strings
367 * or not. The 'flag' argument determines if content is UTF-8 or not,
368 * intended for cases where this is NOT governed by the UTF macro.
370 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
372 * WARNING: Does NOT take into account RExC_end; it is the callers
373 * responsibility to make sure there are enough octets left in
374 * RExC_parse to ensure that when processing UTF-8 we would not read
375 * past the end of the string.
377 #define RExC_parse_incf(flag) STMT_START { \
378 RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \
381 /* RExC_parse_inc_safef(flag)
383 * Safely increment RExC_parse to point at the next codepoint,
384 * doing the right thing depending on whether we are parsing
385 * UTF-8 strings or not and NOT reading past the end of the buffer.
386 * The 'flag' argument determines if content is UTF-8 or not,
387 * intended for cases where this is NOT governed by the UTF macro.
389 * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro.
391 * NOTE: Will NOT read past RExC_end when content is UTF-8.
393 #define RExC_parse_inc_safef(flag) STMT_START { \
394 RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
399 * Increment RExC_parse to point at the next codepoint,
400 * doing the right thing depending on whether we are parsing
401 * UTF-8 strings or not.
403 * WARNING: Does NOT take into account RExC_end, it is the callers
404 * responsibility to make sure there are enough octets left in
405 * RExC_parse to ensure that when processing UTF-8 we would not read
406 * past the end of the string.
408 * NOTE: whether we are parsing UTF-8 or not is determined by the
409 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
410 * macro operates on the pRExC_state structure only.
412 #define RExC_parse_inc() RExC_parse_incf(UTF)
414 /* RExC_parse_inc_safe()
416 * Safely increment RExC_parse to point at the next codepoint,
417 * doing the right thing depending on whether we are parsing
418 * UTF-8 strings or not and NOT reading past the end of the buffer.
420 * NOTE: whether we are parsing UTF-8 or not is determined by the
421 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
422 * macro operates on the pRExC_state structure only.
424 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
426 /* RExC_parse_inc_utf8()
428 * Increment RExC_parse to point at the next utf8 codepoint,
429 * assumes content is UTF-8.
431 * WARNING: Does NOT take into account RExC_end; it is the callers
432 * responsibility to make sure there are enough octets left in RExC_parse
433 * to ensure that when processing UTF-8 we would not read past the end
436 #define RExC_parse_inc_utf8() STMT_START { \
437 RExC_parse += UTF8SKIP(RExC_parse); \
440 /* RExC_parse_inc_if_char()
442 * Increment RExC_parse to point at the next codepoint, if and only
443 * if the current parse point is NOT a NULL, while doing the right thing
444 * depending on whether we are parsing UTF-8 strings or not.
446 * WARNING: Does NOT take into account RExC_end, it is the callers
447 * responsibility to make sure there are enough octets left in RExC_parse
448 * to ensure that when processing UTF-8 we would not read past the end
451 * NOTE: whether we are parsing UTF-8 or not is determined by the
452 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
453 * macro operates on the pRExC_state structure only.
455 #define RExC_parse_inc_if_char() STMT_START { \
456 RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \
459 /* RExC_parse_inc_by(n_octets)
461 * Increment the parse cursor by the number of octets specified by
462 * the 'n_octets' argument.
464 * NOTE: Does NOT check ANY constraints. It is the callers responsibility
465 * that this will not move past the end of the string, or leave the
466 * pointer in the middle of a UTF-8 sequence.
468 * Typically used to advanced past previously analyzed content.
470 #define RExC_parse_inc_by(n_octets) STMT_START { \
471 RExC_parse += (n_octets); \
474 /* RExC_parse_set(to_ptr)
476 * Sets the RExC_parse pointer to the pointer specified by the 'to'
477 * argument. No validation whatsoever is performed on the to pointer.
479 #define RExC_parse_set(to_ptr) STMT_START { \
480 RExC_parse = (to_ptr); \
483 /**********************************************************************/
485 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
486 * a flag to disable back-off on the fixed/floating substrings - if it's
487 * a high complexity pattern we assume the benefit of avoiding a full match
488 * is worth the cost of checking for the substrings even if they rarely help.
490 #define RExC_naughty (pRExC_state->naughty)
491 #define TOO_NAUGHTY (10)
492 #define MARK_NAUGHTY(add) \
493 if (RExC_naughty < TOO_NAUGHTY) \
494 RExC_naughty += (add)
495 #define MARK_NAUGHTY_EXP(exp, add) \
496 if (RExC_naughty < TOO_NAUGHTY) \
497 RExC_naughty += RExC_naughty / (exp) + (add)
499 #define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
500 #define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \
501 || ((*s) == '{' && regcurly(s, e, NULL)))
504 * Flags to be passed up.
506 #define HASWIDTH 0x01 /* Known to not match null strings, could match
508 #define SIMPLE 0x02 /* Exactly one character wide */
509 /* (or LNBREAK as a special case) */
510 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
511 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
512 #define RESTART_PARSE 0x20 /* Need to redo the parse */
513 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
514 calcuate sizes as UTF-8 */
516 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
518 /* whether trie related optimizations are enabled */
519 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
520 #define TRIE_STUDY_OPT
521 #define FULL_TRIE_STUDY
525 /* About the term "restudy" and the var "restudied" and the defines
526 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
527 * doing multiple study_chunk() calls over the same set of opcodes for* the
528 * purpose of enhanced TRIE optimizations.
530 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
531 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
532 * which then causes the Perl_re_op_compile() to then call the optimizer
533 * S_study_chunk() a second time to perform additional optimizations,
534 * including the aho_corasick startclass optimization.
535 * This additional pass will only happen once, which is managed by the
536 * 'restudied' variable in Perl_re_op_compile().
538 * When this second pass is under way the flags passed into study_chunk() will
539 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
540 * to any recursive calls to S_study_chunk().
542 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
543 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
546 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
547 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
551 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
552 #define PBITVAL(paren) (1 << ((paren) & 7))
553 #define PAREN_OFFSET(depth) \
554 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
555 #define PAREN_TEST(depth, paren) \
556 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
557 #define PAREN_SET(depth, paren) \
558 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
559 #define PAREN_UNSET(depth, paren) \
560 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
562 #define REQUIRE_UTF8(flagp) STMT_START { \
564 *flagp = RESTART_PARSE|NEED_UTF8; \
569 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
570 * pattern is in UTF-8. This latter condition is in case the outermost rules
571 * are locale. See GH #17278 */
572 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
574 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
575 * a flag that indicates we need to override /d with /u as a result of
576 * something in the pattern. It should only be used in regards to calling
577 * set_regex_charset() or get_regex_charset() */
578 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
580 if (DEPENDS_SEMANTICS) { \
581 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
582 RExC_uni_semantics = 1; \
583 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
584 /* No need to restart the parse if we haven't seen \
585 * anything that differs between /u and /d, and no need \
586 * to restart immediately if we're going to reparse \
587 * anyway to count parens */ \
588 *flagp |= RESTART_PARSE; \
589 return restart_retval; \
594 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
596 RExC_use_BRANCHJ = 1; \
597 *flagp |= RESTART_PARSE; \
598 return restart_retval; \
601 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
602 * less. After that, it must always be positive, because the whole re is
603 * considered to be surrounded by virtual parens. Setting it to negative
604 * indicates there is some construct that needs to know the actual number of
605 * parens to be properly handled. And that means an extra pass will be
606 * required after we've counted them all */
607 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
608 #define REQUIRE_PARENS_PASS \
609 STMT_START { /* No-op if have completed a pass */ \
610 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
612 #define IN_PARENS_PASS (RExC_total_parens < 0)
615 /* This is used to return failure (zero) early from the calling function if
616 * various flags in 'flags' are set. Two flags always cause a return:
617 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
618 * additional flags that should cause a return; 0 if none. If the return will
619 * be done, '*flagp' is first set to be all of the flags that caused the
621 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
623 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
624 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
629 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
631 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
632 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
633 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
634 if (MUST_RESTART(*(flagp))) return 0
636 /* This converts the named class defined in regcomp.h to its equivalent class
637 * number defined in handy.h. */
638 #define namedclass_to_classnum(class) ((int) ((class) / 2))
639 #define classnum_to_namedclass(classnum) ((classnum) * 2)
641 #define _invlist_union_complement_2nd(a, b, output) \
642 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
643 #define _invlist_intersection_complement_2nd(a, b, output) \
644 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
646 /* We add a marker if we are deferring expansion of a property that is both
647 * 1) potentiallly user-defined; and
648 * 2) could also be an official Unicode property.
650 * Without this marker, any deferred expansion can only be for a user-defined
651 * one. This marker shouldn't conflict with any that could be in a legal name,
652 * and is appended to its name to indicate this. There is a string and
654 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
655 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
657 /* What is infinity for optimization purposes */
658 #define OPTIMIZE_INFTY SSize_t_MAX
660 /* About scan_data_t.
662 During optimisation we recurse through the regexp program performing
663 various inplace (keyhole style) optimisations. In addition study_chunk
664 and scan_commit populate this data structure with information about
665 what strings MUST appear in the pattern. We look for the longest
666 string that must appear at a fixed location, and we look for the
667 longest string that may appear at a floating location. So for instance
672 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
673 strings (because they follow a .* construct). study_chunk will identify
674 both FOO and BAR as being the longest fixed and floating strings respectively.
676 The strings can be composites, for instance
680 will result in a composite fixed substring 'foo'.
682 For each string some basic information is maintained:
685 This is the position the string must appear at, or not before.
686 It also implicitly (when combined with minlenp) tells us how many
687 characters must match before the string we are searching for.
688 Likewise when combined with minlenp and the length of the string it
689 tells us how many characters must appear after the string we have
693 Only used for floating strings. This is the rightmost point that
694 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
695 string can occur infinitely far to the right.
696 For fixed strings, it is equal to min_offset.
699 A pointer to the minimum number of characters of the pattern that the
700 string was found inside. This is important as in the case of positive
701 lookahead or positive lookbehind we can have multiple patterns
706 The minimum length of the pattern overall is 3, the minimum length
707 of the lookahead part is 3, but the minimum length of the part that
708 will actually match is 1. So 'FOO's minimum length is 3, but the
709 minimum length for the F is 1. This is important as the minimum length
710 is used to determine offsets in front of and behind the string being
711 looked for. Since strings can be composites this is the length of the
712 pattern at the time it was committed with a scan_commit. Note that
713 the length is calculated by study_chunk, so that the minimum lengths
714 are not known until the full pattern has been compiled, thus the
715 pointer to the value.
719 In the case of lookbehind the string being searched for can be
720 offset past the start point of the final matching string.
721 If this value was just blithely removed from the min_offset it would
722 invalidate some of the calculations for how many chars must match
723 before or after (as they are derived from min_offset and minlen and
724 the length of the string being searched for).
725 When the final pattern is compiled and the data is moved from the
726 scan_data_t structure into the regexp structure the information
727 about lookbehind is factored in, with the information that would
728 have been lost precalculated in the end_shift field for the
731 The fields pos_min and pos_delta are used to store the minimum offset
732 and the delta to the maximum offset at the current point in the pattern.
736 struct scan_data_substrs {
737 SV *str; /* longest substring found in pattern */
738 SSize_t min_offset; /* earliest point in string it can appear */
739 SSize_t max_offset; /* latest point in string it can appear */
740 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
741 SSize_t lookbehind; /* is the pos of the string modified by LB */
742 I32 flags; /* per substring SF_* and SCF_* flags */
745 typedef struct scan_data_t {
746 /*I32 len_min; unused */
747 /*I32 len_delta; unused */
751 SSize_t last_end; /* min value, <0 unless valid. */
752 SSize_t last_start_min;
753 SSize_t last_start_max;
754 U8 cur_is_floating; /* whether the last_* values should be set as
755 * the next fixed (0) or floating (1)
758 /* [0] is longest fixed substring so far, [1] is longest float so far */
759 struct scan_data_substrs substrs[2];
761 I32 flags; /* common SF_* and SCF_* flags */
763 SSize_t *last_closep;
764 regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
765 seen. DO NOT DEREFERENCE the regnode
766 pointer - the op may have been optimized
768 regnode_ssc *start_class;
772 * Forward declarations for pregcomp()'s friends.
775 static const scan_data_t zero_scan_data = {
776 0, 0, NULL, 0, 0, 0, 0,
778 { NULL, 0, 0, 0, 0, 0 },
779 { NULL, 0, 0, 0, 0, 0 },
781 0, 0, NULL, NULL, NULL
786 #define SF_BEFORE_SEOL 0x0001
787 #define SF_BEFORE_MEOL 0x0002
788 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
790 #define SF_IS_INF 0x0040
791 #define SF_HAS_PAR 0x0080
792 #define SF_IN_PAR 0x0100
793 #define SF_HAS_EVAL 0x0200
796 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
797 * longest substring in the pattern. When it is not set the optimiser keeps
798 * track of position, but does not keep track of the actual strings seen,
800 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
803 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
804 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
805 * turned off because of the alternation (BRANCH). */
806 #define SCF_DO_SUBSTR 0x0400
808 #define SCF_DO_STCLASS_AND 0x0800
809 #define SCF_DO_STCLASS_OR 0x1000
810 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
811 #define SCF_WHILEM_VISITED_POS 0x2000
813 #define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()?
814 Search for "restudy" in this file
815 to find a detailed explanation.*/
816 #define SCF_SEEN_ACCEPT 0x8000
817 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
818 Search for "restudy" in this file
819 to find a detailed explanation. */
820 #define SCF_IN_DEFINE 0x20000
824 #define UTF cBOOL(RExC_utf8)
826 /* The enums for all these are ordered so things work out correctly */
827 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
828 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
829 == REGEX_DEPENDS_CHARSET)
830 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
831 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
832 >= REGEX_UNICODE_CHARSET)
833 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
834 == REGEX_ASCII_RESTRICTED_CHARSET)
835 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
836 >= REGEX_ASCII_RESTRICTED_CHARSET)
837 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
838 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
840 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
842 /* For programs that want to be strictly Unicode compatible by dying if any
843 * attempt is made to match a non-Unicode code point against a Unicode
845 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
847 #define OOB_NAMEDCLASS -1
849 /* There is no code point that is out-of-bounds, so this is problematic. But
850 * its only current use is to initialize a variable that is always set before
852 #define OOB_UNICODE 0xDEADBEEF
854 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
857 /* length of regex to show in messages that don't mark a position within */
858 #define RegexLengthToShowInErrorMessages 127
861 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
862 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
863 * op/pragma/warn/regcomp.
865 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
866 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
868 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
869 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
871 /* The code in this file in places uses one level of recursion with parsing
872 * rebased to an alternate string constructed by us in memory. This can take
873 * the form of something that is completely different from the input, or
874 * something that uses the input as part of the alternate. In the first case,
875 * there should be no possibility of an error, as we are in complete control of
876 * the alternate string. But in the second case we don't completely control
877 * the input portion, so there may be errors in that. Here's an example:
879 * is handled specially because \x{df} folds to a sequence of more than one
880 * character: 'ss'. What is done is to create and parse an alternate string,
881 * which looks like this:
882 * /(?:\x{DF}|[abc\x{DF}def])/ui
883 * where it uses the input unchanged in the middle of something it constructs,
884 * which is a branch for the DF outside the character class, and clustering
885 * parens around the whole thing. (It knows enough to skip the DF inside the
886 * class while in this substitute parse.) 'abc' and 'def' may have errors that
887 * need to be reported. The general situation looks like this:
889 * |<------- identical ------>|
891 * Input: ---------------------------------------------------------------
892 * Constructed: ---------------------------------------------------
894 * |<------- identical ------>|
896 * sI..eI is the portion of the input pattern we are concerned with here.
897 * sC..EC is the constructed substitute parse string.
898 * sC..tC is constructed by us
899 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
900 * In the diagram, these are vertically aligned.
901 * eC..EC is also constructed by us.
902 * xC is the position in the substitute parse string where we found a
904 * xI is the position in the original pattern corresponding to xC.
906 * We want to display a message showing the real input string. Thus we need to
907 * translate from xC to xI. We know that xC >= tC, since the portion of the
908 * string sC..tC has been constructed by us, and so shouldn't have errors. We
910 * xI = tI + (xC - tC)
912 * When the substitute parse is constructed, the code needs to set:
915 * RExC_copy_start_in_input (tI)
916 * RExC_copy_start_in_constructed (tC)
917 * and restore them when done.
919 * During normal processing of the input pattern, both
920 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
921 * sI, so that xC equals xI.
924 #define sI RExC_precomp
925 #define eI RExC_precomp_end
926 #define sC RExC_start
928 #define tI RExC_copy_start_in_input
929 #define tC RExC_copy_start_in_constructed
930 #define xI(xC) (tI + (xC - tC))
931 #define xI_offset(xC) (xI(xC) - sI)
933 #define REPORT_LOCATION_ARGS(xC) \
935 (xI(xC) > eI) /* Don't run off end */ \
936 ? eI - sI /* Length before the <--HERE */ \
937 : ((xI_offset(xC) >= 0) \
939 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
940 IVdf " trying to output message for " \
942 __FILE__, __LINE__, (IV) xI_offset(xC), \
943 ((int) (eC - sC)), sC), 0)), \
944 sI), /* The input pattern printed up to the <--HERE */ \
946 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
947 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
949 /* Used to point after bad bytes for an error message, but avoid skipping
950 * past a nul byte. */
951 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
953 /* Set up to clean up after our imminent demise */
954 #define PREPARE_TO_DIE \
957 SAVEFREESV(RExC_rx_sv); \
958 if (RExC_open_parens) \
959 SAVEFREEPV(RExC_open_parens); \
960 if (RExC_close_parens) \
961 SAVEFREEPV(RExC_close_parens); \
965 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
966 * arg. Show regex, up to a maximum length. If it's too long, chop and add
969 #define _FAIL(code) STMT_START { \
970 const char *ellipses = ""; \
971 IV len = RExC_precomp_end - RExC_precomp; \
974 if (len > RegexLengthToShowInErrorMessages) { \
975 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
976 len = RegexLengthToShowInErrorMessages - 10; \
982 #define FAIL(msg) _FAIL( \
983 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
984 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
986 #define FAIL2(msg,arg) _FAIL( \
987 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
988 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
990 #define FAIL3(msg,arg1,arg2) _FAIL( \
991 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
992 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
995 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
997 #define Simple_vFAIL(m) STMT_START { \
998 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
999 m, REPORT_LOCATION_ARGS(RExC_parse)); \
1003 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
1005 #define vFAIL(m) STMT_START { \
1011 * Like Simple_vFAIL(), but accepts two arguments.
1013 #define Simple_vFAIL2(m,a1) STMT_START { \
1014 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
1015 REPORT_LOCATION_ARGS(RExC_parse)); \
1019 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
1021 #define vFAIL2(m,a1) STMT_START { \
1023 Simple_vFAIL2(m, a1); \
1028 * Like Simple_vFAIL(), but accepts three arguments.
1030 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
1031 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
1032 REPORT_LOCATION_ARGS(RExC_parse)); \
1036 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
1038 #define vFAIL3(m,a1,a2) STMT_START { \
1040 Simple_vFAIL3(m, a1, a2); \
1044 * Like Simple_vFAIL(), but accepts four arguments.
1046 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
1047 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
1048 REPORT_LOCATION_ARGS(RExC_parse)); \
1051 #define vFAIL4(m,a1,a2,a3) STMT_START { \
1053 Simple_vFAIL4(m, a1, a2, a3); \
1056 /* A specialized version of vFAIL2 that works with UTF8f */
1057 #define vFAIL2utf8f(m, a1) STMT_START { \
1059 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
1060 REPORT_LOCATION_ARGS(RExC_parse)); \
1063 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
1065 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
1066 REPORT_LOCATION_ARGS(RExC_parse)); \
1069 /* Setting this to NULL is a signal to not output warnings */
1070 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
1072 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
1073 RExC_copy_start_in_constructed = NULL; \
1075 #define RESTORE_WARNINGS \
1076 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
1078 /* Since a warning can be generated multiple times as the input is reparsed, we
1079 * output it the first time we come to that point in the parse, but suppress it
1080 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
1081 * generate any warnings */
1082 #define TO_OUTPUT_WARNINGS(loc) \
1083 ( RExC_copy_start_in_constructed \
1084 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
1086 /* After we've emitted a warning, we save the position in the input so we don't
1087 * output it again */
1088 #define UPDATE_WARNINGS_LOC(loc) \
1090 if (TO_OUTPUT_WARNINGS(loc)) { \
1091 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
1096 /* 'warns' is the output of the packWARNx macro used in 'code' */
1097 #define _WARN_HELPER(loc, warns, code) \
1099 if (! RExC_copy_start_in_constructed) { \
1100 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
1101 " expected at '%s'", \
1102 __FILE__, __LINE__, loc); \
1104 if (TO_OUTPUT_WARNINGS(loc)) { \
1105 if (ckDEAD(warns)) \
1108 UPDATE_WARNINGS_LOC(loc); \
1112 /* m is not necessarily a "literal string", in this macro */
1113 #define warn_non_literal_string(loc, packed_warn, m) \
1114 _WARN_HELPER(loc, packed_warn, \
1115 Perl_warner(aTHX_ packed_warn, \
1116 "%s" REPORT_LOCATION, \
1117 m, REPORT_LOCATION_ARGS(loc)))
1118 #define reg_warn_non_literal_string(loc, m) \
1119 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
1121 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
1124 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
1125 Newx(format, format_size, char); \
1126 my_strlcpy(format, m, format_size); \
1127 my_strlcat(format, REPORT_LOCATION, format_size); \
1128 SAVEFREEPV(format); \
1129 _WARN_HELPER(loc, packwarn, \
1130 Perl_ck_warner(aTHX_ packwarn, \
1132 a1, REPORT_LOCATION_ARGS(loc))); \
1135 #define ckWARNreg(loc,m) \
1136 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1137 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1138 m REPORT_LOCATION, \
1139 REPORT_LOCATION_ARGS(loc)))
1141 #define vWARN(loc, m) \
1142 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1143 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1144 m REPORT_LOCATION, \
1145 REPORT_LOCATION_ARGS(loc))) \
1147 #define vWARN_dep(loc, m) \
1148 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
1149 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
1150 m REPORT_LOCATION, \
1151 REPORT_LOCATION_ARGS(loc)))
1153 #define ckWARNdep(loc,m) \
1154 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
1155 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
1156 m REPORT_LOCATION, \
1157 REPORT_LOCATION_ARGS(loc)))
1159 #define ckWARNregdep(loc,m) \
1160 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1161 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1163 m REPORT_LOCATION, \
1164 REPORT_LOCATION_ARGS(loc)))
1166 #define ckWARN2reg_d(loc,m, a1) \
1167 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1168 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1169 m REPORT_LOCATION, \
1170 a1, REPORT_LOCATION_ARGS(loc)))
1172 #define ckWARN2reg(loc, m, a1) \
1173 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1174 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1175 m REPORT_LOCATION, \
1176 a1, REPORT_LOCATION_ARGS(loc)))
1178 #define vWARN3(loc, m, a1, a2) \
1179 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1180 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1181 m REPORT_LOCATION, \
1182 a1, a2, REPORT_LOCATION_ARGS(loc)))
1184 #define ckWARN3reg(loc, m, a1, a2) \
1185 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1186 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1187 m REPORT_LOCATION, \
1189 REPORT_LOCATION_ARGS(loc)))
1191 #define vWARN4(loc, m, a1, a2, a3) \
1192 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1193 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1194 m REPORT_LOCATION, \
1196 REPORT_LOCATION_ARGS(loc)))
1198 #define ckWARN4reg(loc, m, a1, a2, a3) \
1199 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1200 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1201 m REPORT_LOCATION, \
1203 REPORT_LOCATION_ARGS(loc)))
1205 #define vWARN5(loc, m, a1, a2, a3, a4) \
1206 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1207 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1208 m REPORT_LOCATION, \
1210 REPORT_LOCATION_ARGS(loc)))
1212 #define ckWARNexperimental(loc, class, m) \
1214 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1215 RExC_warned_ ## class = 1; \
1216 _WARN_HELPER(loc, packWARN(class), \
1217 Perl_ck_warner_d(aTHX_ packWARN(class), \
1218 m REPORT_LOCATION, \
1219 REPORT_LOCATION_ARGS(loc)));\
1223 #define ckWARNexperimental_with_arg(loc, class, m, arg) \
1225 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1226 RExC_warned_ ## class = 1; \
1227 _WARN_HELPER(loc, packWARN(class), \
1228 Perl_ck_warner_d(aTHX_ packWARN(class), \
1229 m REPORT_LOCATION, \
1230 arg, REPORT_LOCATION_ARGS(loc)));\
1234 /* Convert between a pointer to a node and its offset from the beginning of the
1236 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1237 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \
1238 (SSize_t) ((node) - RExC_emit_start))
1240 #define ProgLen(ri) ri->proglen
1241 #define SetProgLen(ri,x) ri->proglen = x
1243 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1244 #define EXPERIMENTAL_INPLACESCAN
1245 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1248 S_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
1250 PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
1252 /* As the name says. The zeroth bit corresponds to the code point given by
1257 Zero(bitmap, len, U8);
1259 invlist_iterinit(invlist);
1260 while (invlist_iternext(invlist, &start, &end)) {
1261 assert(start >= offset);
1263 for (UV i = start; i <= end; i++) {
1264 UV adjusted = i - offset;
1266 BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
1269 invlist_iterfinish(invlist);
1273 S_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
1275 PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
1277 /* As the name says. The zeroth bit corresponds to the code point given by
1282 for (i = 0; i < bitmap_len; i++) {
1283 if (BITMAP_TEST(bitmap, i)) {
1286 /* Save a little work by adding a range all at once instead of bit
1288 while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
1292 *invlist = _add_range_to_invlist(*invlist,
1301 Perl_re_printf(pTHX_ const char *fmt, ...)
1305 PerlIO *f= Perl_debug_log;
1306 PERL_ARGS_ASSERT_RE_PRINTF;
1308 result = PerlIO_vprintf(f, fmt, ap);
1314 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1318 PerlIO *f= Perl_debug_log;
1319 PERL_ARGS_ASSERT_RE_INDENTF;
1320 va_start(ap, depth);
1321 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1322 result = PerlIO_vprintf(f, fmt, ap);
1326 #endif /* DEBUGGING */
1328 #define DEBUG_RExC_seen() \
1329 DEBUG_OPTIMISE_MORE_r({ \
1330 Perl_re_printf( aTHX_ "RExC_seen: "); \
1332 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1333 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1335 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1336 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1338 if (RExC_seen & REG_GPOS_SEEN) \
1339 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1341 if (RExC_seen & REG_RECURSE_SEEN) \
1342 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1344 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1345 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1347 if (RExC_seen & REG_VERBARG_SEEN) \
1348 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1350 if (RExC_seen & REG_CUTGROUP_SEEN) \
1351 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1353 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1354 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1356 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1357 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1359 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1360 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1362 Perl_re_printf( aTHX_ "\n"); \
1365 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1366 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1371 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1372 const char *close_str)
1377 Perl_re_printf( aTHX_ "%s", open_str);
1378 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1379 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1380 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1381 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1382 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1383 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1384 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1385 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1386 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1387 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1388 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1389 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1390 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1391 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1392 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1393 Perl_re_printf( aTHX_ "%s", close_str);
1398 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1399 U32 depth, int is_inf,
1400 SSize_t min, SSize_t stopmin, SSize_t delta)
1402 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1404 DEBUG_OPTIMISE_MORE_r({
1407 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1410 min, stopmin, delta,
1412 (IV)data->pos_delta,
1416 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1418 Perl_re_printf( aTHX_
1419 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1421 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1422 is_inf ? "INF " : ""
1425 if (data->last_found) {
1427 Perl_re_printf(aTHX_
1428 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1429 SvPVX_const(data->last_found),
1431 (IV)data->last_start_min,
1432 (IV)data->last_start_max
1435 for (i = 0; i < 2; i++) {
1436 Perl_re_printf(aTHX_
1437 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1438 data->cur_is_floating == i ? "*" : "",
1439 i ? "Float" : "Fixed",
1440 SvPVX_const(data->substrs[i].str),
1441 (IV)data->substrs[i].min_offset,
1442 (IV)data->substrs[i].max_offset
1444 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1448 Perl_re_printf( aTHX_ "\n");
1454 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1455 regnode *scan, U32 depth, U32 flags)
1457 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1464 Next = regnext(scan);
1465 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1466 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1469 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1470 Next ? (REG_NODE_NUM(Next)) : 0 );
1471 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1472 Perl_re_printf( aTHX_ "\n");
1477 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
1478 S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta)
1480 # define DEBUG_PEEP(str, scan, depth, flags) \
1481 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1484 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
1485 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1489 /* =========================================================
1490 * BEGIN edit_distance stuff.
1492 * This calculates how many single character changes of any type are needed to
1493 * transform a string into another one. It is taken from version 3.1 of
1495 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1498 /* Our unsorted dictionary linked list. */
1499 /* Note we use UVs, not chars. */
1504 struct dictionary* next;
1506 typedef struct dictionary item;
1509 PERL_STATIC_INLINE item*
1510 push(UV key, item* curr)
1513 Newx(head, 1, item);
1521 PERL_STATIC_INLINE item*
1522 find(item* head, UV key)
1524 item* iterator = head;
1526 if (iterator->key == key){
1529 iterator = iterator->next;
1535 PERL_STATIC_INLINE item*
1536 uniquePush(item* head, UV key)
1538 item* iterator = head;
1541 if (iterator->key == key) {
1544 iterator = iterator->next;
1547 return push(key, head);
1550 PERL_STATIC_INLINE void
1551 dict_free(item* head)
1553 item* iterator = head;
1556 item* temp = iterator;
1557 iterator = iterator->next;
1564 /* End of Dictionary Stuff */
1566 /* All calculations/work are done here */
1568 S_edit_distance(const UV* src,
1570 const STRLEN x, /* length of src[] */
1571 const STRLEN y, /* length of tgt[] */
1572 const SSize_t maxDistance
1576 UV swapCount, swapScore, targetCharCount, i, j;
1578 UV score_ceil = x + y;
1580 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1582 /* intialize matrix start values */
1583 Newx(scores, ( (x + 2) * (y + 2)), UV);
1584 scores[0] = score_ceil;
1585 scores[1 * (y + 2) + 0] = score_ceil;
1586 scores[0 * (y + 2) + 1] = score_ceil;
1587 scores[1 * (y + 2) + 1] = 0;
1588 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1593 for (i=1;i<=x;i++) {
1595 head = uniquePush(head, src[i]);
1596 scores[(i+1) * (y + 2) + 1] = i;
1597 scores[(i+1) * (y + 2) + 0] = score_ceil;
1600 for (j=1;j<=y;j++) {
1603 head = uniquePush(head, tgt[j]);
1604 scores[1 * (y + 2) + (j + 1)] = j;
1605 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1608 targetCharCount = find(head, tgt[j-1])->value;
1609 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1611 if (src[i-1] != tgt[j-1]){
1612 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1616 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1620 find(head, src[i-1])->value = i;
1624 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1627 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1631 /* END of edit_distance() stuff
1632 * ========================================================= */
1634 /* Mark that we cannot extend a found fixed substring at this point.
1635 Update the longest found anchored substring or the longest found
1636 floating substrings if needed. */
1639 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1640 SSize_t *minlenp, int is_inf)
1642 const STRLEN l = CHR_SVLEN(data->last_found);
1643 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1644 const STRLEN old_l = CHR_SVLEN(longest_sv);
1645 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1647 PERL_ARGS_ASSERT_SCAN_COMMIT;
1649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1650 const U8 i = data->cur_is_floating;
1651 SvSetMagicSV(longest_sv, data->last_found);
1652 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1655 data->substrs[0].max_offset = data->substrs[0].min_offset;
1657 data->substrs[1].max_offset =
1661 ? data->last_start_max
1662 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1664 : data->pos_min + data->pos_delta));
1667 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1668 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1669 data->substrs[i].minlenp = minlenp;
1670 data->substrs[i].lookbehind = 0;
1673 SvCUR_set(data->last_found, 0);
1675 SV * const sv = data->last_found;
1676 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1677 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1682 data->last_end = -1;
1683 data->flags &= ~SF_BEFORE_EOL;
1684 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
1687 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1688 * list that describes which code points it matches */
1691 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1693 /* Set the SSC 'ssc' to match an empty string or any code point */
1695 PERL_ARGS_ASSERT_SSC_ANYTHING;
1697 assert(is_ANYOF_SYNTHETIC(ssc));
1699 /* mortalize so won't leak */
1700 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1701 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1705 S_ssc_is_anything(const regnode_ssc *ssc)
1707 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1708 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1709 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1710 * in any way, so there's no point in using it */
1712 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
1715 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1717 assert(is_ANYOF_SYNTHETIC(ssc));
1719 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1723 /* See if the list consists solely of the range 0 - Infinity */
1724 invlist_iterinit(ssc->invlist);
1725 ret = invlist_iternext(ssc->invlist, &start, &end)
1729 invlist_iterfinish(ssc->invlist);
1735 /* If e.g., both \w and \W are set, matches everything */
1736 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1738 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1739 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1749 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1751 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1752 * string, any code point, or any posix class under locale */
1754 PERL_ARGS_ASSERT_SSC_INIT;
1756 Zero(ssc, 1, regnode_ssc);
1757 set_ANYOF_SYNTHETIC(ssc);
1758 ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
1761 /* If any portion of the regex is to operate under locale rules that aren't
1762 * fully known at compile time, initialization includes it. The reason
1763 * this isn't done for all regexes is that the optimizer was written under
1764 * the assumption that locale was all-or-nothing. Given the complexity and
1765 * lack of documentation in the optimizer, and that there are inadequate
1766 * test cases for locale, many parts of it may not work properly, it is
1767 * safest to avoid locale unless necessary. */
1768 if (RExC_contains_locale) {
1769 ANYOF_POSIXL_SETALL(ssc);
1772 ANYOF_POSIXL_ZERO(ssc);
1777 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1778 const regnode_ssc *ssc)
1780 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1781 * to the list of code points matched, and locale posix classes; hence does
1782 * not check its flags) */
1784 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
1787 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1789 assert(is_ANYOF_SYNTHETIC(ssc));
1791 invlist_iterinit(ssc->invlist);
1792 ret = invlist_iternext(ssc->invlist, &start, &end)
1796 invlist_iterfinish(ssc->invlist);
1802 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1809 #define INVLIST_INDEX 0
1810 #define ONLY_LOCALE_MATCHES_INDEX 1
1811 #define DEFERRED_USER_DEFINED_INDEX 2
1814 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1815 const regnode_charclass* const node)
1817 /* Returns a mortal inversion list defining which code points are matched
1818 * by 'node', which is of ANYOF-ish type . Handles complementing the
1819 * result if appropriate. If some code points aren't knowable at this
1820 * time, the returned list must, and will, contain every code point that is
1824 SV* only_utf8_locale_invlist = NULL;
1825 bool new_node_has_latin1 = FALSE;
1826 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
1830 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1832 /* Look at the data structure created by S_set_ANYOF_arg() */
1833 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
1834 invlist = sv_2mortal(_new_invlist(1));
1835 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
1837 else if (ANYOF_HAS_AUX(node)) {
1838 const U32 n = ARG(node);
1839 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1840 AV * const av = MUTABLE_AV(SvRV(rv));
1841 SV **const ary = AvARRAY(av);
1843 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1845 /* Here there are things that won't be known until runtime -- we
1846 * have to assume it could be anything */
1847 invlist = sv_2mortal(_new_invlist(1));
1848 return _add_range_to_invlist(invlist, 0, UV_MAX);
1850 else if (ary[INVLIST_INDEX]) {
1852 /* Use the node's inversion list */
1853 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1856 /* Get the code points valid only under UTF-8 locales */
1857 if ( (flags & ANYOFL_FOLD)
1858 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1860 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1865 invlist = sv_2mortal(_new_invlist(0));
1868 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1869 * code points, and an inversion list for the others, but if there are code
1870 * points that should match only conditionally on the target string being
1871 * UTF-8, those are placed in the inversion list, and not the bitmap.
1872 * Since there are circumstances under which they could match, they are
1873 * included in the SSC. But if the ANYOF node is to be inverted, we have
1874 * to exclude them here, so that when we invert below, the end result
1875 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1876 * have to do this here before we add the unconditionally matched code
1878 if (flags & ANYOF_INVERT) {
1879 _invlist_intersection_complement_2nd(invlist,
1884 /* Add in the points from the bit map */
1885 if (REGNODE_TYPE(OP(node)) == ANYOF){
1886 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1887 if (ANYOF_BITMAP_TEST(node, i)) {
1888 unsigned int start = i++;
1890 for (; i < NUM_ANYOF_CODE_POINTS
1891 && ANYOF_BITMAP_TEST(node, i); ++i)
1895 invlist = _add_range_to_invlist(invlist, start, i-1);
1896 new_node_has_latin1 = TRUE;
1901 /* If this can match all upper Latin1 code points, have to add them
1902 * as well. But don't add them if inverting, as when that gets done below,
1903 * it would exclude all these characters, including the ones it shouldn't
1904 * that were added just above */
1905 if ( ! (flags & ANYOF_INVERT)
1906 && OP(node) == ANYOFD
1907 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
1909 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1912 /* Similarly for these */
1913 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
1914 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1917 if (flags & ANYOF_INVERT) {
1918 _invlist_invert(invlist);
1920 else if (flags & ANYOFL_FOLD) {
1921 if (new_node_has_latin1) {
1923 /* These folds are potential in Turkic locales */
1924 if (_invlist_contains_cp(invlist, 'i')) {
1925 invlist = add_cp_to_invlist(invlist,
1926 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1928 if (_invlist_contains_cp(invlist, 'I')) {
1929 invlist = add_cp_to_invlist(invlist,
1930 LATIN_SMALL_LETTER_DOTLESS_I);
1933 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1934 * the locale. We can skip this if there are no 0-255 at all. */
1935 _invlist_union(invlist, PL_Latin1, &invlist);
1938 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1939 invlist = add_cp_to_invlist(invlist, 'I');
1941 if (_invlist_contains_cp(invlist,
1942 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1944 invlist = add_cp_to_invlist(invlist, 'i');
1949 /* Similarly add the UTF-8 locale possible matches. These have to be
1950 * deferred until after the non-UTF-8 locale ones are taken care of just
1951 * above, or it leads to wrong results under ANYOF_INVERT */
1952 if (only_utf8_locale_invlist) {
1953 _invlist_union_maybe_complement_2nd(invlist,
1954 only_utf8_locale_invlist,
1955 flags & ANYOF_INVERT,
1962 /* These two functions currently do the exact same thing */
1963 #define ssc_init_zero ssc_init
1965 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1966 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1968 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1969 * should not be inverted. */
1972 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1973 const regnode_charclass *and_with)
1975 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1976 * another SSC or a regular ANYOF class. Can create false positives. */
1979 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
1980 ? ANYOF_FLAGS(and_with)
1984 PERL_ARGS_ASSERT_SSC_AND;
1986 assert(is_ANYOF_SYNTHETIC(ssc));
1988 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1989 * the code point inversion list and just the relevant flags */
1990 if (is_ANYOF_SYNTHETIC(and_with)) {
1991 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1992 anded_flags = and_with_flags;
1994 /* XXX This is a kludge around what appears to be deficiencies in the
1995 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1996 * there are paths through the optimizer where it doesn't get weeded
1997 * out when it should. And if we don't make some extra provision for
1998 * it like the code just below, it doesn't get added when it should.
1999 * This solution is to add it only when AND'ing, which is here, and
2000 * only when what is being AND'ed is the pristine, original node
2001 * matching anything. Thus it is like adding it to ssc_anything() but
2002 * only when the result is to be AND'ed. Probably the same solution
2003 * could be adopted for the same problem we have with /l matching,
2004 * which is solved differently in S_ssc_init(), and that would lead to
2005 * fewer false positives than that solution has. But if this solution
2006 * creates bugs, the consequences are only that a warning isn't raised
2007 * that should be; while the consequences for having /l bugs is
2008 * incorrect matches */
2009 if (ssc_is_anything((regnode_ssc *)and_with)) {
2010 anded_flags |= ANYOF_WARN_SUPER__shared;
2014 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
2015 if (OP(and_with) == ANYOFD) {
2016 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
2019 anded_flags = and_with_flags
2020 & ( ANYOF_COMMON_FLAGS
2021 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2022 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
2023 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
2024 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
2029 ANYOF_FLAGS(ssc) &= anded_flags;
2031 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2032 * C2 is the list of code points in 'and-with'; P2, its posix classes.
2033 * 'and_with' may be inverted. When not inverted, we have the situation of
2035 * (C1 | P1) & (C2 | P2)
2036 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
2037 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
2038 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
2039 * <= ((C1 & C2) | P1 | P2)
2040 * Alternatively, the last few steps could be:
2041 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
2042 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
2043 * <= (C1 | C2 | (P1 & P2))
2044 * We favor the second approach if either P1 or P2 is non-empty. This is
2045 * because these components are a barrier to doing optimizations, as what
2046 * they match cannot be known until the moment of matching as they are
2047 * dependent on the current locale, 'AND"ing them likely will reduce or
2049 * But we can do better if we know that C1,P1 are in their initial state (a
2050 * frequent occurrence), each matching everything:
2051 * (<everything>) & (C2 | P2) = C2 | P2
2052 * Similarly, if C2,P2 are in their initial state (again a frequent
2053 * occurrence), the result is a no-op
2054 * (C1 | P1) & (<everything>) = C1 | P1
2057 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
2058 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
2059 * <= (C1 & ~C2) | (P1 & ~P2)
2062 if ((and_with_flags & ANYOF_INVERT)
2063 && ! is_ANYOF_SYNTHETIC(and_with))
2067 ssc_intersection(ssc,
2069 FALSE /* Has already been inverted */
2072 /* If either P1 or P2 is empty, the intersection will be also; can skip
2074 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
2075 ANYOF_POSIXL_ZERO(ssc);
2077 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2079 /* Note that the Posix class component P from 'and_with' actually
2081 * P = Pa | Pb | ... | Pn
2082 * where each component is one posix class, such as in [\w\s].
2084 * ~P = ~(Pa | Pb | ... | Pn)
2085 * = ~Pa & ~Pb & ... & ~Pn
2086 * <= ~Pa | ~Pb | ... | ~Pn
2087 * The last is something we can easily calculate, but unfortunately
2088 * is likely to have many false positives. We could do better
2089 * in some (but certainly not all) instances if two classes in
2090 * P have known relationships. For example
2091 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
2093 * :lower: & :print: = :lower:
2094 * And similarly for classes that must be disjoint. For example,
2095 * since \s and \w can have no elements in common based on rules in
2096 * the POSIX standard,
2097 * \w & ^\S = nothing
2098 * Unfortunately, some vendor locales do not meet the Posix
2099 * standard, in particular almost everything by Microsoft.
2100 * The loop below just changes e.g., \w into \W and vice versa */
2102 regnode_charclass_posixl temp;
2103 int add = 1; /* To calculate the index of the complement */
2105 Zero(&temp, 1, regnode_charclass_posixl);
2106 ANYOF_POSIXL_ZERO(&temp);
2107 for (i = 0; i < ANYOF_MAX; i++) {
2109 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
2110 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
2112 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
2113 ANYOF_POSIXL_SET(&temp, i + add);
2115 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
2117 ANYOF_POSIXL_AND(&temp, ssc);
2119 } /* else ssc already has no posixes */
2120 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
2121 in its initial state */
2122 else if (! is_ANYOF_SYNTHETIC(and_with)
2123 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
2125 /* But if 'ssc' is in its initial state, the result is just 'and_with';
2126 * copy it over 'ssc' */
2127 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
2128 if (is_ANYOF_SYNTHETIC(and_with)) {
2129 StructCopy(and_with, ssc, regnode_ssc);
2132 ssc->invlist = anded_cp_list;
2133 ANYOF_POSIXL_ZERO(ssc);
2134 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
2135 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
2139 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
2140 || (and_with_flags & ANYOF_MATCHES_POSIXL))
2142 /* One or the other of P1, P2 is non-empty. */
2143 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
2144 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
2146 ssc_union(ssc, anded_cp_list, FALSE);
2148 else { /* P1 = P2 = empty */
2149 ssc_intersection(ssc, anded_cp_list, FALSE);
2155 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
2156 const regnode_charclass *or_with)
2158 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
2159 * another SSC or a regular ANYOF class. Can create false positives if
2160 * 'or_with' is to be inverted. */
2164 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
2165 ? ANYOF_FLAGS(or_with)
2168 PERL_ARGS_ASSERT_SSC_OR;
2170 assert(is_ANYOF_SYNTHETIC(ssc));
2172 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2173 * the code point inversion list and just the relevant flags */
2174 if (is_ANYOF_SYNTHETIC(or_with)) {
2175 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2176 ored_flags = or_with_flags;
2179 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2180 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2181 if (OP(or_with) != ANYOFD) {
2183 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2184 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
2185 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
2186 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
2191 ANYOF_FLAGS(ssc) |= ored_flags;
2193 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2194 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2195 * 'or_with' may be inverted. When not inverted, we have the simple
2196 * situation of computing:
2197 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2198 * If P1|P2 yields a situation with both a class and its complement are
2199 * set, like having both \w and \W, this matches all code points, and we
2200 * can delete these from the P component of the ssc going forward. XXX We
2201 * might be able to delete all the P components, but I (khw) am not certain
2202 * about this, and it is better to be safe.
2205 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2206 * <= (C1 | P1) | ~C2
2207 * <= (C1 | ~C2) | P1
2208 * (which results in actually simpler code than the non-inverted case)
2211 if ((or_with_flags & ANYOF_INVERT)
2212 && ! is_ANYOF_SYNTHETIC(or_with))
2214 /* We ignore P2, leaving P1 going forward */
2215 } /* else Not inverted */
2216 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2217 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2218 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2220 for (i = 0; i < ANYOF_MAX; i += 2) {
2221 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2223 ssc_match_all_cp(ssc);
2224 ANYOF_POSIXL_CLEAR(ssc, i);
2225 ANYOF_POSIXL_CLEAR(ssc, i+1);
2233 FALSE /* Already has been inverted */
2238 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2240 PERL_ARGS_ASSERT_SSC_UNION;
2242 assert(is_ANYOF_SYNTHETIC(ssc));
2244 _invlist_union_maybe_complement_2nd(ssc->invlist,
2251 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2253 const bool invert2nd)
2255 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2257 assert(is_ANYOF_SYNTHETIC(ssc));
2259 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2266 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2268 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2270 assert(is_ANYOF_SYNTHETIC(ssc));
2272 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2276 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2278 /* AND just the single code point 'cp' into the SSC 'ssc' */
2280 SV* cp_list = _new_invlist(2);
2282 PERL_ARGS_ASSERT_SSC_CP_AND;
2284 assert(is_ANYOF_SYNTHETIC(ssc));
2286 cp_list = add_cp_to_invlist(cp_list, cp);
2287 ssc_intersection(ssc, cp_list,
2288 FALSE /* Not inverted */
2290 SvREFCNT_dec_NN(cp_list);
2294 S_ssc_clear_locale(regnode_ssc *ssc)
2296 /* Set the SSC 'ssc' to not match any locale things */
2297 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2299 assert(is_ANYOF_SYNTHETIC(ssc));
2301 ANYOF_POSIXL_ZERO(ssc);
2302 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2306 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2308 /* The synthetic start class is used to hopefully quickly winnow down
2309 * places where a pattern could start a match in the target string. If it
2310 * doesn't really narrow things down that much, there isn't much point to
2311 * having the overhead of using it. This function uses some very crude
2312 * heuristics to decide if to use the ssc or not.
2314 * It returns TRUE if 'ssc' rules out more than half what it considers to
2315 * be the "likely" possible matches, but of course it doesn't know what the
2316 * actual things being matched are going to be; these are only guesses
2318 * For /l matches, it assumes that the only likely matches are going to be
2319 * in the 0-255 range, uniformly distributed, so half of that is 127
2320 * For /a and /d matches, it assumes that the likely matches will be just
2321 * the ASCII range, so half of that is 63
2322 * For /u and there isn't anything matching above the Latin1 range, it
2323 * assumes that that is the only range likely to be matched, and uses
2324 * half that as the cut-off: 127. If anything matches above Latin1,
2325 * it assumes that all of Unicode could match (uniformly), except for
2326 * non-Unicode code points and things in the General Category "Other"
2327 * (unassigned, private use, surrogates, controls and formats). This
2328 * is a much large number. */
2330 U32 count = 0; /* Running total of number of code points matched by
2332 UV start, end; /* Start and end points of current range in inversion
2333 XXX outdated. UTF-8 locales are common, what about invert? list */
2334 const U32 max_code_points = (LOC)
2336 : (( ! UNI_SEMANTICS
2337 || invlist_highest(ssc->invlist) < 256)
2340 const U32 max_match = max_code_points / 2;
2342 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2344 invlist_iterinit(ssc->invlist);
2345 while (invlist_iternext(ssc->invlist, &start, &end)) {
2346 if (start >= max_code_points) {
2349 end = MIN(end, max_code_points - 1);
2350 count += end - start + 1;
2351 if (count >= max_match) {
2352 invlist_iterfinish(ssc->invlist);
2362 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2364 /* The inversion list in the SSC is marked mortal; now we need a more
2365 * permanent copy, which is stored the same way that is done in a regular
2366 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2369 SV* invlist = invlist_clone(ssc->invlist, NULL);
2371 PERL_ARGS_ASSERT_SSC_FINALIZE;
2373 assert(is_ANYOF_SYNTHETIC(ssc));
2375 /* The code in this file assumes that all but these flags aren't relevant
2376 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2377 * by the time we reach here */
2378 assert(! (ANYOF_FLAGS(ssc)
2379 & ~( ANYOF_COMMON_FLAGS
2380 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2381 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
2383 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
2385 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2386 SvREFCNT_dec(invlist);
2388 /* Make sure is clone-safe */
2389 ssc->invlist = NULL;
2391 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2392 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2393 OP(ssc) = ANYOFPOSIXL;
2395 else if (RExC_contains_locale) {
2399 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2402 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2403 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2404 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2405 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2406 ? (TRIE_LIST_CUR( idx ) - 1) \
2412 dump_trie(trie,widecharmap,revcharmap)
2413 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2414 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2416 These routines dump out a trie in a somewhat readable format.
2417 The _interim_ variants are used for debugging the interim
2418 tables that are used to generate the final compressed
2419 representation which is what dump_trie expects.
2421 Part of the reason for their existence is to provide a form
2422 of documentation as to how the different representations function.
2427 Dumps the final compressed table form of the trie to Perl_debug_log.
2428 Used for debugging make_trie().
2432 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2433 AV *revcharmap, U32 depth)
2436 SV *sv=sv_newmortal();
2437 int colwidth= widecharmap ? 6 : 4;
2439 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2441 PERL_ARGS_ASSERT_DUMP_TRIE;
2443 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2444 depth+1, "Match","Base","Ofs" );
2446 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2447 SV ** const tmp = av_fetch( revcharmap, state, 0);
2449 Perl_re_printf( aTHX_ "%*s",
2451 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2452 PL_colors[0], PL_colors[1],
2453 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2454 PERL_PV_ESCAPE_FIRSTCHAR
2459 Perl_re_printf( aTHX_ "\n");
2460 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2462 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2463 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2464 Perl_re_printf( aTHX_ "\n");
2466 for( state = 1 ; state < trie->statecount ; state++ ) {
2467 const U32 base = trie->states[ state ].trans.base;
2469 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2471 if ( trie->states[ state ].wordnum ) {
2472 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2474 Perl_re_printf( aTHX_ "%6s", "" );
2477 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2482 while( ( base + ofs < trie->uniquecharcount ) ||
2483 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2484 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2488 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2490 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2491 if ( ( base + ofs >= trie->uniquecharcount )
2492 && ( base + ofs - trie->uniquecharcount
2494 && trie->trans[ base + ofs
2495 - trie->uniquecharcount ].check == state )
2497 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2498 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2501 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2505 Perl_re_printf( aTHX_ "]");
2508 Perl_re_printf( aTHX_ "\n" );
2510 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2512 for (word=1; word <= trie->wordcount; word++) {
2513 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2514 (int)word, (int)(trie->wordinfo[word].prev),
2515 (int)(trie->wordinfo[word].len));
2517 Perl_re_printf( aTHX_ "\n" );
2520 Dumps a fully constructed but uncompressed trie in list form.
2521 List tries normally only are used for construction when the number of
2522 possible chars (trie->uniquecharcount) is very high.
2523 Used for debugging make_trie().
2526 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2527 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2531 SV *sv=sv_newmortal();
2532 int colwidth= widecharmap ? 6 : 4;
2533 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2535 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2537 /* print out the table precompression. */
2538 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2540 Perl_re_indentf( aTHX_ "%s",
2541 depth+1, "------:-----+-----------------\n" );
2543 for( state=1 ; state < next_alloc ; state ++ ) {
2546 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2547 depth+1, (UV)state );
2548 if ( ! trie->states[ state ].wordnum ) {
2549 Perl_re_printf( aTHX_ "%5s| ","");
2551 Perl_re_printf( aTHX_ "W%4x| ",
2552 trie->states[ state ].wordnum
2555 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2556 SV ** const tmp = av_fetch( revcharmap,
2557 TRIE_LIST_ITEM(state, charid).forid, 0);
2559 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2561 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2563 PL_colors[0], PL_colors[1],
2564 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2565 | PERL_PV_ESCAPE_FIRSTCHAR
2567 TRIE_LIST_ITEM(state, charid).forid,
2568 (UV)TRIE_LIST_ITEM(state, charid).newstate
2571 Perl_re_printf( aTHX_ "\n%*s| ",
2572 (int)((depth * 2) + 14), "");
2575 Perl_re_printf( aTHX_ "\n");
2580 Dumps a fully constructed but uncompressed trie in table form.
2581 This is the normal DFA style state transition table, with a few
2582 twists to facilitate compression later.
2583 Used for debugging make_trie().
2586 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2587 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2592 SV *sv=sv_newmortal();
2593 int colwidth= widecharmap ? 6 : 4;
2594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2596 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2599 print out the table precompression so that we can do a visual check
2600 that they are identical.
2603 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2605 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2606 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2608 Perl_re_printf( aTHX_ "%*s",
2610 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2611 PL_colors[0], PL_colors[1],
2612 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2613 PERL_PV_ESCAPE_FIRSTCHAR
2619 Perl_re_printf( aTHX_ "\n");
2620 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2622 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2623 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2626 Perl_re_printf( aTHX_ "\n" );
2628 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2630 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2632 (UV)TRIE_NODENUM( state ) );
2634 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2635 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2637 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2639 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2641 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2642 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2643 (UV)trie->trans[ state ].check );
2645 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2646 (UV)trie->trans[ state ].check,
2647 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2655 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2656 startbranch: the first branch in the whole branch sequence
2657 first : start branch of sequence of branch-exact nodes.
2658 May be the same as startbranch
2659 last : Thing following the last branch.
2660 May be the same as tail.
2661 tail : item following the branch sequence
2662 count : words in the sequence
2663 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2664 depth : indent depth
2666 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2668 A trie is an N'ary tree where the branches are determined by digital
2669 decomposition of the key. IE, at the root node you look up the 1st character and
2670 follow that branch repeat until you find the end of the branches. Nodes can be
2671 marked as "accepting" meaning they represent a complete word. Eg:
2675 would convert into the following structure. Numbers represent states, letters
2676 following numbers represent valid transitions on the letter from that state, if
2677 the number is in square brackets it represents an accepting state, otherwise it
2678 will be in parenthesis.
2680 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2684 (1) +-i->(6)-+-s->[7]
2686 +-s->(3)-+-h->(4)-+-e->[5]
2688 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2690 This shows that when matching against the string 'hers' we will begin at state 1
2691 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2692 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2693 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2694 single traverse. We store a mapping from accepting to state to which word was
2695 matched, and then when we have multiple possibilities we try to complete the
2696 rest of the regex in the order in which they occurred in the alternation.
2698 The only prior NFA like behaviour that would be changed by the TRIE support is
2699 the silent ignoring of duplicate alternations which are of the form:
2701 / (DUPE|DUPE) X? (?{ ... }) Y /x
2703 Thus EVAL blocks following a trie may be called a different number of times with
2704 and without the optimisation. With the optimisations dupes will be silently
2705 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2706 the following demonstrates:
2708 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2710 which prints out 'word' three times, but
2712 'words'=~/(word|word|word)(?{ print $1 })S/
2714 which doesnt print it out at all. This is due to other optimisations kicking in.
2716 Example of what happens on a structural level:
2718 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2720 1: CURLYM[1] {1,32767}(18)
2731 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2732 and should turn into:
2734 1: CURLYM[1] {1,32767}(18)
2736 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2744 Cases where tail != last would be like /(?foo|bar)baz/:
2754 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2755 and would end up looking like:
2758 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2765 d = uvchr_to_utf8_flags(d, uv, 0);
2767 is the recommended Unicode-aware way of saying
2772 #define TRIE_STORE_REVCHAR(val) \
2775 SV *zlopp = newSV(UTF8_MAXBYTES); \
2776 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2777 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2779 SvCUR_set(zlopp, kapow - flrbbbbb); \
2782 av_push(revcharmap, zlopp); \
2784 char ooooff = (char)val; \
2785 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2789 /* This gets the next character from the input, folding it if not already
2791 #define TRIE_READ_CHAR STMT_START { \
2794 /* if it is UTF then it is either already folded, or does not need \
2796 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2798 else if (folder == PL_fold_latin1) { \
2799 /* This folder implies Unicode rules, which in the range expressible \
2800 * by not UTF is the lower case, with the two exceptions, one of \
2801 * which should have been taken care of before calling this */ \
2802 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2803 uvc = toLOWER_L1(*uc); \
2804 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2807 /* raw data, will be folded later if needed */ \
2815 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2816 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2817 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2818 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2819 TRIE_LIST_LEN( state ) = ging; \
2821 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2823 TRIE_LIST_CUR( state )++; \
2826 #define TRIE_LIST_NEW(state) STMT_START { \
2827 Newx( trie->states[ state ].trans.list, \
2828 4, reg_trie_trans_le ); \
2829 TRIE_LIST_CUR( state ) = 1; \
2830 TRIE_LIST_LEN( state ) = 4; \
2833 #define TRIE_HANDLE_WORD(state) STMT_START { \
2834 U16 dupe= trie->states[ state ].wordnum; \
2835 regnode * const noper_next = regnext( noper ); \
2838 /* store the word for dumping */ \
2840 if (OP(noper) != NOTHING) \
2841 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2843 tmp = newSVpvn_utf8( "", 0, UTF ); \
2844 av_push( trie_words, tmp ); \
2848 trie->wordinfo[curword].prev = 0; \
2849 trie->wordinfo[curword].len = wordlen; \
2850 trie->wordinfo[curword].accept = state; \
2852 if ( noper_next < tail ) { \
2854 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2856 trie->jump[curword] = (U16)(noper_next - convert); \
2858 jumper = noper_next; \
2860 nextbranch= regnext(cur); \
2864 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2865 /* chain, so that when the bits of chain are later */\
2866 /* linked together, the dups appear in the chain */\
2867 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2868 trie->wordinfo[dupe].prev = curword; \
2870 /* we haven't inserted this word yet. */ \
2871 trie->states[ state ].wordnum = curword; \
2876 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2877 ( ( base + charid >= ucharcount \
2878 && base + charid < ubound \
2879 && state == trie->trans[ base - ucharcount + charid ].check \
2880 && trie->trans[ base - ucharcount + charid ].next ) \
2881 ? trie->trans[ base - ucharcount + charid ].next \
2882 : ( state==1 ? special : 0 ) \
2885 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2887 TRIE_BITMAP_SET(trie, uvc); \
2888 /* store the folded codepoint */ \
2890 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2893 /* store first byte of utf8 representation of */ \
2894 /* variant codepoints */ \
2895 if (! UVCHR_IS_INVARIANT(uvc)) { \
2896 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2901 #define MADE_JUMP_TRIE 2
2902 #define MADE_EXACT_TRIE 4
2905 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2906 regnode *first, regnode *last, regnode *tail,
2907 U32 word_count, U32 flags, U32 depth)
2909 /* first pass, loop through and scan words */
2910 reg_trie_data *trie;
2911 HV *widecharmap = NULL;
2912 AV *revcharmap = newAV();
2918 regnode *jumper = NULL;
2919 regnode *nextbranch = NULL;
2920 regnode *convert = NULL;
2921 U32 *prev_states; /* temp array mapping each state to previous one */
2922 /* we just use folder as a flag in utf8 */
2923 const U8 * folder = NULL;
2925 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2926 * which stands for one trie structure, one hash, optionally followed
2929 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2930 AV *trie_words = NULL;
2931 /* along with revcharmap, this only used during construction but both are
2932 * useful during debugging so we store them in the struct when debugging.
2935 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2936 STRLEN trie_charcount=0;
2938 SV *re_trie_maxbuff;
2939 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2941 PERL_ARGS_ASSERT_MAKE_TRIE;
2943 PERL_UNUSED_ARG(depth);
2947 case EXACT: case EXACT_REQ8: case EXACTL: break;
2951 case EXACTFLU8: folder = PL_fold_latin1; break;
2952 case EXACTF: folder = PL_fold; break;
2953 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
2956 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2958 trie->startstate = 1;
2959 trie->wordcount = word_count;
2960 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2961 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2962 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2963 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2964 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2965 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2968 trie_words = newAV();
2971 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2972 assert(re_trie_maxbuff);
2973 if (!SvIOK(re_trie_maxbuff)) {
2974 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2976 DEBUG_TRIE_COMPILE_r({
2977 Perl_re_indentf( aTHX_
2978 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2980 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2981 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2984 /* Find the node we are going to overwrite */
2985 if ( first == startbranch && OP( last ) != BRANCH ) {
2986 /* whole branch chain */
2989 /* branch sub-chain */
2990 convert = REGNODE_AFTER( first );
2993 /* -- First loop and Setup --
2995 We first traverse the branches and scan each word to determine if it
2996 contains widechars, and how many unique chars there are, this is
2997 important as we have to build a table with at least as many columns as we
3000 We use an array of integers to represent the character codes 0..255
3001 (trie->charmap) and we use a an HV* to store Unicode characters. We use
3002 the native representation of the character value as the key and IV's for
3005 *TODO* If we keep track of how many times each character is used we can
3006 remap the columns so that the table compression later on is more
3007 efficient in terms of memory by ensuring the most common value is in the
3008 middle and the least common are on the outside. IMO this would be better
3009 than a most to least common mapping as theres a decent chance the most
3010 common letter will share a node with the least common, meaning the node
3011 will not be compressible. With a middle is most common approach the worst
3012 case is when we have the least common nodes twice.
3016 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3017 regnode *noper = REGNODE_AFTER( cur );
3021 U32 wordlen = 0; /* required init */
3022 STRLEN minchars = 0;
3023 STRLEN maxchars = 0;
3024 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
3027 if (OP(noper) == NOTHING) {
3028 /* skip past a NOTHING at the start of an alternation
3029 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
3031 * If the next node is not something we are supposed to process
3032 * we will just ignore it due to the condition guarding the
3036 regnode *noper_next= regnext(noper);
3037 if (noper_next < tail)
3042 && ( OP(noper) == flags
3043 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3044 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3045 || OP(noper) == EXACTFUP))))
3047 uc= (U8*)STRING(noper);
3048 e= uc + STR_LEN(noper);
3055 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
3056 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
3057 regardless of encoding */
3058 if (OP( noper ) == EXACTFUP) {
3059 /* false positives are ok, so just set this */
3060 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
3064 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
3066 TRIE_CHARCOUNT(trie)++;
3069 /* TRIE_READ_CHAR returns the current character, or its fold if /i
3070 * is in effect. Under /i, this character can match itself, or
3071 * anything that folds to it. If not under /i, it can match just
3072 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
3073 * all fold to k, and all are single characters. But some folds
3074 * expand to more than one character, so for example LATIN SMALL
3075 * LIGATURE FFI folds to the three character sequence 'ffi'. If
3076 * the string beginning at 'uc' is 'ffi', it could be matched by
3077 * three characters, or just by the one ligature character. (It
3078 * could also be matched by two characters: LATIN SMALL LIGATURE FF
3079 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
3080 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
3081 * match.) The trie needs to know the minimum and maximum number
3082 * of characters that could match so that it can use size alone to
3083 * quickly reject many match attempts. The max is simple: it is
3084 * the number of folded characters in this branch (since a fold is
3085 * never shorter than what folds to it. */
3089 /* And the min is equal to the max if not under /i (indicated by
3090 * 'folder' being NULL), or there are no multi-character folds. If
3091 * there is a multi-character fold, the min is incremented just
3092 * once, for the character that folds to the sequence. Each
3093 * character in the sequence needs to be added to the list below of
3094 * characters in the trie, but we count only the first towards the
3095 * min number of characters needed. This is done through the
3096 * variable 'foldlen', which is returned by the macros that look
3097 * for these sequences as the number of bytes the sequence
3098 * occupies. Each time through the loop, we decrement 'foldlen' by
3099 * how many bytes the current char occupies. Only when it reaches
3100 * 0 do we increment 'minchars' or look for another multi-character
3102 if (folder == NULL) {
3105 else if (foldlen > 0) {
3106 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
3111 /* See if *uc is the beginning of a multi-character fold. If
3112 * so, we decrement the length remaining to look at, to account
3113 * for the current character this iteration. (We can use 'uc'
3114 * instead of the fold returned by TRIE_READ_CHAR because the
3115 * macro is smart enough to account for any unfolded
3118 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
3119 foldlen -= UTF8SKIP(uc);
3122 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
3127 /* The current character (and any potential folds) should be added
3128 * to the possible matching characters for this position in this
3132 U8 folded= folder[ (U8) uvc ];
3133 if ( !trie->charmap[ folded ] ) {
3134 trie->charmap[ folded ]=( ++trie->uniquecharcount );
3135 TRIE_STORE_REVCHAR( folded );
3138 if ( !trie->charmap[ uvc ] ) {
3139 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
3140 TRIE_STORE_REVCHAR( uvc );
3143 /* store the codepoint in the bitmap, and its folded
3145 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
3146 set_bit = 0; /* We've done our bit :-) */
3150 /* XXX We could come up with the list of code points that fold
3151 * to this using PL_utf8_foldclosures, except not for
3152 * multi-char folds, as there may be multiple combinations
3153 * there that could work, which needs to wait until runtime to
3154 * resolve (The comment about LIGATURE FFI above is such an
3159 widecharmap = newHV();
3161 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
3164 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3166 if ( !SvTRUE( *svpp ) ) {
3167 sv_setiv( *svpp, ++trie->uniquecharcount );
3168 TRIE_STORE_REVCHAR(uvc);
3171 } /* end loop through characters in this branch of the trie */
3173 /* We take the min and max for this branch and combine to find the min
3174 * and max for all branches processed so far */
3175 if( cur == first ) {
3176 trie->minlen = minchars;
3177 trie->maxlen = maxchars;
3178 } else if (minchars < trie->minlen) {
3179 trie->minlen = minchars;
3180 } else if (maxchars > trie->maxlen) {
3181 trie->maxlen = maxchars;
3183 } /* end first pass */
3184 DEBUG_TRIE_COMPILE_r(
3185 Perl_re_indentf( aTHX_
3186 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3188 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3189 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3190 (int)trie->minlen, (int)trie->maxlen )
3194 We now know what we are dealing with in terms of unique chars and
3195 string sizes so we can calculate how much memory a naive
3196 representation using a flat table will take. If it's over a reasonable
3197 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3198 conservative but potentially much slower representation using an array
3201 At the end we convert both representations into the same compressed
3202 form that will be used in regexec.c for matching with. The latter
3203 is a form that cannot be used to construct with but has memory
3204 properties similar to the list form and access properties similar
3205 to the table form making it both suitable for fast searches and
3206 small enough that its feasable to store for the duration of a program.
3208 See the comment in the code where the compressed table is produced
3209 inplace from the flat tabe representation for an explanation of how
3210 the compression works.
3215 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3218 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3219 > SvIV(re_trie_maxbuff) )
3222 Second Pass -- Array Of Lists Representation
3224 Each state will be represented by a list of charid:state records
3225 (reg_trie_trans_le) the first such element holds the CUR and LEN
3226 points of the allocated array. (See defines above).
3228 We build the initial structure using the lists, and then convert
3229 it into the compressed table form which allows faster lookups
3230 (but cant be modified once converted).
3233 STRLEN transcount = 1;
3235 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3238 trie->states = (reg_trie_state *)
3239 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3240 sizeof(reg_trie_state) );
3244 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3246 regnode *noper = REGNODE_AFTER( cur );
3247 U32 state = 1; /* required init */
3248 U16 charid = 0; /* sanity init */
3249 U32 wordlen = 0; /* required init */
3251 if (OP(noper) == NOTHING) {
3252 regnode *noper_next= regnext(noper);
3253 if (noper_next < tail)
3255 /* we will undo this assignment if noper does not
3256 * point at a trieable type in the else clause of
3257 * the following statement. */
3261 && ( OP(noper) == flags
3262 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3263 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3264 || OP(noper) == EXACTFUP))))
3266 const U8 *uc= (U8*)STRING(noper);
3267 const U8 *e= uc + STR_LEN(noper);
3269 for ( ; uc < e ; uc += len ) {
3274 charid = trie->charmap[ uvc ];
3276 SV** const svpp = hv_fetch( widecharmap,
3283 charid=(U16)SvIV( *svpp );
3286 /* charid is now 0 if we dont know the char read, or
3287 * nonzero if we do */
3294 if ( !trie->states[ state ].trans.list ) {
3295 TRIE_LIST_NEW( state );
3298 check <= TRIE_LIST_USED( state );
3301 if ( TRIE_LIST_ITEM( state, check ).forid
3304 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3309 newstate = next_alloc++;
3310 prev_states[newstate] = state;
3311 TRIE_LIST_PUSH( state, charid, newstate );
3316 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3320 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3321 * on a trieable type. So we need to reset noper back to point at the first regop
3322 * in the branch before we call TRIE_HANDLE_WORD()
3324 noper= REGNODE_AFTER(cur);
3326 TRIE_HANDLE_WORD(state);
3328 } /* end second pass */
3330 /* next alloc is the NEXT state to be allocated */
3331 trie->statecount = next_alloc;
3332 trie->states = (reg_trie_state *)
3333 PerlMemShared_realloc( trie->states,
3335 * sizeof(reg_trie_state) );
3337 /* and now dump it out before we compress it */
3338 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3339 revcharmap, next_alloc,
3343 trie->trans = (reg_trie_trans *)
3344 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3351 for( state=1 ; state < next_alloc ; state ++ ) {
3355 DEBUG_TRIE_COMPILE_MORE_r(
3356 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3360 if (trie->states[state].trans.list) {
3361 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3365 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3366 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3367 if ( forid < minid ) {
3369 } else if ( forid > maxid ) {
3373 if ( transcount < tp + maxid - minid + 1) {
3375 trie->trans = (reg_trie_trans *)
3376 PerlMemShared_realloc( trie->trans,
3378 * sizeof(reg_trie_trans) );
3379 Zero( trie->trans + (transcount / 2),
3383 base = trie->uniquecharcount + tp - minid;
3384 if ( maxid == minid ) {
3386 for ( ; zp < tp ; zp++ ) {
3387 if ( ! trie->trans[ zp ].next ) {
3388 base = trie->uniquecharcount + zp - minid;
3389 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3391 trie->trans[ zp ].check = state;
3397 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3399 trie->trans[ tp ].check = state;
3404 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3405 const U32 tid = base
3406 - trie->uniquecharcount
3407 + TRIE_LIST_ITEM( state, idx ).forid;
3408 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3410 trie->trans[ tid ].check = state;
3412 tp += ( maxid - minid + 1 );
3414 Safefree(trie->states[ state ].trans.list);
3417 DEBUG_TRIE_COMPILE_MORE_r(
3418 Perl_re_printf( aTHX_ " base: %d\n",base);
3421 trie->states[ state ].trans.base=base;
3423 trie->lasttrans = tp + 1;
3427 Second Pass -- Flat Table Representation.
3429 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3430 each. We know that we will need Charcount+1 trans at most to store
3431 the data (one row per char at worst case) So we preallocate both
3432 structures assuming worst case.
3434 We then construct the trie using only the .next slots of the entry
3437 We use the .check field of the first entry of the node temporarily
3438 to make compression both faster and easier by keeping track of how
3439 many non zero fields are in the node.
3441 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3444 There are two terms at use here: state as a TRIE_NODEIDX() which is
3445 a number representing the first entry of the node, and state as a
3446 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3447 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3448 if there are 2 entrys per node. eg:
3456 The table is internally in the right hand, idx form. However as we
3457 also have to deal with the states array which is indexed by nodenum
3458 we have to use TRIE_NODENUM() to convert.
3461 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3464 trie->trans = (reg_trie_trans *)
3465 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3466 * trie->uniquecharcount + 1,
3467 sizeof(reg_trie_trans) );
3468 trie->states = (reg_trie_state *)
3469 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3470 sizeof(reg_trie_state) );
3471 next_alloc = trie->uniquecharcount + 1;
3474 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3476 regnode *noper = REGNODE_AFTER( cur );
3478 U32 state = 1; /* required init */
3480 U16 charid = 0; /* sanity init */
3481 U32 accept_state = 0; /* sanity init */
3483 U32 wordlen = 0; /* required init */
3485 if (OP(noper) == NOTHING) {
3486 regnode *noper_next= regnext(noper);
3487 if (noper_next < tail)
3489 /* we will undo this assignment if noper does not
3490 * point at a trieable type in the else clause of
3491 * the following statement. */
3495 && ( OP(noper) == flags
3496 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3497 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3498 || OP(noper) == EXACTFUP))))
3500 const U8 *uc= (U8*)STRING(noper);
3501 const U8 *e= uc + STR_LEN(noper);
3503 for ( ; uc < e ; uc += len ) {
3508 charid = trie->charmap[ uvc ];
3510 SV* const * const svpp = hv_fetch( widecharmap,
3514 charid = svpp ? (U16)SvIV(*svpp) : 0;
3518 if ( !trie->trans[ state + charid ].next ) {
3519 trie->trans[ state + charid ].next = next_alloc;
3520 trie->trans[ state ].check++;
3521 prev_states[TRIE_NODENUM(next_alloc)]
3522 = TRIE_NODENUM(state);
3523 next_alloc += trie->uniquecharcount;
3525 state = trie->trans[ state + charid ].next;
3527 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3529 /* charid is now 0 if we dont know the char read, or
3530 * nonzero if we do */
3533 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3534 * on a trieable type. So we need to reset noper back to point at the first regop
3535 * in the branch before we call TRIE_HANDLE_WORD().
3537 noper= REGNODE_AFTER(cur);
3539 accept_state = TRIE_NODENUM( state );
3540 TRIE_HANDLE_WORD(accept_state);
3542 } /* end second pass */
3544 /* and now dump it out before we compress it */
3545 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3547 next_alloc, depth+1));
3551 * Inplace compress the table.*
3553 For sparse data sets the table constructed by the trie algorithm will
3554 be mostly 0/FAIL transitions or to put it another way mostly empty.
3555 (Note that leaf nodes will not contain any transitions.)
3557 This algorithm compresses the tables by eliminating most such
3558 transitions, at the cost of a modest bit of extra work during lookup:
3560 - Each states[] entry contains a .base field which indicates the
3561 index in the state[] array wheres its transition data is stored.
3563 - If .base is 0 there are no valid transitions from that node.
3565 - If .base is nonzero then charid is added to it to find an entry in
3568 -If trans[states[state].base+charid].check!=state then the
3569 transition is taken to be a 0/Fail transition. Thus if there are fail
3570 transitions at the front of the node then the .base offset will point
3571 somewhere inside the previous nodes data (or maybe even into a node
3572 even earlier), but the .check field determines if the transition is
3576 The following process inplace converts the table to the compressed
3577 table: We first do not compress the root node 1,and mark all its
3578 .check pointers as 1 and set its .base pointer as 1 as well. This
3579 allows us to do a DFA construction from the compressed table later,
3580 and ensures that any .base pointers we calculate later are greater
3583 - We set 'pos' to indicate the first entry of the second node.
3585 - We then iterate over the columns of the node, finding the first and
3586 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3587 and set the .check pointers accordingly, and advance pos
3588 appropriately and repreat for the next node. Note that when we copy
3589 the next pointers we have to convert them from the original
3590 NODEIDX form to NODENUM form as the former is not valid post
3593 - If a node has no transitions used we mark its base as 0 and do not
3594 advance the pos pointer.
3596 - If a node only has one transition we use a second pointer into the
3597 structure to fill in allocated fail transitions from other states.
3598 This pointer is independent of the main pointer and scans forward
3599 looking for null transitions that are allocated to a state. When it
3600 finds one it writes the single transition into the "hole". If the
3601 pointer doesnt find one the single transition is appended as normal.
3603 - Once compressed we can Renew/realloc the structures to release the
3606 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3607 specifically Fig 3.47 and the associated pseudocode.
3611 const U32 laststate = TRIE_NODENUM( next_alloc );
3614 trie->statecount = laststate;
3616 for ( state = 1 ; state < laststate ; state++ ) {
3618 const U32 stateidx = TRIE_NODEIDX( state );
3619 const U32 o_used = trie->trans[ stateidx ].check;
3620 U32 used = trie->trans[ stateidx ].check;
3621 trie->trans[ stateidx ].check = 0;
3624 used && charid < trie->uniquecharcount;
3627 if ( flag || trie->trans[ stateidx + charid ].next ) {
3628 if ( trie->trans[ stateidx + charid ].next ) {
3630 for ( ; zp < pos ; zp++ ) {
3631 if ( ! trie->trans[ zp ].next ) {
3635 trie->states[ state ].trans.base
3637 + trie->uniquecharcount
3639 trie->trans[ zp ].next
3640 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3642 trie->trans[ zp ].check = state;
3643 if ( ++zp > pos ) pos = zp;
3650 trie->states[ state ].trans.base
3651 = pos + trie->uniquecharcount - charid ;
3653 trie->trans[ pos ].next
3654 = SAFE_TRIE_NODENUM(
3655 trie->trans[ stateidx + charid ].next );
3656 trie->trans[ pos ].check = state;
3661 trie->lasttrans = pos + 1;
3662 trie->states = (reg_trie_state *)
3663 PerlMemShared_realloc( trie->states, laststate
3664 * sizeof(reg_trie_state) );
3665 DEBUG_TRIE_COMPILE_MORE_r(
3666 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3668 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3672 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3675 } /* end table compress */
3677 DEBUG_TRIE_COMPILE_MORE_r(
3678 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3680 (UV)trie->statecount,
3681 (UV)trie->lasttrans)
3683 /* resize the trans array to remove unused space */
3684 trie->trans = (reg_trie_trans *)
3685 PerlMemShared_realloc( trie->trans, trie->lasttrans
3686 * sizeof(reg_trie_trans) );
3688 { /* Modify the program and insert the new TRIE node */
3689 U8 nodetype =(U8) flags;
3693 regnode *optimize = NULL;
3694 #endif /* DEBUGGING */
3696 This means we convert either the first branch or the first Exact,
3697 depending on whether the thing following (in 'last') is a branch
3698 or not and whther first is the startbranch (ie is it a sub part of
3699 the alternation or is it the whole thing.)
3700 Assuming its a sub part we convert the EXACT otherwise we convert
3701 the whole branch sequence, including the first.
3703 /* Find the node we are going to overwrite */
3704 if ( first != startbranch || OP( last ) == BRANCH ) {
3705 /* branch sub-chain */
3706 NEXT_OFF( first ) = (U16)(last - first);
3707 /* whole branch chain */
3709 /* But first we check to see if there is a common prefix we can
3710 split out as an EXACT and put in front of the TRIE node. */
3711 trie->startstate= 1;
3712 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3713 /* we want to find the first state that has more than
3714 * one transition, if that state is not the first state
3715 * then we have a common prefix which we can remove.
3718 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3720 I32 first_ofs = -1; /* keeps track of the ofs of the first
3721 transition, -1 means none */
3723 const U32 base = trie->states[ state ].trans.base;
3725 /* does this state terminate an alternation? */
3726 if ( trie->states[state].wordnum )
3729 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3730 if ( ( base + ofs >= trie->uniquecharcount ) &&
3731 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3732 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3734 if ( ++count > 1 ) {
3735 /* we have more than one transition */
3738 /* if this is the first state there is no common prefix
3739 * to extract, so we can exit */
3740 if ( state == 1 ) break;
3741 tmp = av_fetch( revcharmap, ofs, 0);
3742 ch = (U8*)SvPV_nolen_const( *tmp );
3744 /* if we are on count 2 then we need to initialize the
3745 * bitmap, and store the previous char if there was one
3748 /* clear the bitmap */
3749 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3751 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3754 if (first_ofs >= 0) {
3755 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3756 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3758 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3760 Perl_re_printf( aTHX_ "%s", (char*)ch)
3764 /* store the current firstchar in the bitmap */
3765 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3766 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3772 /* This state has only one transition, its transition is part
3773 * of a common prefix - we need to concatenate the char it
3774 * represents to what we have so far. */
3775 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3777 char *ch = SvPV( *tmp, len );
3779 SV *sv=sv_newmortal();
3780 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3782 (UV)state, (UV)first_ofs,
3783 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3784 PL_colors[0], PL_colors[1],
3785 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3786 PERL_PV_ESCAPE_FIRSTCHAR
3791 OP( convert ) = nodetype;
3792 str=STRING(convert);
3793 setSTR_LEN(convert, 0);
3795 assert( ( STR_LEN(convert) + len ) < 256 );
3796 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3802 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3807 trie->prefixlen = (state-1);
3809 regnode *n = REGNODE_AFTER(convert);
3810 assert( n - convert <= U16_MAX );
3811 NEXT_OFF(convert) = n - convert;
3812 trie->startstate = state;
3813 trie->minlen -= (state - 1);
3814 trie->maxlen -= (state - 1);
3816 /* At least the UNICOS C compiler choked on this
3817 * being argument to DEBUG_r(), so let's just have
3820 #ifdef PERL_EXT_RE_BUILD
3826 U32 word = trie->wordcount;
3828 SV ** const tmp = av_fetch( trie_words, word, 0 );
3830 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3831 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3833 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3841 NEXT_OFF(convert) = (U16)(tail - convert);
3842 DEBUG_r(optimize= n);
3848 if ( trie->maxlen ) {
3849 NEXT_OFF( convert ) = (U16)(tail - convert);
3850 ARG_SET( convert, data_slot );
3851 /* Store the offset to the first unabsorbed branch in
3852 jump[0], which is otherwise unused by the jump logic.
3853 We use this when dumping a trie and during optimisation. */
3855 trie->jump[0] = (U16)(nextbranch - convert);
3857 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3858 * and there is a bitmap
3859 * and the first "jump target" node we found leaves enough room
3860 * then convert the TRIE node into a TRIEC node, with the bitmap
3861 * embedded inline in the opcode - this is hypothetically faster.
3863 if ( !trie->states[trie->startstate].wordnum
3865 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3867 OP( convert ) = TRIEC;
3868 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3869 PerlMemShared_free(trie->bitmap);
3872 OP( convert ) = TRIE;
3874 /* store the type in the flags */
3875 convert->flags = nodetype;
3879 + REGNODE_ARG_LEN( OP( convert ) );
3881 /* XXX We really should free up the resource in trie now,
3882 as we won't use them - (which resources?) dmq */
3884 /* needed for dumping*/
3885 DEBUG_r(if (optimize) {
3887 Try to clean up some of the debris left after the
3890 while( optimize < jumper ) {
3891 OP( optimize ) = OPTIMIZED;
3895 } /* end node insert */
3897 /* Finish populating the prev field of the wordinfo array. Walk back
3898 * from each accept state until we find another accept state, and if
3899 * so, point the first word's .prev field at the second word. If the
3900 * second already has a .prev field set, stop now. This will be the
3901 * case either if we've already processed that word's accept state,
3902 * or that state had multiple words, and the overspill words were
3903 * already linked up earlier.
3910 for (word=1; word <= trie->wordcount; word++) {
3912 if (trie->wordinfo[word].prev)
3914 state = trie->wordinfo[word].accept;
3916 state = prev_states[state];
3919 prev = trie->states[state].wordnum;
3923 trie->wordinfo[word].prev = prev;
3925 Safefree(prev_states);
3929 /* and now dump out the compressed format */
3930 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3932 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3934 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3935 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3937 SvREFCNT_dec_NN(revcharmap);
3941 : trie->startstate>1
3947 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3949 /* The Trie is constructed and compressed now so we can build a fail array if
3952 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3954 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3958 We find the fail state for each state in the trie, this state is the longest
3959 proper suffix of the current state's 'word' that is also a proper prefix of
3960 another word in our trie. State 1 represents the word '' and is thus the
3961 default fail state. This allows the DFA not to have to restart after its
3962 tried and failed a word at a given point, it simply continues as though it
3963 had been matching the other word in the first place.
3965 'abcdgu'=~/abcdefg|cdgu/
3966 When we get to 'd' we are still matching the first word, we would encounter
3967 'g' which would fail, which would bring us to the state representing 'd' in
3968 the second word where we would try 'g' and succeed, proceeding to match
3971 /* add a fail transition */
3972 const U32 trie_offset = ARG(source);
3973 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3975 const U32 ucharcount = trie->uniquecharcount;
3976 const U32 numstates = trie->statecount;
3977 const U32 ubound = trie->lasttrans + ucharcount;
3981 U32 base = trie->states[ 1 ].trans.base;
3984 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3986 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3988 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3989 PERL_UNUSED_CONTEXT;
3991 PERL_UNUSED_ARG(depth);
3994 if ( OP(source) == TRIE ) {
3995 struct regnode_1 *op = (struct regnode_1 *)
3996 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3997 StructCopy(source, op, struct regnode_1);
3998 stclass = (regnode *)op;
4000 struct regnode_charclass *op = (struct regnode_charclass *)
4001 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4002 StructCopy(source, op, struct regnode_charclass);
4003 stclass = (regnode *)op;
4005 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
4007 ARG_SET( stclass, data_slot );
4008 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
4009 RExC_rxi->data->data[ data_slot ] = (void*)aho;
4010 aho->trie=trie_offset;
4011 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
4012 Copy( trie->states, aho->states, numstates, reg_trie_state );
4013 Newx( q, numstates, U32);
4014 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
4017 /* initialize fail[0..1] to be 1 so that we always have
4018 a valid final fail state */
4019 fail[ 0 ] = fail[ 1 ] = 1;
4021 for ( charid = 0; charid < ucharcount ; charid++ ) {
4022 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
4024 q[ q_write ] = newstate;
4025 /* set to point at the root */
4026 fail[ q[ q_write++ ] ]=1;
4029 while ( q_read < q_write) {
4030 const U32 cur = q[ q_read++ % numstates ];
4031 base = trie->states[ cur ].trans.base;
4033 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
4034 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
4036 U32 fail_state = cur;
4039 fail_state = fail[ fail_state ];
4040 fail_base = aho->states[ fail_state ].trans.base;
4041 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
4043 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
4044 fail[ ch_state ] = fail_state;
4045 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
4047 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
4049 q[ q_write++ % numstates] = ch_state;
4053 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
4054 when we fail in state 1, this allows us to use the
4055 charclass scan to find a valid start char. This is based on the principle
4056 that theres a good chance the string being searched contains lots of stuff
4057 that cant be a start char.
4059 fail[ 0 ] = fail[ 1 ] = 0;
4060 DEBUG_TRIE_COMPILE_r({
4061 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
4062 depth, (UV)numstates
4064 for( q_read=1; q_read<numstates; q_read++ ) {
4065 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
4067 Perl_re_printf( aTHX_ "\n");
4070 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
4075 /* The below joins as many adjacent EXACTish nodes as possible into a single
4076 * one. The regop may be changed if the node(s) contain certain sequences that
4077 * require special handling. The joining is only done if:
4078 * 1) there is room in the current conglomerated node to entirely contain the
4080 * 2) they are compatible node types
4082 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
4083 * these get optimized out
4085 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
4086 * as possible, even if that means splitting an existing node so that its first
4087 * part is moved to the preceding node. This would maximise the efficiency of
4088 * memEQ during matching.
4090 * If a node is to match under /i (folded), the number of characters it matches
4091 * can be different than its character length if it contains a multi-character
4092 * fold. *min_subtract is set to the total delta number of characters of the
4095 * And *unfolded_multi_char is set to indicate whether or not the node contains
4096 * an unfolded multi-char fold. This happens when it won't be known until
4097 * runtime whether the fold is valid or not; namely
4098 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
4099 * target string being matched against turns out to be UTF-8 is that fold
4101 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
4103 * (Multi-char folds whose components are all above the Latin1 range are not
4104 * run-time locale dependent, and have already been folded by the time this
4105 * function is called.)
4107 * This is as good a place as any to discuss the design of handling these
4108 * multi-character fold sequences. It's been wrong in Perl for a very long
4109 * time. There are three code points in Unicode whose multi-character folds
4110 * were long ago discovered to mess things up. The previous designs for
4111 * dealing with these involved assigning a special node for them. This
4112 * approach doesn't always work, as evidenced by this example:
4113 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
4114 * Both sides fold to "sss", but if the pattern is parsed to create a node that
4115 * would match just the \xDF, it won't be able to handle the case where a
4116 * successful match would have to cross the node's boundary. The new approach
4117 * that hopefully generally solves the problem generates an EXACTFUP node
4118 * that is "sss" in this case.
4120 * It turns out that there are problems with all multi-character folds, and not
4121 * just these three. Now the code is general, for all such cases. The
4122 * approach taken is:
4123 * 1) This routine examines each EXACTFish node that could contain multi-
4124 * character folded sequences. Since a single character can fold into
4125 * such a sequence, the minimum match length for this node is less than
4126 * the number of characters in the node. This routine returns in
4127 * *min_subtract how many characters to subtract from the actual
4128 * length of the string to get a real minimum match length; it is 0 if
4129 * there are no multi-char foldeds. This delta is used by the caller to
4130 * adjust the min length of the match, and the delta between min and max,
4131 * so that the optimizer doesn't reject these possibilities based on size
4134 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4135 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4136 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4137 * EXACTFU nodes. The node type of such nodes is then changed to
4138 * EXACTFUP, indicating it is problematic, and needs careful handling.
4139 * (The procedures in step 1) above are sufficient to handle this case in
4140 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4141 * the only case where there is a possible fold length change in non-UTF-8
4142 * patterns. By reserving a special node type for problematic cases, the
4143 * far more common regular EXACTFU nodes can be processed faster.
4144 * regexec.c takes advantage of this.
4146 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4147 * problematic cases. These all only occur when the pattern is not
4148 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4149 * length change, it handles the situation where the string cannot be
4150 * entirely folded. The strings in an EXACTFish node are folded as much
4151 * as possible during compilation in regcomp.c. This saves effort in
4152 * regex matching. By using an EXACTFUP node when it is not possible to
4153 * fully fold at compile time, regexec.c can know that everything in an
4154 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4155 * case where folding in EXACTFU nodes can't be done at compile time is
4156 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4157 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4158 * handle two very different cases. Alternatively, there could have been
4159 * a node type where there are length changes, one for unfolded, and one
4160 * for both. If yet another special case needed to be created, the number
4161 * of required node types would have to go to 7. khw figures that even
4162 * though there are plenty of node types to spare, that the maintenance
4163 * cost wasn't worth the small speedup of doing it that way, especially
4164 * since he thinks the MICRO SIGN is rarely encountered in practice.
4166 * There are other cases where folding isn't done at compile time, but
4167 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4168 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4169 * changes. Some folds in EXACTF depend on if the runtime target string
4170 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4171 * when no fold in it depends on the UTF-8ness of the target string.)
4173 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4174 * validity of the fold won't be known until runtime, and so must remain
4175 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4176 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4177 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4178 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4179 * The reason this is a problem is that the optimizer part of regexec.c
4180 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4181 * that a character in the pattern corresponds to at most a single
4182 * character in the target string. (And I do mean character, and not byte
4183 * here, unlike other parts of the documentation that have never been
4184 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4185 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4186 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4187 * EXACTFL nodes, violate the assumption, and they are the only instances
4188 * where it is violated. I'm reluctant to try to change the assumption,
4189 * as the code involved is impenetrable to me (khw), so instead the code
4190 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4191 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4192 * boolean indicating whether or not the node contains such a fold. When
4193 * it is true, the caller sets a flag that later causes the optimizer in
4194 * this file to not set values for the floating and fixed string lengths,
4195 * and thus avoids the optimizer code in regexec.c that makes the invalid
4196 * assumption. Thus, there is no optimization based on string lengths for
4197 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4198 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4199 * assumption is wrong only in these cases is that all other non-UTF-8
4200 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4201 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4202 * EXACTF nodes because we don't know at compile time if it actually
4203 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4204 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4205 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4206 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4207 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4208 * string would require the pattern to be forced into UTF-8, the overhead
4209 * of which we want to avoid. Similarly the unfolded multi-char folds in
4210 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4213 * Similarly, the code that generates tries doesn't currently handle
4214 * not-already-folded multi-char folds, and it looks like a pain to change
4215 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4216 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4217 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4218 * using /iaa matching will be doing so almost entirely with ASCII
4219 * strings, so this should rarely be encountered in practice */
4222 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4223 UV *min_subtract, bool *unfolded_multi_char,
4224 U32 flags, regnode *val, U32 depth)
4226 /* Merge several consecutive EXACTish nodes into one. */
4228 regnode *n = regnext(scan);
4230 regnode *next = REGNODE_AFTER_varies(scan);
4234 regnode *stop = scan;
4235 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4237 PERL_UNUSED_ARG(depth);
4240 PERL_ARGS_ASSERT_JOIN_EXACT;
4241 #ifndef EXPERIMENTAL_INPLACESCAN
4242 PERL_UNUSED_ARG(flags);
4243 PERL_UNUSED_ARG(val);
4245 DEBUG_PEEP("join", scan, depth, 0);
4247 assert(REGNODE_TYPE(OP(scan)) == EXACT);
4249 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4250 * EXACT ones that are mergeable to the current one. */
4252 && ( REGNODE_TYPE(OP(n)) == NOTHING
4253 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
4255 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4258 if (OP(n) == TAIL || n > next)
4260 if (REGNODE_TYPE(OP(n)) == NOTHING) {
4261 DEBUG_PEEP("skip:", n, depth, 0);
4262 NEXT_OFF(scan) += NEXT_OFF(n);
4263 next = n + NODE_STEP_REGNODE;
4270 else if (stringok) {
4271 const unsigned int oldl = STR_LEN(scan);
4272 regnode * const nnext = regnext(n);
4274 /* XXX I (khw) kind of doubt that this works on platforms (should
4275 * Perl ever run on one) where U8_MAX is above 255 because of lots
4276 * of other assumptions */
4277 /* Don't join if the sum can't fit into a single node */
4278 if (oldl + STR_LEN(n) > U8_MAX)
4281 /* Joining something that requires UTF-8 with something that
4282 * doesn't, means the result requires UTF-8. */
4283 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4284 OP(scan) = EXACT_REQ8;
4286 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4287 ; /* join is compatible, no need to change OP */
4289 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4290 OP(scan) = EXACTFU_REQ8;
4292 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4293 ; /* join is compatible, no need to change OP */
4295 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4296 ; /* join is compatible, no need to change OP */
4298 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4300 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4301 * which can join with EXACTFU ones. We check for this case
4302 * here. These need to be resolved to either EXACTFU or
4303 * EXACTF at joining time. They have nothing in them that
4304 * would forbid them from being the more desirable EXACTFU
4305 * nodes except that they begin and/or end with a single [Ss].
4306 * The reason this is problematic is because they could be
4307 * joined in this loop with an adjacent node that ends and/or
4308 * begins with [Ss] which would then form the sequence 'ss',
4309 * which matches differently under /di than /ui, in which case
4310 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4311 * formed, the nodes get absorbed into any adjacent EXACTFU
4312 * node. And if the only adjacent node is EXACTF, they get
4313 * absorbed into that, under the theory that a longer node is
4314 * better than two shorter ones, even if one is EXACTFU. Note
4315 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4316 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4318 if (STRING(n)[STR_LEN(n)-1] == 's') {
4320 /* Here the joined node would end with 's'. If the node
4321 * following the combination is an EXACTF one, it's better to
4322 * join this trailing edge 's' node with that one, leaving the
4323 * current one in 'scan' be the more desirable EXACTFU */
4324 if (OP(nnext) == EXACTF) {
4328 OP(scan) = EXACTFU_S_EDGE;
4330 } /* Otherwise, the beginning 's' of the 2nd node just
4331 becomes an interior 's' in 'scan' */
4333 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4334 ; /* join is compatible, no need to change OP */
4336 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4338 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4339 * nodes. But the latter nodes can be also joined with EXACTFU
4340 * ones, and that is a better outcome, so if the node following
4341 * 'n' is EXACTFU, quit now so that those two can be joined
4343 if (OP(nnext) == EXACTFU) {
4347 /* The join is compatible, and the combined node will be
4348 * EXACTF. (These don't care if they begin or end with 's' */
4350 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4351 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4352 && STRING(n)[0] == 's')
4354 /* When combined, we have the sequence 'ss', which means we
4355 * have to remain /di */
4359 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4360 if (STRING(n)[0] == 's') {
4361 ; /* Here the join is compatible and the combined node
4362 starts with 's', no need to change OP */
4364 else { /* Now the trailing 's' is in the interior */
4368 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4370 /* The join is compatible, and the combined node will be
4371 * EXACTF. (These don't care if they begin or end with 's' */
4374 else if (OP(scan) != OP(n)) {
4376 /* The only other compatible joinings are the same node type */
4380 DEBUG_PEEP("merg", n, depth, 0);
4383 next = REGNODE_AFTER_varies(n);
4384 NEXT_OFF(scan) += NEXT_OFF(n);
4385 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4386 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4387 /* Now we can overwrite *n : */
4388 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4396 #ifdef EXPERIMENTAL_INPLACESCAN
4397 if (flags && !NEXT_OFF(n)) {
4398 DEBUG_PEEP("atch", val, depth, 0);
4399 if (REGNODE_OFF_BY_ARG(OP(n))) {
4400 ARG_SET(n, val - n);
4403 NEXT_OFF(n) = val - n;
4410 /* This temporary node can now be turned into EXACTFU, and must, as
4411 * regexec.c doesn't handle it */
4412 if (OP(scan) == EXACTFU_S_EDGE) {
4417 *unfolded_multi_char = FALSE;
4419 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4420 * can now analyze for sequences of problematic code points. (Prior to
4421 * this final joining, sequences could have been split over boundaries, and
4422 * hence missed). The sequences only happen in folding, hence for any
4423 * non-EXACT EXACTish node */
4424 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4425 U8* s0 = (U8*) STRING(scan);
4427 U8* s_end = s0 + STR_LEN(scan);
4429 int total_count_delta = 0; /* Total delta number of characters that
4430 multi-char folds expand to */
4432 /* One pass is made over the node's string looking for all the
4433 * possibilities. To avoid some tests in the loop, there are two main
4434 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4439 if (OP(scan) == EXACTFL) {
4442 /* An EXACTFL node would already have been changed to another
4443 * node type unless there is at least one character in it that
4444 * is problematic; likely a character whose fold definition
4445 * won't be known until runtime, and so has yet to be folded.
4446 * For all but the UTF-8 locale, folds are 1-1 in length, but
4447 * to handle the UTF-8 case, we need to create a temporary
4448 * folded copy using UTF-8 locale rules in order to analyze it.
4449 * This is because our macros that look to see if a sequence is
4450 * a multi-char fold assume everything is folded (otherwise the
4451 * tests in those macros would be too complicated and slow).
4452 * Note that here, the non-problematic folds will have already
4453 * been done, so we can just copy such characters. We actually
4454 * don't completely fold the EXACTFL string. We skip the
4455 * unfolded multi-char folds, as that would just create work
4456 * below to figure out the size they already are */
4458 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4461 STRLEN s_len = UTF8SKIP(s);
4462 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4463 Copy(s, d, s_len, U8);
4466 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4467 *unfolded_multi_char = TRUE;
4468 Copy(s, d, s_len, U8);
4471 else if (isASCII(*s)) {
4472 *(d++) = toFOLD(*s);
4476 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4482 /* Point the remainder of the routine to look at our temporary
4486 } /* End of creating folded copy of EXACTFL string */
4488 /* Examine the string for a multi-character fold sequence. UTF-8
4489 * patterns have all characters pre-folded by the time this code is
4491 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4492 length sequence we are looking for is 2 */
4494 int count = 0; /* How many characters in a multi-char fold */
4495 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4496 if (! len) { /* Not a multi-char fold: get next char */
4501 { /* Here is a generic multi-char fold. */
4502 U8* multi_end = s + len;
4504 /* Count how many characters are in it. In the case of
4505 * /aa, no folds which contain ASCII code points are
4506 * allowed, so check for those, and skip if found. */
4507 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4508 count = utf8_length(s, multi_end);
4512 while (s < multi_end) {
4515 goto next_iteration;
4525 /* The delta is how long the sequence is minus 1 (1 is how long
4526 * the character that folds to the sequence is) */
4527 total_count_delta += count - 1;
4531 /* We created a temporary folded copy of the string in EXACTFL
4532 * nodes. Therefore we need to be sure it doesn't go below zero,
4533 * as the real string could be shorter */
4534 if (OP(scan) == EXACTFL) {
4535 int total_chars = utf8_length((U8*) STRING(scan),
4536 (U8*) STRING(scan) + STR_LEN(scan));
4537 if (total_count_delta > total_chars) {
4538 total_count_delta = total_chars;
4542 *min_subtract += total_count_delta;
4545 else if (OP(scan) == EXACTFAA) {
4547 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4548 * fold to the ASCII range (and there are no existing ones in the
4549 * upper latin1 range). But, as outlined in the comments preceding
4550 * this function, we need to flag any occurrences of the sharp s.
4551 * This character forbids trie formation (because of added
4553 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4554 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4555 || UNICODE_DOT_DOT_VERSION > 0)
4557 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4558 OP(scan) = EXACTFAA_NO_TRIE;
4559 *unfolded_multi_char = TRUE;
4565 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4567 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4568 * folds that are all Latin1. As explained in the comments
4569 * preceding this function, we look also for the sharp s in EXACTF
4570 * and EXACTFL nodes; it can be in the final position. Otherwise
4571 * we can stop looking 1 byte earlier because have to find at least
4572 * two characters for a multi-fold */
4573 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4578 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4579 if (! len) { /* Not a multi-char fold. */
4580 if (*s == LATIN_SMALL_LETTER_SHARP_S
4581 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4583 *unfolded_multi_char = TRUE;
4590 && isALPHA_FOLD_EQ(*s, 's')
4591 && isALPHA_FOLD_EQ(*(s+1), 's'))
4594 /* EXACTF nodes need to know that the minimum length
4595 * changed so that a sharp s in the string can match this
4596 * ss in the pattern, but they remain EXACTF nodes, as they
4597 * won't match this unless the target string is in UTF-8,
4598 * which we don't know until runtime. EXACTFL nodes can't
4599 * transform into EXACTFU nodes */
4600 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4601 OP(scan) = EXACTFUP;
4605 *min_subtract += len - 1;
4613 /* Allow dumping but overwriting the collection of skipped
4614 * ops and/or strings with fake optimized ops */
4615 n = REGNODE_AFTER_varies(scan);
4623 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4627 /* REx optimizer. Converts nodes into quicker variants "in place".
4628 Finds fixed substrings. */
4630 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4631 to the position after last scanned or to NULL. */
4633 #define INIT_AND_WITHP \
4634 assert(!and_withp); \
4635 Newx(and_withp, 1, regnode_ssc); \
4636 SAVEFREEPV(and_withp)
4640 S_unwind_scan_frames(pTHX_ const void *p)
4642 scan_frame *f= (scan_frame *)p;
4644 scan_frame *n= f->next_frame;
4650 /* Follow the next-chain of the current node and optimize away
4651 all the NOTHINGs from it.
4654 S_rck_elide_nothing(pTHX_ regnode *node)
4656 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4658 if (OP(node) != CURLYX) {
4659 const int max = (REGNODE_OFF_BY_ARG(OP(node))
4661 /* I32 may be smaller than U16 on CRAYs! */
4662 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4663 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node));
4667 /* Skip NOTHING and LONGJMP. */
4671 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
4672 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4678 if (REGNODE_OFF_BY_ARG(OP(node)))
4681 NEXT_OFF(node) = off;
4686 /* the return from this sub is the minimum length that could possibly match */
4689 RExC_state_t *pRExC_state,
4690 regnode **scanp, /* Start here (read-write). */
4691 SSize_t *minlenp, /* used for the minlen of substrings? */
4692 SSize_t *deltap, /* Write maxlen-minlen here. */
4693 regnode *last, /* Stop before this one. */
4694 scan_data_t *data, /* string data about the pattern */
4695 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
4696 U32 recursed_depth, /* how deep have we recursed via GOSUB */
4697 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
4698 U32 flags, /* flags controlling this call, see SCF_ flags */
4699 U32 depth, /* how deep have we recursed period */
4700 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
4701 FALSE only if the caller (recursively) was
4702 prohibited from modifying the regops, because
4703 a higher caller is holding a ptr to them. */
4706 /* vars about the regnodes we are working with */
4707 regnode *scan = *scanp; /* the current opcode we are inspecting */
4708 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
4709 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
4710 the first non open regop, if the init
4711 val IS an OPEN then we will skip past
4712 it just after the var decls section */
4713 I32 code = 0; /* temp var used to hold the optype of a regop */
4715 /* vars about the min and max length of the pattern */
4716 SSize_t min = 0; /* min length of this part of the pattern */
4717 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
4718 this is adjusted down if we find
4720 SSize_t delta = 0; /* difference between min and max length
4721 (not accounting for stopmin) */
4723 /* vars about capture buffers in the pattern */
4724 I32 pars = 0; /* count of OPEN opcodes */
4725 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; /* is this op an OPEN? */
4727 /* vars about whether this pattern contains something that can match
4728 * infinitely long strings, eg, X* or X+ */
4729 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4730 int is_inf_internal = 0; /* The studied chunk is infinite */
4732 /* scan_data_t (struct) is used to hold information about the substrings
4733 * and start class we have extracted from the string */
4734 scan_data_t data_fake; /* temp var used for recursing in some cases */
4736 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
4737 trie optimizations */
4739 scan_frame *frame = NULL; /* used as part of fake recursion */
4741 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4743 PERL_ARGS_ASSERT_STUDY_CHUNK;
4744 RExC_study_started= 1;
4746 Zero(&data_fake, 1, scan_data_t);
4749 while (first_non_open && OP(first_non_open) == OPEN)
4750 first_non_open=regnext(first_non_open);
4755 RExC_study_chunk_recursed_count++;
4757 DEBUG_OPTIMISE_MORE_r(
4759 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4760 depth, (long)stopparen,
4761 (unsigned long)RExC_study_chunk_recursed_count,
4762 (unsigned long)depth, (unsigned long)recursed_depth,
4765 if (recursed_depth) {
4768 for ( j = 0 ; j < recursed_depth ; j++ ) {
4769 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4770 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4771 Perl_re_printf( aTHX_ " %d",(int)i);
4775 if ( j + 1 < recursed_depth ) {
4776 Perl_re_printf( aTHX_ ",");
4780 Perl_re_printf( aTHX_ "\n");
4783 while ( scan && OP(scan) != END && scan < last ){
4784 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4785 node length to get a real minimum (because
4786 the folded version may be shorter) */
4787 bool unfolded_multi_char = FALSE;
4788 /* avoid mutating ops if we are anywhere within the recursed or
4789 * enframed handling for a GOSUB: the outermost level will handle it.
4791 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4792 /* Peephole optimizer: */
4793 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
4794 DEBUG_PEEP("Peep", scan, depth, flags);
4797 /* The reason we do this here is that we need to deal with things like
4798 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4799 * parsing code, as each (?:..) is handled by a different invocation of
4802 if (REGNODE_TYPE(OP(scan)) == EXACT
4803 && OP(scan) != LEXACT
4804 && OP(scan) != LEXACT_REQ8
4807 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4808 0, NULL, depth + 1);
4811 /* Follow the next-chain of the current node and optimize
4812 away all the NOTHINGs from it.
4814 rck_elide_nothing(scan);
4816 /* The principal pseudo-switch. Cannot be a switch, since we look into
4817 * several different things. */
4818 if ( OP(scan) == DEFINEP ) {
4820 SSize_t deltanext = 0;
4821 SSize_t fake_last_close = 0;
4822 regnode *fake_last_close_op = NULL;
4823 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
4825 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4826 scan = regnext(scan);
4827 assert( OP(scan) == IFTHEN );
4828 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4830 data_fake.last_closep= &fake_last_close;
4831 data_fake.last_close_opp= &fake_last_close_op;
4833 next = regnext(scan);
4834 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
4835 DEBUG_PEEP("scan", scan, depth, flags);
4836 DEBUG_PEEP("next", next, depth, flags);
4838 /* we suppose the run is continuous, last=next...
4839 * NOTE we dont use the return here! */
4840 /* DEFINEP study_chunk() recursion */
4841 (void)study_chunk(pRExC_state, &scan, &minlen,
4842 &deltanext, next, &data_fake, stopparen,
4843 recursed_depth, NULL, f, depth+1, mutate_ok);
4848 OP(scan) == BRANCH ||
4849 OP(scan) == BRANCHJ ||
4852 next = regnext(scan);
4855 /* The op(next)==code check below is to see if we
4856 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4857 * IFTHEN is special as it might not appear in pairs.
4858 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4859 * we dont handle it cleanly. */
4860 if (OP(next) == code || code == IFTHEN) {
4861 /* NOTE - There is similar code to this block below for
4862 * handling TRIE nodes on a re-study. If you change stuff here
4863 * check there too. */
4864 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4866 regnode * const startbranch=scan;
4868 if (flags & SCF_DO_SUBSTR) {
4869 /* Cannot merge strings after this. */
4870 scan_commit(pRExC_state, data, minlenp, is_inf);
4873 if (flags & SCF_DO_STCLASS)
4874 ssc_init_zero(pRExC_state, &accum);
4876 while (OP(scan) == code) {
4877 SSize_t deltanext, minnext, fake_last_close = 0;
4878 regnode *fake_last_close_op = NULL;
4879 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
4880 regnode_ssc this_class;
4882 DEBUG_PEEP("Branch", scan, depth, flags);
4885 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4887 data_fake.whilem_c = data->whilem_c;
4888 data_fake.last_closep = data->last_closep;
4889 data_fake.last_close_opp = data->last_close_opp;
4892 data_fake.last_closep = &fake_last_close;
4893 data_fake.last_close_opp = &fake_last_close_op;
4896 data_fake.pos_delta = delta;
4897 next = regnext(scan);
4899 scan = REGNODE_AFTER_opcode(scan, code);
4901 if (flags & SCF_DO_STCLASS) {
4902 ssc_init(pRExC_state, &this_class);
4903 data_fake.start_class = &this_class;
4904 f |= SCF_DO_STCLASS_AND;
4906 if (flags & SCF_WHILEM_VISITED_POS)
4907 f |= SCF_WHILEM_VISITED_POS;
4909 /* we suppose the run is continuous, last=next...*/
4910 /* recurse study_chunk() for each BRANCH in an alternation */
4911 minnext = study_chunk(pRExC_state, &scan, minlenp,
4912 &deltanext, next, &data_fake, stopparen,
4913 recursed_depth, NULL, f, depth+1,
4918 if (deltanext == OPTIMIZE_INFTY) {
4919 is_inf = is_inf_internal = 1;
4920 max1 = OPTIMIZE_INFTY;
4921 } else if (max1 < minnext + deltanext)
4922 max1 = minnext + deltanext;
4924 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4926 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4927 if ( stopmin > minnext)
4928 stopmin = min + min1;
4929 flags &= ~SCF_DO_SUBSTR;
4931 data->flags |= SCF_SEEN_ACCEPT;
4934 if (data_fake.flags & SF_HAS_EVAL)
4935 data->flags |= SF_HAS_EVAL;
4936 data->whilem_c = data_fake.whilem_c;
4938 if (flags & SCF_DO_STCLASS)
4939 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4940 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
4942 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4944 if (flags & SCF_DO_SUBSTR) {
4945 data->pos_min += min1;
4946 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4947 data->pos_delta = OPTIMIZE_INFTY;
4949 data->pos_delta += max1 - min1;
4950 if (max1 != min1 || is_inf)
4951 data->cur_is_floating = 1;
4954 if (delta == OPTIMIZE_INFTY
4955 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4956 delta = OPTIMIZE_INFTY;
4958 delta += max1 - min1;
4959 if (flags & SCF_DO_STCLASS_OR) {
4960 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4962 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4963 flags &= ~SCF_DO_STCLASS;
4966 else if (flags & SCF_DO_STCLASS_AND) {
4968 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4969 flags &= ~SCF_DO_STCLASS;
4972 /* Switch to OR mode: cache the old value of
4973 * data->start_class */
4975 StructCopy(data->start_class, and_withp, regnode_ssc);
4976 flags &= ~SCF_DO_STCLASS_AND;
4977 StructCopy(&accum, data->start_class, regnode_ssc);
4978 flags |= SCF_DO_STCLASS_OR;
4981 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
4983 if (PERL_ENABLE_TRIE_OPTIMISATION
4984 && OP(startbranch) == BRANCH
4989 Assuming this was/is a branch we are dealing with: 'scan'
4990 now points at the item that follows the branch sequence,
4991 whatever it is. We now start at the beginning of the
4992 sequence and look for subsequences of
4998 which would be constructed from a pattern like
5001 If we can find such a subsequence we need to turn the first
5002 element into a trie and then add the subsequent branch exact
5003 strings to the trie.
5007 1. patterns where the whole set of branches can be
5010 2. patterns where only a subset can be converted.
5012 In case 1 we can replace the whole set with a single regop
5013 for the trie. In case 2 we need to keep the start and end
5016 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
5017 becomes BRANCH TRIE; BRANCH X;
5019 There is an additional case, that being where there is a
5020 common prefix, which gets split out into an EXACT like node
5021 preceding the TRIE node.
5023 If X(1..n)==tail then we can do a simple trie, if not we make
5024 a "jump" trie, such that when we match the appropriate word
5025 we "jump" to the appropriate tail node. Essentially we turn
5026 a nested if into a case structure of sorts.
5031 if (!re_trie_maxbuff) {
5032 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
5033 if (!SvIOK(re_trie_maxbuff))
5034 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
5036 if ( SvIV(re_trie_maxbuff)>=0 ) {
5038 regnode *first = (regnode *)NULL;
5039 regnode *prev = (regnode *)NULL;
5040 regnode *tail = scan;
5044 /* var tail is used because there may be a TAIL
5045 regop in the way. Ie, the exacts will point to the
5046 thing following the TAIL, but the last branch will
5047 point at the TAIL. So we advance tail. If we
5048 have nested (?:) we may have to move through several
5052 while ( OP( tail ) == TAIL ) {
5053 /* this is the TAIL generated by (?:) */
5054 tail = regnext( tail );
5058 DEBUG_TRIE_COMPILE_r({
5059 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
5060 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
5062 "Looking for TRIE'able sequences. Tail node is ",
5063 (UV) REGNODE_OFFSET(tail),
5064 SvPV_nolen_const( RExC_mysv )
5070 Step through the branches
5071 cur represents each branch,
5072 noper is the first thing to be matched as part
5074 noper_next is the regnext() of that node.
5076 We normally handle a case like this
5077 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
5078 support building with NOJUMPTRIE, which restricts
5079 the trie logic to structures like /FOO|BAR/.
5081 If noper is a trieable nodetype then the branch is
5082 a possible optimization target. If we are building
5083 under NOJUMPTRIE then we require that noper_next is
5084 the same as scan (our current position in the regex
5087 Once we have two or more consecutive such branches
5088 we can create a trie of the EXACT's contents and
5089 stitch it in place into the program.
5091 If the sequence represents all of the branches in
5092 the alternation we replace the entire thing with a
5095 Otherwise when it is a subsequence we need to
5096 stitch it in place and replace only the relevant
5097 branches. This means the first branch has to remain
5098 as it is used by the alternation logic, and its
5099 next pointer, and needs to be repointed at the item
5100 on the branch chain following the last branch we
5101 have optimized away.
5103 This could be either a BRANCH, in which case the
5104 subsequence is internal, or it could be the item
5105 following the branch sequence in which case the
5106 subsequence is at the end (which does not
5107 necessarily mean the first node is the start of the
5110 TRIE_TYPE(X) is a define which maps the optype to a
5114 ----------------+-----------
5119 EXACTFU_REQ8 | EXACTFU
5123 EXACTFLU8 | EXACTFLU8
5127 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
5129 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
5131 : ( EXACTFU == (X) \
5132 || EXACTFU_REQ8 == (X) \
5133 || EXACTFUP == (X) ) \
5135 : ( EXACTFAA == (X) ) \
5137 : ( EXACTL == (X) ) \
5139 : ( EXACTFLU8 == (X) ) \
5143 /* dont use tail as the end marker for this traverse */
5144 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
5145 regnode * const noper = REGNODE_AFTER( cur );
5146 U8 noper_type = OP( noper );
5147 U8 noper_trietype = TRIE_TYPE( noper_type );
5148 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
5149 regnode * const noper_next = regnext( noper );
5150 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5151 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
5154 DEBUG_TRIE_COMPILE_r({
5155 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5156 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
5158 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5160 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5161 Perl_re_printf( aTHX_ " -> %d:%s",
5162 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5165 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5166 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5167 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5169 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5170 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5171 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
5175 /* Is noper a trieable nodetype that can be merged
5176 * with the current trie (if there is one)? */
5180 ( noper_trietype == NOTHING )
5181 || ( trietype == NOTHING )
5182 || ( trietype == noper_trietype )
5185 && noper_next >= tail
5189 /* Handle mergable triable node Either we are
5190 * the first node in a new trieable sequence,
5191 * in which case we do some bookkeeping,
5192 * otherwise we update the end pointer. */
5195 if ( noper_trietype == NOTHING ) {
5196 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5197 regnode * const noper_next = regnext( noper );
5198 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5199 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5202 if ( noper_next_trietype ) {
5203 trietype = noper_next_trietype;
5204 } else if (noper_next_type) {
5205 /* a NOTHING regop is 1 regop wide.
5206 * We need at least two for a trie
5207 * so we can't merge this in */
5211 trietype = noper_trietype;
5214 if ( trietype == NOTHING )
5215 trietype = noper_trietype;
5220 } /* end handle mergable triable node */
5222 /* handle unmergable node -
5223 * noper may either be a triable node which can
5224 * not be tried together with the current trie,
5225 * or a non triable node */
5227 /* If last is set and trietype is not
5228 * NOTHING then we have found at least two
5229 * triable branch sequences in a row of a
5230 * similar trietype so we can turn them
5231 * into a trie. If/when we allow NOTHING to
5232 * start a trie sequence this condition
5233 * will be required, and it isn't expensive
5234 * so we leave it in for now. */
5235 if ( trietype && trietype != NOTHING )
5236 make_trie( pRExC_state,
5237 startbranch, first, cur, tail,
5238 count, trietype, depth+1 );
5239 prev = NULL; /* note: we clear/update
5240 first, trietype etc below,
5241 so we dont do it here */
5245 && noper_next >= tail
5248 /* noper is triable, so we can start a new
5252 trietype = noper_trietype;
5254 /* if we already saw a first but the
5255 * current node is not triable then we have
5256 * to reset the first information. */
5261 } /* end handle unmergable node */
5262 } /* loop over branches */
5263 DEBUG_TRIE_COMPILE_r({
5264 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5265 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5266 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5267 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5268 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5269 REGNODE_NAME(trietype)
5273 if ( prev && trietype ) {
5274 if ( trietype != NOTHING ) {
5275 /* the last branch of the sequence was part of
5276 * a trie, so we have to construct it here
5277 * outside of the loop */
5278 made= make_trie( pRExC_state, startbranch,
5279 first, scan, tail, count,
5280 trietype, depth+1 );
5281 #ifdef TRIE_STUDY_OPT
5282 if ( ((made == MADE_EXACT_TRIE &&
5283 startbranch == first)
5284 || ( first_non_open == first )) &&
5286 flags |= SCF_TRIE_RESTUDY;
5287 if ( startbranch == first
5290 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5295 /* at this point we know whatever we have is a
5296 * NOTHING sequence/branch AND if 'startbranch'
5297 * is 'first' then we can turn the whole thing
5300 if ( startbranch == first ) {
5302 /* the entire thing is a NOTHING sequence,
5303 * something like this: (?:|) So we can
5304 * turn it into a plain NOTHING op. */
5305 DEBUG_TRIE_COMPILE_r({
5306 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5307 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5309 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5312 OP(startbranch)= NOTHING;
5313 NEXT_OFF(startbranch)= tail - startbranch;
5314 for ( opt= startbranch + 1; opt < tail ; opt++ )
5318 } /* end if ( prev) */
5319 } /* TRIE_MAXBUF is non zero */
5321 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
5324 scan = REGNODE_AFTER_opcode(scan,code);
5326 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5328 regnode *start = NULL;
5329 regnode *end = NULL;
5330 U32 my_recursed_depth= recursed_depth;
5332 if (OP(scan) != SUSPEND) { /* GOSUB */
5333 /* Do setup, note this code has side effects beyond
5334 * the rest of this block. Specifically setting
5335 * RExC_recurse[] must happen at least once during
5338 RExC_recurse[ARG2L(scan)] = scan;
5339 start = REGNODE_p(RExC_open_parens[paren]);
5340 end = REGNODE_p(RExC_close_parens[paren]);
5342 /* NOTE we MUST always execute the above code, even
5343 * if we do nothing with a GOSUB */
5345 ( flags & SCF_IN_DEFINE )
5348 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5350 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5353 /* no need to do anything here if we are in a define. */
5354 /* or we are after some kind of infinite construct
5355 * so we can skip recursing into this item.
5356 * Since it is infinite we will not change the maxlen
5357 * or delta, and if we miss something that might raise
5358 * the minlen it will merely pessimise a little.
5360 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5361 * might result in a minlen of 1 and not of 4,
5362 * but this doesn't make us mismatch, just try a bit
5363 * harder than we should.
5365 * However we must assume this GOSUB is infinite, to
5366 * avoid wrongly applying other optimizations in the
5367 * enclosing scope - see GH 18096, for example.
5369 is_inf = is_inf_internal = 1;
5370 scan= regnext(scan);
5376 || !PAREN_TEST(recursed_depth - 1, paren)
5378 /* it is quite possible that there are more efficient ways
5379 * to do this. We maintain a bitmap per level of recursion
5380 * of which patterns we have entered so we can detect if a
5381 * pattern creates a possible infinite loop. When we
5382 * recurse down a level we copy the previous levels bitmap
5383 * down. When we are at recursion level 0 we zero the top
5384 * level bitmap. It would be nice to implement a different
5385 * more efficient way of doing this. In particular the top
5386 * level bitmap may be unnecessary.
5388 if (!recursed_depth) {
5389 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5391 Copy(PAREN_OFFSET(recursed_depth - 1),
5392 PAREN_OFFSET(recursed_depth),
5393 RExC_study_chunk_recursed_bytes, U8);
5395 /* we havent recursed into this paren yet, so recurse into it */
5396 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
5397 PAREN_SET(recursed_depth, paren);
5398 my_recursed_depth= recursed_depth + 1;
5400 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
5401 /* some form of infinite recursion, assume infinite length
5403 if (flags & SCF_DO_SUBSTR) {
5404 scan_commit(pRExC_state, data, minlenp, is_inf);
5405 data->cur_is_floating = 1;
5407 is_inf = is_inf_internal = 1;
5408 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5409 ssc_anything(data->start_class);
5410 flags &= ~SCF_DO_STCLASS;
5412 start= NULL; /* reset start so we dont recurse later on. */
5417 end = regnext(scan);
5420 scan_frame *newframe;
5422 if (!RExC_frame_last) {
5423 Newxz(newframe, 1, scan_frame);
5424 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5425 RExC_frame_head= newframe;
5427 } else if (!RExC_frame_last->next_frame) {
5428 Newxz(newframe, 1, scan_frame);
5429 RExC_frame_last->next_frame= newframe;
5430 newframe->prev_frame= RExC_frame_last;
5433 newframe= RExC_frame_last->next_frame;
5435 RExC_frame_last= newframe;
5437 newframe->next_regnode = regnext(scan);
5438 newframe->last_regnode = last;
5439 newframe->stopparen = stopparen;
5440 newframe->prev_recursed_depth = recursed_depth;
5441 newframe->this_prev_frame= frame;
5442 newframe->in_gosub = (
5443 (frame && frame->in_gosub) || OP(scan) == GOSUB
5446 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
5447 DEBUG_PEEP("fnew", scan, depth, flags);
5454 recursed_depth= my_recursed_depth;
5459 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
5460 SSize_t bytelen = STR_LEN(scan), charlen;
5464 const U8 * const s = (U8*)STRING(scan);
5465 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5466 charlen = utf8_length(s, s + bytelen);
5468 uc = *((U8*)STRING(scan));
5472 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5473 /* The code below prefers earlier match for fixed
5474 offset, later match for variable offset. */
5475 if (data->last_end == -1) { /* Update the start info. */
5476 data->last_start_min = data->pos_min;
5477 data->last_start_max =
5478 is_inf ? OPTIMIZE_INFTY
5479 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5480 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5482 sv_catpvn(data->last_found, STRING(scan), bytelen);
5484 SvUTF8_on(data->last_found);
5486 SV * const sv = data->last_found;
5487 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5488 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5489 if (mg && mg->mg_len >= 0)
5490 mg->mg_len += charlen;
5492 data->last_end = data->pos_min + charlen;
5493 data->pos_min += charlen; /* As in the first entry. */
5494 data->flags &= ~SF_BEFORE_EOL;
5497 /* ANDing the code point leaves at most it, and not in locale, and
5498 * can't match null string */
5499 if (flags & SCF_DO_STCLASS_AND) {
5500 ssc_cp_and(data->start_class, uc);
5501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5502 ssc_clear_locale(data->start_class);
5504 else if (flags & SCF_DO_STCLASS_OR) {
5505 ssc_add_cp(data->start_class, uc);
5506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5511 flags &= ~SCF_DO_STCLASS;
5512 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
5514 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
5515 /* But OP != EXACT!, so is EXACTFish */
5516 SSize_t bytelen = STR_LEN(scan), charlen;
5517 const U8 * s = (U8*)STRING(scan);
5519 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5520 * with the mask set to the complement of the bit that differs
5521 * between upper and lower case, and the lowest code point of the
5522 * pair (which the '&' forces) */
5525 && ( OP(scan) == EXACTFAA
5526 || ( OP(scan) == EXACTFU
5527 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5530 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5533 ARG_SET(scan, *s & mask);
5535 /* We're not EXACTFish any more, so restudy.
5536 * Search for "restudy" in this file to find
5537 * a comment with details. */
5541 /* Search for fixed substrings supports EXACT only. */
5542 if (flags & SCF_DO_SUBSTR) {
5544 scan_commit(pRExC_state, data, minlenp, is_inf);
5546 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5547 if (unfolded_multi_char) {
5548 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5550 min += charlen - min_subtract;
5552 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
5553 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
5555 delta += min_subtract;
5557 delta = OPTIMIZE_INFTY;
5559 if (flags & SCF_DO_SUBSTR) {
5560 data->pos_min += charlen - min_subtract;
5561 if (data->pos_min < 0) {
5564 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
5565 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
5567 data->pos_delta += min_subtract;
5569 data->pos_delta = OPTIMIZE_INFTY;
5572 data->cur_is_floating = 1; /* float */
5576 if (flags & SCF_DO_STCLASS) {
5577 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5579 assert(EXACTF_invlist);
5580 if (flags & SCF_DO_STCLASS_AND) {
5581 if (OP(scan) != EXACTFL)
5582 ssc_clear_locale(data->start_class);
5583 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5584 ANYOF_POSIXL_ZERO(data->start_class);
5585 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5587 else { /* SCF_DO_STCLASS_OR */
5588 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5589 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5591 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5592 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5594 flags &= ~SCF_DO_STCLASS;
5595 SvREFCNT_dec(EXACTF_invlist);
5597 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
5599 else if (REGNODE_VARIES(OP(scan))) {
5600 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5603 regnode * const oscan = scan;
5604 regnode_ssc this_class;
5605 regnode_ssc *oclass = NULL;
5606 I32 next_is_eval = 0;
5608 switch (REGNODE_TYPE(OP(scan))) {
5609 case WHILEM: /* End of (?:...)* . */
5610 scan = REGNODE_AFTER(scan);
5613 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5614 next = REGNODE_AFTER(scan);
5615 if ( ( REGNODE_TYPE(OP(next)) == EXACT
5616 && ! isEXACTFish(OP(next)))
5617 || (flags & SCF_DO_STCLASS))
5620 maxcount = REG_INFTY;
5621 next = regnext(scan);
5622 scan = REGNODE_AFTER(scan);
5626 if (flags & SCF_DO_SUBSTR)
5628 /* This will bypass the formal 'min += minnext * mincount'
5629 * calculation in the do_curly path, so assumes min width
5630 * of the PLUS payload is exactly one. */
5634 next = REGNODE_AFTER(scan);
5636 /* This temporary node can now be turned into EXACTFU, and
5637 * must, as regexec.c doesn't handle it */
5638 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5642 if ( STR_LEN(next) == 1
5643 && isALPHA_A(* STRING(next))
5644 && ( OP(next) == EXACTFAA
5645 || ( OP(next) == EXACTFU
5646 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5649 /* These differ in just one bit */
5650 U8 mask = ~ ('A' ^ 'a');
5652 assert(isALPHA_A(* STRING(next)));
5654 /* Then replace it by an ANYOFM node, with
5655 * the mask set to the complement of the
5656 * bit that differs between upper and lower
5657 * case, and the lowest code point of the
5658 * pair (which the '&' forces) */
5660 ARG_SET(next, *STRING(next) & mask);
5664 if (flags & SCF_DO_STCLASS) {
5666 maxcount = REG_INFTY;
5667 next = regnext(scan);
5668 scan = REGNODE_AFTER(scan);
5671 if (flags & SCF_DO_SUBSTR) {
5672 scan_commit(pRExC_state, data, minlenp, is_inf);
5673 /* Cannot extend fixed substrings */
5674 data->cur_is_floating = 1; /* float */
5676 is_inf = is_inf_internal = 1;
5677 scan = regnext(scan);
5678 goto optimize_curly_tail;
5680 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5681 && (scan->flags == stopparen))
5686 mincount = ARG1(scan);
5687 maxcount = ARG2(scan);
5689 next = regnext(scan);
5690 if (OP(scan) == CURLYX) {
5691 I32 lp = (data ? *(data->last_closep) : 0);
5692 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5694 scan = REGNODE_AFTER(scan);
5695 next_is_eval = (OP(scan) == EVAL);
5697 if (flags & SCF_DO_SUBSTR) {
5699 scan_commit(pRExC_state, data, minlenp, is_inf);
5700 /* Cannot extend fixed substrings */
5701 pos_before = data->pos_min;
5705 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5707 data->flags |= SF_IS_INF;
5709 if (flags & SCF_DO_STCLASS) {
5710 ssc_init(pRExC_state, &this_class);
5711 oclass = data->start_class;
5712 data->start_class = &this_class;
5713 f |= SCF_DO_STCLASS_AND;
5714 f &= ~SCF_DO_STCLASS_OR;
5716 /* Exclude from super-linear cache processing any {n,m}
5717 regops for which the combination of input pos and regex
5718 pos is not enough information to determine if a match
5721 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5722 regex pos at the \s*, the prospects for a match depend not
5723 only on the input position but also on how many (bar\s*)
5724 repeats into the {4,8} we are. */
5725 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5726 f &= ~SCF_WHILEM_VISITED_POS;
5728 /* This will finish on WHILEM, setting scan, or on NULL: */
5729 /* recurse study_chunk() on loop bodies */
5730 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5731 last, data, stopparen, recursed_depth, NULL,
5733 ? (f & ~SCF_DO_SUBSTR)
5735 , depth+1, mutate_ok);
5737 if (data && data->flags & SCF_SEEN_ACCEPT) {
5742 if (flags & SCF_DO_STCLASS)
5743 data->start_class = oclass;
5744 if (mincount == 0 || minnext == 0) {
5745 if (flags & SCF_DO_STCLASS_OR) {
5746 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5748 else if (flags & SCF_DO_STCLASS_AND) {
5749 /* Switch to OR mode: cache the old value of
5750 * data->start_class */
5752 StructCopy(data->start_class, and_withp, regnode_ssc);
5753 flags &= ~SCF_DO_STCLASS_AND;
5754 StructCopy(&this_class, data->start_class, regnode_ssc);
5755 flags |= SCF_DO_STCLASS_OR;
5756 ANYOF_FLAGS(data->start_class)
5757 |= SSC_MATCHES_EMPTY_STRING;
5759 } else { /* Non-zero len */
5760 if (flags & SCF_DO_STCLASS_OR) {
5761 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5762 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5764 else if (flags & SCF_DO_STCLASS_AND)
5765 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5766 flags &= ~SCF_DO_STCLASS;
5768 if (!scan) /* It was not CURLYX, but CURLY. */
5770 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5771 /* ? quantifier ok, except for (?{ ... }) */
5772 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5773 && (minnext == 0) && (deltanext == 0)
5774 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5775 && maxcount <= REG_INFTY/3) /* Complement check for big
5778 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5779 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5780 "Quantifier unexpected on zero-length expression "
5781 "in regex m/%" UTF8f "/",
5782 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5786 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5787 || min >= SSize_t_MAX - minnext * mincount )
5789 FAIL("Regexp out of space");
5792 min += minnext * mincount;
5793 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5794 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5795 is_inf |= is_inf_internal;
5797 delta = OPTIMIZE_INFTY;
5799 delta += (minnext + deltanext) * maxcount
5800 - minnext * mincount;
5803 if (data && data->flags & SCF_SEEN_ACCEPT) {
5804 if (flags & SCF_DO_SUBSTR) {
5805 scan_commit(pRExC_state, data, minlenp, is_inf);
5806 flags &= ~SCF_DO_SUBSTR;
5810 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
5812 /* Try powerful optimization CURLYX => CURLYN. */
5813 if ( OP(oscan) == CURLYX && data
5814 && data->flags & SF_IN_PAR
5815 && !(data->flags & SF_HAS_EVAL)
5816 && !deltanext && minnext == 1
5819 /* Try to optimize to CURLYN. */
5820 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
5821 regnode * const nxt1 = nxt;
5828 if (!REGNODE_SIMPLE(OP(nxt))
5829 && !(REGNODE_TYPE(OP(nxt)) == EXACT
5830 && STR_LEN(nxt) == 1))
5836 if (OP(nxt) != CLOSE)
5838 if (RExC_open_parens) {
5841 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5844 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5846 /* Now we know that nxt2 is the only contents: */
5847 oscan->flags = (U8)ARG(nxt);
5849 OP(nxt1) = NOTHING; /* was OPEN. */
5852 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5853 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5854 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5855 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5856 OP(nxt + 1) = OPTIMIZED; /* was count. */
5857 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5862 /* Try optimization CURLYX => CURLYM. */
5863 if ( OP(oscan) == CURLYX && data
5864 && !(data->flags & SF_HAS_PAR)
5865 && !(data->flags & SF_HAS_EVAL)
5866 && !deltanext /* atom is fixed width */
5867 && minnext != 0 /* CURLYM can't handle zero width */
5868 /* Nor characters whose fold at run-time may be
5869 * multi-character */
5870 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5873 /* XXXX How to optimize if data == 0? */
5874 /* Optimize to a simpler form. */
5875 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
5879 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5880 && (OP(nxt2) != WHILEM))
5882 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5883 /* Need to optimize away parenths. */
5884 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5885 /* Set the parenth number. */
5886 /* note that we have changed the type of oscan to CURLYM here */
5887 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
5889 oscan->flags = (U8)ARG(nxt);
5890 if (RExC_open_parens) {
5892 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5895 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5898 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5899 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5902 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5903 OP(nxt + 1) = OPTIMIZED; /* was count. */
5904 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5905 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5908 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5909 regnode *nnxt = regnext(nxt1);
5911 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
5912 ARG_SET(nxt1, nxt2 - nxt1);
5913 else if (nxt2 - nxt1 < U16_MAX)
5914 NEXT_OFF(nxt1) = nxt2 - nxt1;
5916 OP(nxt) = NOTHING; /* Cannot beautify */
5921 /* Optimize again: */
5922 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5924 NULL, stopparen, recursed_depth, NULL, 0,
5925 depth+1, mutate_ok);
5930 else if ((OP(oscan) == CURLYX)
5931 && (flags & SCF_WHILEM_VISITED_POS)
5932 /* See the comment on a similar expression above.
5933 However, this time it's not a subexpression
5934 we care about, but the expression itself. */
5935 && (maxcount == REG_INFTY)
5937 /* This stays as CURLYX, we can put the count/of pair. */
5938 /* Find WHILEM (as in regexec.c) */
5939 regnode *nxt = oscan + NEXT_OFF(oscan);
5941 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
5943 nxt = REGNODE_BEFORE(nxt);
5944 if (nxt->flags & 0xf) {
5945 /* we've already set whilem count on this node */
5946 } else if (++data->whilem_c < 16) {
5947 assert(data->whilem_c <= RExC_whilem_seen);
5948 nxt->flags = (U8)(data->whilem_c
5949 | (RExC_whilem_seen << 4)); /* On WHILEM */
5952 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5954 if (flags & SCF_DO_SUBSTR) {
5955 SV *last_str = NULL;
5956 STRLEN last_chrs = 0;
5957 int counted = mincount != 0;
5959 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5961 SSize_t b = pos_before >= data->last_start_min
5962 ? pos_before : data->last_start_min;
5964 const char * const s = SvPV_const(data->last_found, l);
5965 SSize_t old = b - data->last_start_min;
5969 old = utf8_hop_forward((U8*)s, old,
5970 (U8 *) SvEND(data->last_found))
5973 /* Get the added string: */
5974 last_str = newSVpvn_utf8(s + old, l, UTF);
5975 last_chrs = UTF ? utf8_length((U8*)(s + old),
5976 (U8*)(s + old + l)) : l;
5977 if (deltanext == 0 && pos_before == b) {
5978 /* What was added is a constant string */
5981 SvGROW(last_str, (mincount * l) + 1);
5982 repeatcpy(SvPVX(last_str) + l,
5983 SvPVX_const(last_str), l,
5985 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5986 /* Add additional parts. */
5987 SvCUR_set(data->last_found,
5988 SvCUR(data->last_found) - l);
5989 sv_catsv(data->last_found, last_str);
5991 SV * sv = data->last_found;
5993 SvUTF8(sv) && SvMAGICAL(sv) ?
5994 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5995 if (mg && mg->mg_len >= 0)
5996 mg->mg_len += last_chrs * (mincount-1);
5998 last_chrs *= mincount;
5999 data->last_end += l * (mincount - 1);
6002 /* start offset must point into the last copy */
6003 data->last_start_min += minnext * (mincount - 1);
6004 data->last_start_max =
6007 : data->last_start_max +
6008 (maxcount - 1) * (minnext + data->pos_delta);
6011 /* It is counted once already... */
6012 data->pos_min += minnext * (mincount - counted);
6014 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
6015 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
6016 " maxcount=%" UVuf " mincount=%" UVuf
6017 " data->pos_delta=%" UVuf "\n",
6018 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
6019 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
6020 if (deltanext != OPTIMIZE_INFTY)
6021 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
6022 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
6023 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
6025 if (deltanext == OPTIMIZE_INFTY
6026 || data->pos_delta == OPTIMIZE_INFTY
6027 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
6028 data->pos_delta = OPTIMIZE_INFTY;
6030 data->pos_delta += - counted * deltanext +
6031 (minnext + deltanext) * maxcount - minnext * mincount;
6032 if (mincount != maxcount) {
6033 /* Cannot extend fixed substrings found inside
6035 scan_commit(pRExC_state, data, minlenp, is_inf);
6036 if (mincount && last_str) {
6037 SV * const sv = data->last_found;
6038 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
6039 mg_find(sv, PERL_MAGIC_utf8) : NULL;
6043 sv_setsv(sv, last_str);
6044 data->last_end = data->pos_min;
6045 data->last_start_min = data->pos_min - last_chrs;
6046 data->last_start_max = is_inf
6048 : data->pos_min + data->pos_delta - last_chrs;
6050 data->cur_is_floating = 1; /* float */
6052 SvREFCNT_dec(last_str);
6054 if (data && (fl & SF_HAS_EVAL))
6055 data->flags |= SF_HAS_EVAL;
6056 optimize_curly_tail:
6057 rck_elide_nothing(oscan);
6061 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
6065 if (flags & SCF_DO_SUBSTR) {
6066 /* Cannot expect anything... */
6067 scan_commit(pRExC_state, data, minlenp, is_inf);
6068 data->cur_is_floating = 1; /* float */
6070 is_inf = is_inf_internal = 1;
6071 if (flags & SCF_DO_STCLASS_OR) {
6072 if (OP(scan) == CLUMP) {
6073 /* Actually is any start char, but very few code points
6074 * aren't start characters */
6075 ssc_match_all_cp(data->start_class);
6078 ssc_anything(data->start_class);
6081 flags &= ~SCF_DO_STCLASS;
6085 else if (OP(scan) == LNBREAK) {
6086 if (flags & SCF_DO_STCLASS) {
6087 if (flags & SCF_DO_STCLASS_AND) {
6088 ssc_intersection(data->start_class,
6089 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
6090 ssc_clear_locale(data->start_class);
6091 ANYOF_FLAGS(data->start_class)
6092 &= ~SSC_MATCHES_EMPTY_STRING;
6094 else if (flags & SCF_DO_STCLASS_OR) {
6095 ssc_union(data->start_class,
6096 PL_XPosix_ptrs[CC_VERTSPACE_],
6098 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6100 /* See commit msg for
6101 * 749e076fceedeb708a624933726e7989f2302f6a */
6102 ANYOF_FLAGS(data->start_class)
6103 &= ~SSC_MATCHES_EMPTY_STRING;
6105 flags &= ~SCF_DO_STCLASS;
6108 if (delta != OPTIMIZE_INFTY)
6109 delta++; /* Because of the 2 char string cr-lf */
6110 if (flags & SCF_DO_SUBSTR) {
6111 /* Cannot expect anything... */
6112 scan_commit(pRExC_state, data, minlenp, is_inf);
6114 if (data->pos_delta != OPTIMIZE_INFTY) {
6115 data->pos_delta += 1;
6117 data->cur_is_floating = 1; /* float */
6120 else if (REGNODE_SIMPLE(OP(scan))) {
6122 if (flags & SCF_DO_SUBSTR) {
6123 scan_commit(pRExC_state, data, minlenp, is_inf);
6127 if (flags & SCF_DO_STCLASS) {
6129 SV* my_invlist = NULL;
6132 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
6133 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
6135 /* Some of the logic below assumes that switching
6136 locale on will only add false positives. */
6141 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
6145 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6146 ssc_match_all_cp(data->start_class);
6151 SV* REG_ANY_invlist = _new_invlist(2);
6152 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
6154 if (flags & SCF_DO_STCLASS_OR) {
6155 ssc_union(data->start_class,
6157 TRUE /* TRUE => invert, hence all but \n
6161 else if (flags & SCF_DO_STCLASS_AND) {
6162 ssc_intersection(data->start_class,
6164 TRUE /* TRUE => invert */
6166 ssc_clear_locale(data->start_class);
6168 SvREFCNT_dec_NN(REG_ANY_invlist);
6180 if (flags & SCF_DO_STCLASS_AND)
6181 ssc_and(pRExC_state, data->start_class,
6182 (regnode_charclass *) scan);
6184 ssc_or(pRExC_state, data->start_class,
6185 (regnode_charclass *) scan);
6190 SV* cp_list = get_ANYOFHbbm_contents(scan);
6192 if (flags & SCF_DO_STCLASS_OR) {
6193 ssc_union(data->start_class, cp_list, invert);
6195 else if (flags & SCF_DO_STCLASS_AND) {
6196 ssc_intersection(data->start_class, cp_list, invert);
6199 SvREFCNT_dec_NN(cp_list);
6203 case NANYOFM: /* NANYOFM already contains the inversion of the
6204 input ANYOF data, so, unlike things like
6205 NPOSIXA, don't change 'invert' to TRUE */
6209 SV* cp_list = get_ANYOFM_contents(scan);
6211 if (flags & SCF_DO_STCLASS_OR) {
6212 ssc_union(data->start_class, cp_list, invert);
6214 else if (flags & SCF_DO_STCLASS_AND) {
6215 ssc_intersection(data->start_class, cp_list, invert);
6218 SvREFCNT_dec_NN(cp_list);
6227 cp_list = _add_range_to_invlist(cp_list,
6229 ANYOFRbase(scan) + ANYOFRdelta(scan));
6231 if (flags & SCF_DO_STCLASS_OR) {
6232 ssc_union(data->start_class, cp_list, invert);
6234 else if (flags & SCF_DO_STCLASS_AND) {
6235 ssc_intersection(data->start_class, cp_list, invert);
6238 SvREFCNT_dec_NN(cp_list);
6247 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6248 if (flags & SCF_DO_STCLASS_AND) {
6249 bool was_there = cBOOL(
6250 ANYOF_POSIXL_TEST(data->start_class,
6252 ANYOF_POSIXL_ZERO(data->start_class);
6253 if (was_there) { /* Do an AND */
6254 ANYOF_POSIXL_SET(data->start_class, namedclass);
6256 /* No individual code points can now match */
6257 data->start_class->invlist
6258 = sv_2mortal(_new_invlist(0));
6261 int complement = namedclass + ((invert) ? -1 : 1);
6263 assert(flags & SCF_DO_STCLASS_OR);
6265 /* If the complement of this class was already there,
6266 * the result is that they match all code points,
6267 * (\d + \D == everything). Remove the classes from
6268 * future consideration. Locale is not relevant in
6270 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6271 ssc_match_all_cp(data->start_class);
6272 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6273 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6275 else { /* The usual case; just add this class to the
6277 ANYOF_POSIXL_SET(data->start_class, namedclass);
6282 case NPOSIXA: /* For these, we always know the exact set of
6287 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6288 goto join_posix_and_ascii;
6296 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6298 /* NPOSIXD matches all upper Latin1 code points unless the
6299 * target string being matched is UTF-8, which is
6300 * unknowable until match time. Since we are going to
6301 * invert, we want to get rid of all of them so that the
6302 * inversion will match all */
6303 if (OP(scan) == NPOSIXD) {
6304 _invlist_subtract(my_invlist, PL_UpperLatin1,
6308 join_posix_and_ascii:
6310 if (flags & SCF_DO_STCLASS_AND) {
6311 ssc_intersection(data->start_class, my_invlist, invert);
6312 ssc_clear_locale(data->start_class);
6315 assert(flags & SCF_DO_STCLASS_OR);
6316 ssc_union(data->start_class, my_invlist, invert);
6318 SvREFCNT_dec(my_invlist);
6320 if (flags & SCF_DO_STCLASS_OR)
6321 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6322 flags &= ~SCF_DO_STCLASS;
6325 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
6326 data->flags |= (OP(scan) == MEOL
6329 scan_commit(pRExC_state, data, minlenp, is_inf);
6332 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
6333 /* Lookbehind, or need to calculate parens/evals/stclass: */
6334 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6335 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6337 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6338 || OP(scan) == UNLESSM )
6340 /* Negative Lookahead/lookbehind
6341 In this case we can't do fixed string optimisation.
6344 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
6345 SSize_t deltanext, minnext;
6346 SSize_t fake_last_close = 0;
6347 regnode *fake_last_close_op = NULL;
6348 regnode *cur_last_close_op;
6351 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6353 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6355 data_fake.whilem_c = data->whilem_c;
6356 data_fake.last_closep = data->last_closep;
6357 data_fake.last_close_opp = data->last_close_opp;
6360 data_fake.last_closep = &fake_last_close;
6361 data_fake.last_close_opp = &fake_last_close_op;
6364 /* remember the last_close_op we saw so we can see if
6365 * we are dealing with variable length lookbehind that
6366 * contains capturing buffers, which are considered
6368 cur_last_close_op= *(data_fake.last_close_opp);
6370 data_fake.pos_delta = delta;
6371 if ( flags & SCF_DO_STCLASS && !scan->flags
6372 && OP(scan) == IFMATCH ) { /* Lookahead */
6373 ssc_init(pRExC_state, &intrnl);
6374 data_fake.start_class = &intrnl;
6375 f |= SCF_DO_STCLASS_AND;
6377 if (flags & SCF_WHILEM_VISITED_POS)
6378 f |= SCF_WHILEM_VISITED_POS;
6379 next = regnext(scan);
6380 nscan = REGNODE_AFTER(scan);
6382 /* recurse study_chunk() for lookahead body */
6383 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6384 last, &data_fake, stopparen,
6385 recursed_depth, NULL, f, depth+1,
6390 || deltanext > (I32) U8_MAX
6391 || minnext > (I32)U8_MAX
6392 || minnext + deltanext > (I32)U8_MAX)
6394 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6398 /* The 'next_off' field has been repurposed to count the
6399 * additional starting positions to try beyond the initial
6400 * one. (This leaves it at 0 for non-variable length
6401 * matches to avoid breakage for those not using this
6404 scan->next_off = deltanext;
6406 /* See a CLOSE op inside this lookbehind? */
6407 cur_last_close_op != *(data_fake.last_close_opp)
6408 /* and not doing restudy. see: restudied */
6409 && !(flags & SCF_TRIE_DOING_RESTUDY)
6411 /* this is positive variable length lookbehind with
6412 * capture buffers inside of it */
6413 ckWARNexperimental_with_arg(RExC_parse,
6414 WARN_EXPERIMENTAL__VLB,
6415 "Variable length %s lookbehind with capturing is experimental",
6416 is_positive ? "positive" : "negative");
6419 scan->flags = (U8)minnext + deltanext;
6422 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6424 if (data_fake.flags & SF_HAS_EVAL)
6425 data->flags |= SF_HAS_EVAL;
6426 data->whilem_c = data_fake.whilem_c;
6428 if (f & SCF_DO_STCLASS_AND) {
6429 if (flags & SCF_DO_STCLASS_OR) {
6430 /* OR before, AND after: ideally we would recurse with
6431 * data_fake to get the AND applied by study of the
6432 * remainder of the pattern, and then derecurse;
6433 * *** HACK *** for now just treat as "no information".
6434 * See [perl #56690].
6436 ssc_init(pRExC_state, data->start_class);
6438 /* AND before and after: combine and continue. These
6439 * assertions are zero-length, so can match an EMPTY
6441 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6442 ANYOF_FLAGS(data->start_class)
6443 |= SSC_MATCHES_EMPTY_STRING;
6446 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
6448 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6450 /* Positive Lookahead/lookbehind
6451 In this case we can do fixed string optimisation,
6452 but we must be careful about it. Note in the case of
6453 lookbehind the positions will be offset by the minimum
6454 length of the pattern, something we won't know about
6455 until after the recurse.
6457 SSize_t deltanext, fake_last_close = 0;
6458 regnode *last_close_op = NULL;
6461 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6462 /* We use SAVEFREEPV so that when the full compile
6463 is finished perl will clean up the allocated
6464 minlens when it's all done. This way we don't
6465 have to worry about freeing them when we know
6466 they wont be used, which would be a pain.
6469 Newx( minnextp, 1, SSize_t );
6470 SAVEFREEPV(minnextp);
6473 StructCopy(data, &data_fake, scan_data_t);
6474 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6477 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6478 data_fake.last_found=newSVsv(data->last_found);
6482 data_fake.last_closep = &fake_last_close;
6483 data_fake.last_close_opp = &fake_last_close_opp;
6485 data_fake.flags = 0;
6486 data_fake.substrs[0].flags = 0;
6487 data_fake.substrs[1].flags = 0;
6488 data_fake.pos_delta = delta;
6490 data_fake.flags |= SF_IS_INF;
6491 if ( flags & SCF_DO_STCLASS && !scan->flags
6492 && OP(scan) == IFMATCH ) { /* Lookahead */
6493 ssc_init(pRExC_state, &intrnl);
6494 data_fake.start_class = &intrnl;
6495 f |= SCF_DO_STCLASS_AND;
6497 if (flags & SCF_WHILEM_VISITED_POS)
6498 f |= SCF_WHILEM_VISITED_POS;
6499 next = regnext(scan);
6500 nscan = REGNODE_AFTER(scan);
6502 /* positive lookahead study_chunk() recursion */
6503 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6504 &deltanext, last, &data_fake,
6505 stopparen, recursed_depth, NULL,
6506 f, depth+1, mutate_ok);
6508 assert(0); /* This code has never been tested since this
6509 is normally not compiled */
6511 || deltanext > (I32) U8_MAX
6512 || *minnextp > (I32)U8_MAX
6513 || *minnextp + deltanext > (I32)U8_MAX)
6515 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6520 scan->next_off = deltanext;
6522 scan->flags = (U8)*minnextp + deltanext;
6527 if (f & SCF_DO_STCLASS_AND) {
6528 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6529 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6532 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6534 if (data_fake.flags & SF_HAS_EVAL)
6535 data->flags |= SF_HAS_EVAL;
6536 data->whilem_c = data_fake.whilem_c;
6537 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6539 if (RExC_rx->minlen < *minnextp)
6540 RExC_rx->minlen = *minnextp;
6541 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6542 SvREFCNT_dec_NN(data_fake.last_found);
6544 for (i = 0; i < 2; i++) {
6545 if (data_fake.substrs[i].minlenp != minlenp) {
6546 data->substrs[i].min_offset =
6547 data_fake.substrs[i].min_offset;
6548 data->substrs[i].max_offset =
6549 data_fake.substrs[i].max_offset;
6550 data->substrs[i].minlenp =
6551 data_fake.substrs[i].minlenp;
6552 data->substrs[i].lookbehind += scan->flags;
6560 else if (OP(scan) == OPEN) {
6561 if (stopparen != (I32)ARG(scan))
6564 else if (OP(scan) == CLOSE) {
6565 if (stopparen == (I32)ARG(scan)) {
6568 if ((I32)ARG(scan) == is_par) {
6569 next = regnext(scan);
6571 if ( next && (OP(next) != WHILEM) && next < last)
6572 is_par = 0; /* Disable optimization */
6575 *(data->last_closep) = ARG(scan);
6576 *(data->last_close_opp) = scan;
6579 else if (OP(scan) == EVAL) {
6581 data->flags |= SF_HAS_EVAL;
6583 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
6584 if (flags & SCF_DO_SUBSTR) {
6585 scan_commit(pRExC_state, data, minlenp, is_inf);
6586 flags &= ~SCF_DO_SUBSTR;
6588 if (OP(scan)==ACCEPT) {
6589 /* m{(*ACCEPT)x} does not have to start with 'x' */
6590 flags &= ~SCF_DO_STCLASS;
6592 data->flags |= SCF_SEEN_ACCEPT;
6597 else if (OP(scan) == COMMIT) {
6598 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
6599 * must not end up with "abcxyz" as a fixed substring else we'll
6600 * skip straight to attempting to match at offset 4.
6602 if (flags & SCF_DO_SUBSTR) {
6603 scan_commit(pRExC_state, data, minlenp, is_inf);
6604 flags &= ~SCF_DO_SUBSTR;
6607 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6609 if (flags & SCF_DO_SUBSTR) {
6610 scan_commit(pRExC_state, data, minlenp, is_inf);
6611 data->cur_is_floating = 1; /* float */
6613 is_inf = is_inf_internal = 1;
6614 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6615 ssc_anything(data->start_class);
6616 flags &= ~SCF_DO_STCLASS;
6618 else if (OP(scan) == GPOS) {
6619 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6620 !(delta || is_inf || (data && data->pos_delta)))
6622 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6623 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6624 if (RExC_rx->gofs < (STRLEN)min)
6625 RExC_rx->gofs = min;
6627 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6631 #ifdef TRIE_STUDY_OPT
6632 #ifdef FULL_TRIE_STUDY
6633 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
6634 /* NOTE - There is similar code to this block above for handling
6635 BRANCH nodes on the initial study. If you change stuff here
6637 regnode *trie_node= scan;
6638 regnode *tail= regnext(scan);
6639 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6640 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6643 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6644 /* Cannot merge strings after this. */
6645 scan_commit(pRExC_state, data, minlenp, is_inf);
6647 if (flags & SCF_DO_STCLASS)
6648 ssc_init_zero(pRExC_state, &accum);
6654 const regnode *nextbranch= NULL;
6657 for ( word=1 ; word <= trie->wordcount ; word++)
6659 SSize_t deltanext = 0, minnext = 0;
6660 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6661 SSize_t fake_last_close = 0;
6662 regnode *fake_last_close_op = NULL;
6663 regnode_ssc this_class;
6665 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6667 data_fake.whilem_c = data->whilem_c;
6668 data_fake.last_closep = data->last_closep;
6669 data_fake.last_close_opp = data->last_close_opp;
6672 data_fake.last_closep = &fake_last_close;
6673 data_fake.last_close_opp = &fake_last_close_op;
6675 data_fake.pos_delta = delta;
6676 if (flags & SCF_DO_STCLASS) {
6677 ssc_init(pRExC_state, &this_class);
6678 data_fake.start_class = &this_class;
6679 f |= SCF_DO_STCLASS_AND;
6681 if (flags & SCF_WHILEM_VISITED_POS)
6682 f |= SCF_WHILEM_VISITED_POS;
6684 if (trie->jump[word]) {
6686 nextbranch = trie_node + trie->jump[0];
6687 scan= trie_node + trie->jump[word];
6688 /* We go from the jump point to the branch that follows
6689 it. Note this means we need the vestigal unused
6690 branches even though they arent otherwise used. */
6691 /* optimise study_chunk() for TRIE */
6692 minnext = study_chunk(pRExC_state, &scan, minlenp,
6693 &deltanext, (regnode *)nextbranch, &data_fake,
6694 stopparen, recursed_depth, NULL, f, depth+1,
6697 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
6698 nextbranch= regnext((regnode*)nextbranch);
6700 if (min1 > (SSize_t)(minnext + trie->minlen))
6701 min1 = minnext + trie->minlen;
6702 if (deltanext == OPTIMIZE_INFTY) {
6703 is_inf = is_inf_internal = 1;
6704 max1 = OPTIMIZE_INFTY;
6705 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6706 max1 = minnext + deltanext + trie->maxlen;
6708 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6710 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6711 if ( stopmin > min + min1)
6712 stopmin = min + min1;
6713 flags &= ~SCF_DO_SUBSTR;
6715 data->flags |= SCF_SEEN_ACCEPT;
6718 if (data_fake.flags & SF_HAS_EVAL)
6719 data->flags |= SF_HAS_EVAL;
6720 data->whilem_c = data_fake.whilem_c;
6722 if (flags & SCF_DO_STCLASS)
6723 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6725 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
6727 if (flags & SCF_DO_SUBSTR) {
6728 data->pos_min += min1;
6729 data->pos_delta += max1 - min1;
6730 if (max1 != min1 || is_inf)
6731 data->cur_is_floating = 1; /* float */
6734 if (delta != OPTIMIZE_INFTY) {
6735 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6736 delta += max1 - min1;
6738 delta = OPTIMIZE_INFTY;
6740 if (flags & SCF_DO_STCLASS_OR) {
6741 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6743 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6744 flags &= ~SCF_DO_STCLASS;
6747 else if (flags & SCF_DO_STCLASS_AND) {
6749 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6750 flags &= ~SCF_DO_STCLASS;
6753 /* Switch to OR mode: cache the old value of
6754 * data->start_class */
6756 StructCopy(data->start_class, and_withp, regnode_ssc);
6757 flags &= ~SCF_DO_STCLASS_AND;
6758 StructCopy(&accum, data->start_class, regnode_ssc);
6759 flags |= SCF_DO_STCLASS_OR;
6763 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
6767 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
6768 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6771 min += trie->minlen;
6772 delta += (trie->maxlen - trie->minlen);
6773 flags &= ~SCF_DO_STCLASS; /* xxx */
6774 if (flags & SCF_DO_SUBSTR) {
6775 /* Cannot expect anything... */
6776 scan_commit(pRExC_state, data, minlenp, is_inf);
6777 data->pos_min += trie->minlen;
6778 data->pos_delta += (trie->maxlen - trie->minlen);
6779 if (trie->maxlen != trie->minlen)
6780 data->cur_is_floating = 1; /* float */
6782 if (trie->jump) /* no more substrings -- for now /grr*/
6783 flags &= ~SCF_DO_SUBSTR;
6786 #endif /* old or new */
6787 #endif /* TRIE_STUDY_OPT */
6789 else if (OP(scan) == REGEX_SET) {
6790 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6791 " before optimization", REGNODE_NAME(REGEX_SET));
6794 /* Else: zero-length, ignore. */
6795 scan = regnext(scan);
6800 /* we need to unwind recursion. */
6803 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
6804 DEBUG_PEEP("fend", scan, depth, flags);
6806 /* restore previous context */
6807 last = frame->last_regnode;
6808 scan = frame->next_regnode;
6809 stopparen = frame->stopparen;
6810 recursed_depth = frame->prev_recursed_depth;
6812 RExC_frame_last = frame->prev_frame;
6813 frame = frame->this_prev_frame;
6814 goto fake_study_recurse;
6818 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
6820 /* is this pattern infinite? Eg, consider /(a|b+)/ */
6821 if (is_inf_internal)
6822 delta = OPTIMIZE_INFTY;
6824 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
6825 if (min > stopmin) {
6827 At this point 'min' represents the minimum length string we can
6828 match while *ignoring* the implication of ACCEPT, and 'delta'
6829 represents the difference between the minimum length and maximum
6830 length, and if the pattern matches an infinitely long string
6831 (consider the + and * quantifiers) then we use the special delta
6832 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
6833 minimum length that can be matched *and* accepted.
6835 A pattern is accepted when matching was successful *and*
6836 complete, and thus there is no further matching needing to be
6837 done, no backtracking to occur, etc. Prior to the introduction
6838 of ACCEPT the only opcode that signaled acceptance was the END
6839 opcode, which is always the very last opcode in a regex program.
6840 ACCEPT is thus conceptually an early successful return out of
6841 the matching process. stopmin starts out as OPTIMIZE_INFTY to
6842 represent "the entire pattern", and is ratched down to the
6843 "current min" if necessary when an ACCEPT opcode is encountered.
6845 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
6846 and we now need to account for it in both min and delta.
6847 Consider that in a pattern /AB/ normally the min length it can
6848 match can be computed as min(A)+min(B). But (*ACCEPT) means
6849 that it might be something else, not even neccesarily min(A) at
6852 A = /(foo(*ACCEPT)|x+)/
6854 AB = /(foo(*ACCEPT)|x+)whop/
6856 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
6857 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
6858 "whop", and the delta of 0 as the pattern is of fixed length, the
6859 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
6860 When handling AB we expect to see a min of 5 for "xwhop", and a
6861 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
6862 for "foo". This should result in a final min of 3 for "foo", and
6863 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
6865 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
6866 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
6867 stop min would be 4 for "dude". This should result in a final
6868 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
6870 When min is smaller than stopmin then we can ignore it. In the
6871 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
6872 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
6873 the ACCEPT doesn't reduce the minimum length of the string that
6874 might be matched, nor affect the maximum length.
6876 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
6877 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
6878 "foo". We currently turn this into a min of 3 for "foo" and a
6879 delta of 3 for "foobar" even though technically "foobar" isn't
6880 possible. ACCEPT affects some aspects of the optimizer, like
6881 length computations and mandatory substring optimizations, but
6882 there are other optimzations this routine perfoms that are not
6883 affected and this compromise simplifies implementation.
6885 It might be helpful to consider that this C function is called
6886 recursively on the pattern in a bottom up fashion, and that the
6887 min returned by a nested call may be marked as coming from an
6888 ACCEPT, causing its callers to treat the returned min as a
6889 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
6890 multiple calls into this function in different ways.
6893 if (OPTIMIZE_INFTY - delta >= min - stopmin)
6894 delta += min - stopmin;
6896 delta = OPTIMIZE_INFTY;
6903 if (flags & SCF_DO_SUBSTR && is_inf)
6904 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6905 if (is_par > (I32)U8_MAX)
6907 if (is_par && pars==1 && data) {
6908 data->flags |= SF_IN_PAR;
6909 data->flags &= ~SF_HAS_PAR;
6911 else if (pars && data) {
6912 data->flags |= SF_HAS_PAR;
6913 data->flags &= ~SF_IN_PAR;
6915 if (flags & SCF_DO_STCLASS_OR)
6916 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6917 if (flags & SCF_TRIE_RESTUDY)
6918 data->flags |= SCF_TRIE_RESTUDY;
6921 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6922 if (min > OPTIMIZE_INFTY - delta)
6923 RExC_maxlen = OPTIMIZE_INFTY;
6924 else if (RExC_maxlen < min + delta)
6925 RExC_maxlen = min + delta;
6927 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
6931 /* add a data member to the struct reg_data attached to this regex, it should
6932 * always return a non-zero return. the 's' argument is the type of the items
6933 * being added and the n is the number of items. The length of 's' should match
6934 * the number of items. */
6936 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6938 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
6940 PERL_ARGS_ASSERT_ADD_DATA;
6942 /* in the below expression we have (count + n - 1), the minus one is there
6943 * because the struct that we allocate already contains a slot for 1 data
6944 * item, so we do not need to allocate it the first time. IOW, the
6945 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
6946 * to allocate. See struct reg_data in regcomp.h
6948 Renewc(RExC_rxi->data,
6949 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
6950 char, struct reg_data);
6951 /* however in the data->what expression we use (count + n) and do not
6952 * subtract one from the result because the data structure contains a
6953 * pointer to an array, and does not allocate the first element as part of
6954 * the data struct. */
6956 Renew(RExC_rxi->data->what, (count + n), U8);
6958 /* when count == 1 it means we have not initialized anything.
6959 * we always fill the 0 slot of the data array with a '%' entry, which
6960 * means "zero" (all the other types are letters) which exists purely
6961 * so the return from add_data is ALWAYS true, so we can tell it apart
6962 * from a "no value" idx=0 in places where we would return an index
6963 * into add_data. This is particularly important with the new "single
6964 * pass, usually, but not always" strategy that we use, where the code
6965 * will use a 0 to represent "not able to compute this yet".
6967 Newx(RExC_rxi->data->what, n+1, U8);
6968 /* fill in the placeholder slot of 0 with a what of '%', we use
6969 * this because it sorta looks like a zero (0/0) and it is not a letter
6970 * like any of the other "whats", this type should never be created
6971 * any other way but here. '%' happens to also not appear in this
6972 * file for any other reason (at the time of writing this comment)*/
6973 RExC_rxi->data->what[0]= '%';
6974 RExC_rxi->data->data[0]= NULL;
6976 RExC_rxi->data->count = count + n;
6977 Copy(s, RExC_rxi->data->what + count, n, U8);
6982 /*XXX: todo make this not included in a non debugging perl, but appears to be
6983 * used anyway there, in 'use re' */
6984 #ifndef PERL_IN_XSUB_RE
6986 Perl_reginitcolors(pTHX)
6988 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6990 char *t = savepv(s);
6994 t = strchr(t, '\t');
7000 PL_colors[i] = t = (char *)"";
7005 PL_colors[i++] = (char *)"";
7012 #ifdef TRIE_STUDY_OPT
7013 /* search for "restudy" in this file for a detailed explanation */
7014 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
7017 (data.flags & SCF_TRIE_RESTUDY) \
7025 #define CHECK_RESTUDY_GOTO_butfirst
7029 * pregcomp - compile a regular expression into internal code
7031 * Decides which engine's compiler to call based on the hint currently in
7035 #ifndef PERL_IN_XSUB_RE
7037 /* return the currently in-scope regex engine (or the default if none) */
7039 regexp_engine const *
7040 Perl_current_re_engine(pTHX)
7042 if (IN_PERL_COMPILETIME) {
7043 HV * const table = GvHV(PL_hintgv);
7046 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
7047 return &PL_core_reg_engine;
7048 ptr = hv_fetchs(table, "regcomp", FALSE);
7049 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
7050 return &PL_core_reg_engine;
7051 return INT2PTR(regexp_engine*, SvIV(*ptr));
7055 if (!PL_curcop->cop_hints_hash)
7056 return &PL_core_reg_engine;
7057 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
7058 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
7059 return &PL_core_reg_engine;
7060 return INT2PTR(regexp_engine*, SvIV(ptr));
7066 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
7068 regexp_engine const *eng = current_re_engine();
7069 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7071 PERL_ARGS_ASSERT_PREGCOMP;
7073 /* Dispatch a request to compile a regexp to correct regexp engine. */
7075 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
7078 return CALLREGCOMP_ENG(eng, pattern, flags);
7083 =for apidoc re_compile
7085 Compile the regular expression pattern C<pattern>, returning a pointer to the
7086 compiled object for later matching with the internal regex engine.
7088 This function is typically used by a custom regexp engine C<.comp()> function
7089 to hand off to the core regexp engine those patterns it doesn't want to handle
7090 itself (typically passing through the same flags it was called with). In
7091 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
7092 to compile using the currently active regexp engine.
7094 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
7095 pointer to the input. Otherwise the PV is extracted and treated like a string
7096 representing a pattern. See L<perlre>.
7098 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
7099 all begin with C<RXf_>.
7103 * public entry point for the perl core's own regex compiling code.
7104 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
7105 * pattern rather than a list of OPs, and uses the internal engine rather
7106 * than the current one */
7109 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
7111 SV *pat = pattern; /* defeat constness! */
7113 PERL_ARGS_ASSERT_RE_COMPILE;
7115 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
7116 #ifdef PERL_IN_XSUB_RE
7119 &PL_core_reg_engine,
7121 NULL, NULL, rx_flags, 0);
7125 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
7129 if (--cbs->refcnt > 0)
7131 for (n = 0; n < cbs->count; n++) {
7132 REGEXP *rx = cbs->cb[n].src_regex;
7134 cbs->cb[n].src_regex = NULL;
7135 SvREFCNT_dec_NN(rx);
7143 static struct reg_code_blocks *
7144 S_alloc_code_blocks(pTHX_ int ncode)
7146 struct reg_code_blocks *cbs;
7147 Newx(cbs, 1, struct reg_code_blocks);
7150 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
7152 Newx(cbs->cb, ncode, struct reg_code_block);
7159 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
7160 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
7161 * point to the realloced string and length.
7163 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
7167 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
7168 char **pat_p, STRLEN *plen_p, int num_code_blocks)
7170 U8 *const src = (U8*)*pat_p;
7175 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7177 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7178 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
7180 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
7181 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
7184 while (s < *plen_p) {
7185 append_utf8_from_native_byte(src[s], &d);
7187 if (n < num_code_blocks) {
7188 assert(pRExC_state->code_blocks);
7189 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
7190 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
7191 assert(*(d - 1) == '(');
7194 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
7195 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
7196 assert(*(d - 1) == ')');
7205 *pat_p = (char*) dst;
7207 RExC_orig_utf8 = RExC_utf8 = 1;
7212 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
7213 * while recording any code block indices, and handling overloading,
7214 * nested qr// objects etc. If pat is null, it will allocate a new
7215 * string, or just return the first arg, if there's only one.
7217 * Returns the malloced/updated pat.
7218 * patternp and pat_count is the array of SVs to be concatted;
7219 * oplist is the optional list of ops that generated the SVs;
7220 * recompile_p is a pointer to a boolean that will be set if
7221 * the regex will need to be recompiled.
7222 * delim, if non-null is an SV that will be inserted between each element
7226 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
7227 SV *pat, SV ** const patternp, int pat_count,
7228 OP *oplist, bool *recompile_p, SV *delim)
7232 bool use_delim = FALSE;
7233 bool alloced = FALSE;
7235 /* if we know we have at least two args, create an empty string,
7236 * then concatenate args to that. For no args, return an empty string */
7237 if (!pat && pat_count != 1) {
7243 for (svp = patternp; svp < patternp + pat_count; svp++) {
7246 STRLEN orig_patlen = 0;
7248 SV *msv = use_delim ? delim : *svp;
7249 if (!msv) msv = &PL_sv_undef;
7251 /* if we've got a delimiter, we go round the loop twice for each
7252 * svp slot (except the last), using the delimiter the second
7261 if (SvTYPE(msv) == SVt_PVAV) {
7262 /* we've encountered an interpolated array within
7263 * the pattern, e.g. /...@a..../. Expand the list of elements,
7264 * then recursively append elements.
7265 * The code in this block is based on S_pushav() */
7267 AV *const av = (AV*)msv;
7268 const SSize_t maxarg = AvFILL(av) + 1;
7272 assert(oplist->op_type == OP_PADAV
7273 || oplist->op_type == OP_RV2AV);
7274 oplist = OpSIBLING(oplist);
7277 if (SvRMAGICAL(av)) {
7280 Newx(array, maxarg, SV*);
7282 for (i=0; i < maxarg; i++) {
7283 SV ** const svp = av_fetch(av, i, FALSE);
7284 array[i] = svp ? *svp : &PL_sv_undef;
7288 array = AvARRAY(av);
7290 pat = S_concat_pat(aTHX_ pRExC_state, pat,
7291 array, maxarg, NULL, recompile_p,
7293 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
7299 /* we make the assumption here that each op in the list of
7300 * op_siblings maps to one SV pushed onto the stack,
7301 * except for code blocks, with have both an OP_NULL and
7303 * This allows us to match up the list of SVs against the
7304 * list of OPs to find the next code block.
7306 * Note that PUSHMARK PADSV PADSV ..
7308 * PADRANGE PADSV PADSV ..
7309 * so the alignment still works. */
7312 if (oplist->op_type == OP_NULL
7313 && (oplist->op_flags & OPf_SPECIAL))
7315 assert(n < pRExC_state->code_blocks->count);
7316 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
7317 pRExC_state->code_blocks->cb[n].block = oplist;
7318 pRExC_state->code_blocks->cb[n].src_regex = NULL;
7321 oplist = OpSIBLING(oplist); /* skip CONST */
7324 oplist = OpSIBLING(oplist);;
7327 /* apply magic and QR overloading to arg */
7330 if (SvROK(msv) && SvAMAGIC(msv)) {
7331 SV *sv = AMG_CALLunary(msv, regexp_amg);
7335 if (SvTYPE(sv) != SVt_REGEXP)
7336 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
7341 /* try concatenation overload ... */
7342 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
7343 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
7346 /* overloading involved: all bets are off over literal
7347 * code. Pretend we haven't seen it */
7349 pRExC_state->code_blocks->count -= n;
7353 /* ... or failing that, try "" overload */
7354 while (SvAMAGIC(msv)
7355 && (sv = AMG_CALLunary(msv, string_amg))
7359 && SvRV(msv) == SvRV(sv))
7364 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
7368 /* this is a partially unrolled
7369 * sv_catsv_nomg(pat, msv);
7370 * that allows us to adjust code block indices if
7373 char *dst = SvPV_force_nomg(pat, dlen);
7375 if (SvUTF8(msv) && !SvUTF8(pat)) {
7376 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
7377 sv_setpvn(pat, dst, dlen);
7380 sv_catsv_nomg(pat, msv);
7384 /* We have only one SV to process, but we need to verify
7385 * it is properly null terminated or we will fail asserts
7386 * later. In theory we probably shouldn't get such SV's,
7387 * but if we do we should handle it gracefully. */
7388 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7389 /* not a string, or a string with a trailing null */
7392 /* a string with no trailing null, we need to copy it
7393 * so it has a trailing null */
7394 pat = sv_2mortal(newSVsv(msv));
7399 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7402 /* extract any code blocks within any embedded qr//'s */
7403 if (rx && SvTYPE(rx) == SVt_REGEXP
7404 && RX_ENGINE((REGEXP*)rx)->op_comp)
7407 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7408 if (ri->code_blocks && ri->code_blocks->count) {
7410 /* the presence of an embedded qr// with code means
7411 * we should always recompile: the text of the
7412 * qr// may not have changed, but it may be a
7413 * different closure than last time */
7415 if (pRExC_state->code_blocks) {
7416 int new_count = pRExC_state->code_blocks->count
7417 + ri->code_blocks->count;
7418 Renew(pRExC_state->code_blocks->cb,
7419 new_count, struct reg_code_block);
7420 pRExC_state->code_blocks->count = new_count;
7423 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7424 ri->code_blocks->count);
7426 for (i=0; i < ri->code_blocks->count; i++) {
7427 struct reg_code_block *src, *dst;
7428 STRLEN offset = orig_patlen
7429 + ReANY((REGEXP *)rx)->pre_prefix;
7430 assert(n < pRExC_state->code_blocks->count);
7431 src = &ri->code_blocks->cb[i];
7432 dst = &pRExC_state->code_blocks->cb[n];
7433 dst->start = src->start + offset;
7434 dst->end = src->end + offset;
7435 dst->block = src->block;
7436 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7445 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7454 /* see if there are any run-time code blocks in the pattern.
7455 * False positives are allowed */
7458 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7459 char *pat, STRLEN plen)
7464 PERL_UNUSED_CONTEXT;
7466 for (s = 0; s < plen; s++) {
7467 if ( pRExC_state->code_blocks
7468 && n < pRExC_state->code_blocks->count
7469 && s == pRExC_state->code_blocks->cb[n].start)
7471 s = pRExC_state->code_blocks->cb[n].end;
7475 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7477 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7479 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7486 /* Handle run-time code blocks. We will already have compiled any direct
7487 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7488 * copy of it, but with any literal code blocks blanked out and
7489 * appropriate chars escaped; then feed it into
7491 * eval "qr'modified_pattern'"
7495 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7499 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7501 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7502 * and merge them with any code blocks of the original regexp.
7504 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7505 * instead, just save the qr and return FALSE; this tells our caller that
7506 * the original pattern needs upgrading to utf8.
7510 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7511 char *pat, STRLEN plen)
7515 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7517 if (pRExC_state->runtime_code_qr) {
7518 /* this is the second time we've been called; this should
7519 * only happen if the main pattern got upgraded to utf8
7520 * during compilation; re-use the qr we compiled first time
7521 * round (which should be utf8 too)
7523 qr = pRExC_state->runtime_code_qr;
7524 pRExC_state->runtime_code_qr = NULL;
7525 assert(RExC_utf8 && SvUTF8(qr));
7531 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7535 /* determine how many extra chars we need for ' and \ escaping */
7536 for (s = 0; s < plen; s++) {
7537 if (pat[s] == '\'' || pat[s] == '\\')
7541 Newx(newpat, newlen, char);
7543 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7545 for (s = 0; s < plen; s++) {
7546 if ( pRExC_state->code_blocks
7547 && n < pRExC_state->code_blocks->count
7548 && s == pRExC_state->code_blocks->cb[n].start)
7550 /* blank out literal code block so that they aren't
7551 * recompiled: eg change from/to:
7561 assert(pat[s] == '(');
7562 assert(pat[s+1] == '?');
7566 while (s < pRExC_state->code_blocks->cb[n].end) {
7574 if (pat[s] == '\'' || pat[s] == '\\')
7579 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7581 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7587 Perl_re_printf( aTHX_
7588 "%sre-parsing pattern for runtime code:%s %s\n",
7589 PL_colors[4], PL_colors[5], newpat);
7592 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7598 PUSHSTACKi(PERLSI_REQUIRE);
7599 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7600 * parsing qr''; normally only q'' does this. It also alters
7602 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7603 SvREFCNT_dec_NN(sv);
7608 SV * const errsv = ERRSV;
7609 if (SvTRUE_NN(errsv))
7610 /* use croak_sv ? */
7611 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7613 assert(SvROK(qr_ref));
7615 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7616 /* the leaving below frees the tmp qr_ref.
7617 * Give qr a life of its own */
7625 if (!RExC_utf8 && SvUTF8(qr)) {
7626 /* first time through; the pattern got upgraded; save the
7627 * qr for the next time through */
7628 assert(!pRExC_state->runtime_code_qr);
7629 pRExC_state->runtime_code_qr = qr;
7634 /* extract any code blocks within the returned qr// */
7637 /* merge the main (r1) and run-time (r2) code blocks into one */
7639 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7640 struct reg_code_block *new_block, *dst;
7641 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7645 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7647 SvREFCNT_dec_NN(qr);
7651 if (!r1->code_blocks)
7652 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7654 r1c = r1->code_blocks->count;
7655 r2c = r2->code_blocks->count;
7657 Newx(new_block, r1c + r2c, struct reg_code_block);
7661 while (i1 < r1c || i2 < r2c) {
7662 struct reg_code_block *src;
7666 src = &r2->code_blocks->cb[i2++];
7670 src = &r1->code_blocks->cb[i1++];
7671 else if ( r1->code_blocks->cb[i1].start
7672 < r2->code_blocks->cb[i2].start)
7674 src = &r1->code_blocks->cb[i1++];
7675 assert(src->end < r2->code_blocks->cb[i2].start);
7678 assert( r1->code_blocks->cb[i1].start
7679 > r2->code_blocks->cb[i2].start);
7680 src = &r2->code_blocks->cb[i2++];
7682 assert(src->end < r1->code_blocks->cb[i1].start);
7685 assert(pat[src->start] == '(');
7686 assert(pat[src->end] == ')');
7687 dst->start = src->start;
7688 dst->end = src->end;
7689 dst->block = src->block;
7690 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7694 r1->code_blocks->count += r2c;
7695 Safefree(r1->code_blocks->cb);
7696 r1->code_blocks->cb = new_block;
7699 SvREFCNT_dec_NN(qr);
7705 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7706 struct reg_substr_datum *rsd,
7707 struct scan_data_substrs *sub,
7708 STRLEN longest_length)
7710 /* This is the common code for setting up the floating and fixed length
7711 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7712 * as to whether succeeded or not */
7716 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7717 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7719 if (! (longest_length
7720 || (eol /* Can't have SEOL and MULTI */
7721 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7723 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7724 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7729 /* copy the information about the longest from the reg_scan_data
7730 over to the program. */
7731 if (SvUTF8(sub->str)) {
7733 rsd->utf8_substr = sub->str;
7735 rsd->substr = sub->str;
7736 rsd->utf8_substr = NULL;
7738 /* end_shift is how many chars that must be matched that
7739 follow this item. We calculate it ahead of time as once the
7740 lookbehind offset is added in we lose the ability to correctly
7742 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7743 rsd->end_shift = ml - sub->min_offset
7745 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7747 + (SvTAIL(sub->str) != 0)
7751 t = (eol/* Can't have SEOL and MULTI */
7752 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7753 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7759 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7761 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7762 * properly wrapped with the right modifiers */
7764 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7765 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7766 != REGEX_DEPENDS_CHARSET);
7768 /* The caret is output if there are any defaults: if not all the STD
7769 * flags are set, or if no character set specifier is needed */
7771 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7773 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7774 == REG_RUN_ON_COMMENT_SEEN);
7775 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7776 >> RXf_PMf_STD_PMMOD_SHIFT);
7777 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7779 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7781 /* We output all the necessary flags; we never output a minus, as all
7782 * those are defaults, so are
7783 * covered by the caret */
7784 const STRLEN wraplen = pat_len + has_p + has_runon
7785 + has_default /* If needs a caret */
7786 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7788 /* If needs a character set specifier */
7789 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7790 + (sizeof("(?:)") - 1);
7792 PERL_ARGS_ASSERT_SET_REGEX_PV;
7794 /* make sure PL_bitcount bounds not exceeded */
7795 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7797 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7800 SvFLAGS(Rx) |= SVf_UTF8;
7803 /* If a default, cover it using the caret */
7805 *p++= DEFAULT_PAT_MOD;
7811 name = get_regex_charset_name(RExC_rx->extflags, &len);
7812 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7814 name = UNICODE_PAT_MODS;
7815 len = sizeof(UNICODE_PAT_MODS) - 1;
7817 Copy(name, p, len, char);
7821 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7824 while((ch = *fptr++)) {
7832 Copy(RExC_precomp, p, pat_len, char);
7833 assert ((RX_WRAPPED(Rx) - p) < 16);
7834 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7837 /* Adding a trailing \n causes this to compile properly:
7838 my $R = qr / A B C # D E/x; /($R)/
7839 Otherwise the parens are considered part of the comment */
7844 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7848 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7849 * regular expression into internal code.
7850 * The pattern may be passed either as:
7851 * a list of SVs (patternp plus pat_count)
7852 * a list of OPs (expr)
7853 * If both are passed, the SV list is used, but the OP list indicates
7854 * which SVs are actually pre-compiled code blocks
7856 * The SVs in the list have magic and qr overloading applied to them (and
7857 * the list may be modified in-place with replacement SVs in the latter
7860 * If the pattern hasn't changed from old_re, then old_re will be
7863 * eng is the current engine. If that engine has an op_comp method, then
7864 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7865 * do the initial concatenation of arguments and pass on to the external
7868 * If is_bare_re is not null, set it to a boolean indicating whether the
7869 * arg list reduced (after overloading) to a single bare regex which has
7870 * been returned (i.e. /$qr/).
7872 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7874 * pm_flags contains the PMf_* flags, typically based on those from the
7875 * pm_flags field of the related PMOP. Currently we're only interested in
7876 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7878 * For many years this code had an initial sizing pass that calculated
7879 * (sometimes incorrectly, leading to security holes) the size needed for the
7880 * compiled pattern. That was changed by commit
7881 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7882 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7883 * references to this sizing pass.
7885 * Now, an initial crude guess as to the size needed is made, based on the
7886 * length of the pattern. Patches welcome to improve that guess. That amount
7887 * of space is malloc'd and then immediately freed, and then clawed back node
7888 * by node. This design is to minimze, to the extent possible, memory churn
7889 * when doing the reallocs.
7891 * A separate parentheses counting pass may be needed in some cases.
7892 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7895 * The existence of a sizing pass necessitated design decisions that are no
7896 * longer needed. There are potential areas of simplification.
7898 * Beware that the optimization-preparation code in here knows about some
7899 * of the structure of the compiled regexp. [I'll say.]
7903 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7904 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7905 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7907 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7915 SV** new_patternp = patternp;
7917 /* these are all flags - maybe they should be turned
7918 * into a single int with different bit masks */
7919 I32 sawlookahead = 0;
7924 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7926 bool runtime_code = 0;
7928 RExC_state_t RExC_state;
7929 RExC_state_t * const pRExC_state = &RExC_state;
7930 #ifdef TRIE_STUDY_OPT
7931 /* search for "restudy" in this file for a detailed explanation */
7933 RExC_state_t copyRExC_state;
7935 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7937 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7939 DEBUG_r(if (!PL_colorset) reginitcolors());
7942 pRExC_state->warn_text = NULL;
7943 pRExC_state->unlexed_names = NULL;
7944 pRExC_state->code_blocks = NULL;
7947 *is_bare_re = FALSE;
7949 if (expr && (expr->op_type == OP_LIST ||
7950 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7951 /* allocate code_blocks if needed */
7955 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7956 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7957 ncode++; /* count of DO blocks */
7960 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7964 /* compile-time pattern with just OP_CONSTs and DO blocks */
7969 /* find how many CONSTs there are */
7972 if (expr->op_type == OP_CONST)
7975 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7976 if (o->op_type == OP_CONST)
7980 /* fake up an SV array */
7982 assert(!new_patternp);
7983 Newx(new_patternp, n, SV*);
7984 SAVEFREEPV(new_patternp);
7988 if (expr->op_type == OP_CONST)
7989 new_patternp[n] = cSVOPx_sv(expr);
7991 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7992 if (o->op_type == OP_CONST)
7993 new_patternp[n++] = cSVOPo_sv;
7998 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7999 "Assembling pattern from %d elements%s\n", pat_count,
8000 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
8002 /* set expr to the first arg op */
8004 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
8005 && expr->op_type != OP_CONST)
8007 expr = cLISTOPx(expr)->op_first;
8008 assert( expr->op_type == OP_PUSHMARK
8009 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
8010 || expr->op_type == OP_PADRANGE);
8011 expr = OpSIBLING(expr);
8014 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
8015 expr, &recompile, NULL);
8017 /* handle bare (possibly after overloading) regex: foo =~ $re */
8022 if (SvTYPE(re) == SVt_REGEXP) {
8026 DEBUG_PARSE_r(Perl_re_printf( aTHX_
8027 "Precompiled pattern%s\n",
8028 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
8034 exp = SvPV_nomg(pat, plen);
8036 if (!eng->op_comp) {
8037 if ((SvUTF8(pat) && IN_BYTES)
8038 || SvGMAGICAL(pat) || SvAMAGIC(pat))
8040 /* make a temporary copy; either to convert to bytes,
8041 * or to avoid repeating get-magic / overloaded stringify */
8042 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
8043 (IN_BYTES ? 0 : SvUTF8(pat)));
8045 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
8048 /* ignore the utf8ness if the pattern is 0 length */
8049 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
8050 RExC_uni_semantics = 0;
8051 RExC_contains_locale = 0;
8052 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
8053 RExC_in_script_run = 0;
8054 RExC_study_started = 0;
8055 pRExC_state->runtime_code_qr = NULL;
8056 RExC_frame_head= NULL;
8057 RExC_frame_last= NULL;
8058 RExC_frame_count= 0;
8059 RExC_latest_warn_offset = 0;
8060 RExC_use_BRANCHJ = 0;
8061 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
8062 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
8063 RExC_total_parens = 0;
8064 RExC_open_parens = NULL;
8065 RExC_close_parens = NULL;
8066 RExC_paren_names = NULL;
8068 RExC_seen_d_op = FALSE;
8070 RExC_paren_name_list = NULL;
8074 RExC_mysv1= sv_newmortal();
8075 RExC_mysv2= sv_newmortal();
8079 SV *dsv= sv_newmortal();
8080 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
8081 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
8082 PL_colors[4], PL_colors[5], s);
8085 /* we jump here if we have to recompile, e.g., from upgrading the pattern
8088 if ((pm_flags & PMf_USE_RE_EVAL)
8089 /* this second condition covers the non-regex literal case,
8090 * i.e. $foo =~ '(?{})'. */
8091 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
8093 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
8096 /* return old regex if pattern hasn't changed */
8097 /* XXX: note in the below we have to check the flags as well as the
8100 * Things get a touch tricky as we have to compare the utf8 flag
8101 * independently from the compile flags. */
8105 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
8106 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
8107 && RX_PRECOMP(old_re)
8108 && RX_PRELEN(old_re) == plen
8109 && memEQ(RX_PRECOMP(old_re), exp, plen)
8110 && !runtime_code /* with runtime code, always recompile */ )
8113 SV *dsv= sv_newmortal();
8114 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
8115 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
8116 PL_colors[4], PL_colors[5], s);
8121 /* Allocate the pattern's SV */
8122 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
8123 RExC_rx = ReANY(Rx);
8124 if ( RExC_rx == NULL )
8125 FAIL("Regexp out of space");
8127 rx_flags = orig_rx_flags;
8129 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
8130 && initial_charset == REGEX_DEPENDS_CHARSET)
8133 /* Set to use unicode semantics if the pattern is in utf8 and has the
8134 * 'depends' charset specified, as it means unicode when utf8 */
8135 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
8136 RExC_uni_semantics = 1;
8139 RExC_pm_flags = pm_flags;
8142 assert(TAINTING_get || !TAINT_get);
8144 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
8146 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
8147 /* whoops, we have a non-utf8 pattern, whilst run-time code
8148 * got compiled as utf8. Try again with a utf8 pattern */
8149 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
8150 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
8154 assert(!pRExC_state->runtime_code_qr);
8160 RExC_in_lookaround = 0;
8161 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
8162 RExC_recode_x_to_native = 0;
8163 RExC_in_multi_char_class = 0;
8165 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
8166 RExC_precomp_end = RExC_end = exp + plen;
8168 RExC_whilem_seen = 0;
8170 RExC_recurse = NULL;
8171 RExC_study_chunk_recursed = NULL;
8172 RExC_study_chunk_recursed_bytes= 0;
8173 RExC_recurse_count = 0;
8174 RExC_sets_depth = 0;
8175 pRExC_state->code_index = 0;
8177 /* Initialize the string in the compiled pattern. This is so that there is
8178 * something to output if necessary */
8179 set_regex_pv(pRExC_state, Rx);
8182 Perl_re_printf( aTHX_
8183 "Starting parse and generation\n");
8185 RExC_lastparse=NULL;
8188 /* Allocate space and zero-initialize. Note, the two step process
8189 of zeroing when in debug mode, thus anything assigned has to
8190 happen after that */
8193 /* On the first pass of the parse, we guess how big this will be. Then
8194 * we grow in one operation to that amount and then give it back. As
8195 * we go along, we re-allocate what we need.
8197 * XXX Currently the guess is essentially that the pattern will be an
8198 * EXACT node with one byte input, one byte output. This is crude, and
8199 * better heuristics are welcome.
8201 * On any subsequent passes, we guess what we actually computed in the
8202 * latest earlier pass. Such a pass probably didn't complete so is
8203 * missing stuff. We could improve those guesses by knowing where the
8204 * parse stopped, and use the length so far plus apply the above
8205 * assumption to what's left. */
8206 RExC_size = STR_SZ(RExC_end - RExC_start);
8209 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
8210 if ( RExC_rxi == NULL )
8211 FAIL("Regexp out of space");
8213 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
8214 RXi_SET( RExC_rx, RExC_rxi );
8216 /* We start from 0 (over from 0 in the case this is a reparse. The first
8217 * node parsed will give back any excess memory we have allocated so far).
8221 /* non-zero initialization begins here */
8222 RExC_rx->engine= eng;
8223 RExC_rx->extflags = rx_flags;
8224 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
8226 if (pm_flags & PMf_IS_QR) {
8227 RExC_rxi->code_blocks = pRExC_state->code_blocks;
8228 if (RExC_rxi->code_blocks) {
8229 RExC_rxi->code_blocks->refcnt++;
8233 RExC_rx->intflags = 0;
8235 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
8236 RExC_parse_set(exp);
8238 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
8239 * code makes sure the final byte is an uncounted NUL. But should this
8240 * ever not be the case, lots of things could read beyond the end of the
8241 * buffer: loops like
8242 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
8243 * strchr(RExC_parse, "foo");
8244 * etc. So it is worth noting. */
8245 assert(*RExC_end == '\0');
8249 RExC_parens_buf_size = 0;
8250 RExC_emit_start = RExC_rxi->program;
8251 pRExC_state->code_index = 0;
8253 *((char*) RExC_emit_start) = (char) REG_MAGIC;
8254 RExC_emit = NODE_STEP_REGNODE;
8257 if (reg(pRExC_state, 0, &flags, 1)) {
8259 /* Success!, But we may need to redo the parse knowing how many parens
8260 * there actually are */
8261 if (IN_PARENS_PASS) {
8262 flags |= RESTART_PARSE;
8265 /* We have that number in RExC_npar */
8266 RExC_total_parens = RExC_npar;
8268 else if (! MUST_RESTART(flags)) {
8270 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
8273 /* Here, we either have success, or we have to redo the parse for some reason */
8274 if (MUST_RESTART(flags)) {
8276 /* It's possible to write a regexp in ascii that represents Unicode
8277 codepoints outside of the byte range, such as via \x{100}. If we
8278 detect such a sequence we have to convert the entire pattern to utf8
8279 and then recompile, as our sizing calculation will have been based
8280 on 1 byte == 1 character, but we will need to use utf8 to encode
8281 at least some part of the pattern, and therefore must convert the whole
8284 if (flags & NEED_UTF8) {
8286 /* We have stored the offset of the final warning output so far.
8287 * That must be adjusted. Any variant characters between the start
8288 * of the pattern and this warning count for 2 bytes in the final,
8289 * so just add them again */
8290 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
8291 RExC_latest_warn_offset +=
8292 variant_under_utf8_count((U8 *) exp, (U8 *) exp
8293 + RExC_latest_warn_offset);
8295 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
8296 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
8297 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
8300 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
8303 if (ALL_PARENS_COUNTED) {
8304 /* Make enough room for all the known parens, and zero it */
8305 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
8306 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
8307 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
8309 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
8310 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
8312 else { /* Parse did not complete. Reinitialize the parentheses
8314 RExC_total_parens = 0;
8315 if (RExC_open_parens) {
8316 Safefree(RExC_open_parens);
8317 RExC_open_parens = NULL;
8319 if (RExC_close_parens) {
8320 Safefree(RExC_close_parens);
8321 RExC_close_parens = NULL;
8325 /* Clean up what we did in this parse */
8326 SvREFCNT_dec_NN(RExC_rx_sv);
8331 /* Here, we have successfully parsed and generated the pattern's program
8332 * for the regex engine. We are ready to finish things up and look for
8335 /* Update the string to compile, with correct modifiers, etc */
8336 set_regex_pv(pRExC_state, Rx);
8338 RExC_rx->nparens = RExC_total_parens - 1;
8340 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
8341 if (RExC_whilem_seen > 15)
8342 RExC_whilem_seen = 15;
8345 Perl_re_printf( aTHX_
8346 "Required size %" IVdf " nodes\n", (IV)RExC_size);
8348 RExC_lastparse=NULL;
8351 SetProgLen(RExC_rxi,RExC_size);
8353 DEBUG_DUMP_PRE_OPTIMIZE_r({
8354 SV * const sv = sv_newmortal();
8355 RXi_GET_DECL(RExC_rx, ri);
8357 Perl_re_printf( aTHX_ "Program before optimization:\n");
8359 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8364 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
8367 /* XXXX To minimize changes to RE engine we always allocate
8368 3-units-long substrs field. */
8369 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8370 if (RExC_recurse_count) {
8371 Newx(RExC_recurse, RExC_recurse_count, regnode *);
8372 SAVEFREEPV(RExC_recurse);
8375 if (RExC_seen & REG_RECURSE_SEEN) {
8376 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8377 * So its 1 if there are no parens. */
8378 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8379 ((RExC_total_parens & 0x07) != 0);
8380 Newx(RExC_study_chunk_recursed,
8381 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8382 SAVEFREEPV(RExC_study_chunk_recursed);
8386 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8388 RExC_study_chunk_recursed_count= 0;
8390 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8391 if (RExC_study_chunk_recursed) {
8392 Zero(RExC_study_chunk_recursed,
8393 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8397 #ifdef TRIE_STUDY_OPT
8398 /* search for "restudy" in this file for a detailed explanation */
8400 StructCopy(&zero_scan_data, &data, scan_data_t);
8401 copyRExC_state = RExC_state;
8404 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8406 RExC_state = copyRExC_state;
8407 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8408 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8410 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8411 StructCopy(&zero_scan_data, &data, scan_data_t);
8414 StructCopy(&zero_scan_data, &data, scan_data_t);
8417 /* Dig out information for optimizations. */
8418 RExC_rx->extflags = RExC_flags; /* was pm_op */
8419 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8422 SvUTF8_on(Rx); /* Unicode in it? */
8423 RExC_rxi->regstclass = NULL;
8424 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8425 RExC_rx->intflags |= PREGf_NAUGHTY;
8426 scan = RExC_rxi->program + 1; /* First BRANCH. */
8428 /* testing for BRANCH here tells us whether there is "must appear"
8429 data in the pattern. If there is then we can use it for optimisations */
8430 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8432 SSize_t fake_deltap;
8433 STRLEN longest_length[2];
8434 regnode_ssc ch_class; /* pointed to by data */
8436 SSize_t last_close = 0; /* pointed to by data */
8437 regnode *first= scan;
8438 regnode *first_next= regnext(first);
8439 regnode *last_close_op= NULL;
8443 * Skip introductions and multiplicators >= 1
8444 * so that we can extract the 'meat' of the pattern that must
8445 * match in the large if() sequence following.
8446 * NOTE that EXACT is NOT covered here, as it is normally
8447 * picked up by the optimiser separately.
8449 * This is unfortunate as the optimiser isnt handling lookahead
8450 * properly currently.
8453 while ((OP(first) == OPEN && (sawopen = 1)) ||
8454 /* An OR of *one* alternative - should not happen now. */
8455 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8456 /* for now we can't handle lookbehind IFMATCH*/
8457 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8458 (OP(first) == PLUS) ||
8459 (OP(first) == MINMOD) ||
8460 /* An {n,m} with n>0 */
8461 (REGNODE_TYPE(OP(first)) == CURLY && ARG1(first) > 0) ||
8462 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END ))
8465 * the only op that could be a regnode is PLUS, all the rest
8466 * will be regnode_1 or regnode_2.
8468 * (yves doesn't think this is true)
8470 if (OP(first) == PLUS)
8473 if (OP(first) == MINMOD)
8476 first = REGNODE_AFTER(first);
8477 first_next= regnext(first);
8480 /* Starting-point info. */
8482 DEBUG_PEEP("first:", first, 0, 0);
8483 /* Ignore EXACT as we deal with it later. */
8484 if (REGNODE_TYPE(OP(first)) == EXACT) {
8485 if (! isEXACTFish(OP(first))) {
8486 NOOP; /* Empty, get anchored substr later. */
8489 RExC_rxi->regstclass = first;
8492 else if (REGNODE_TYPE(OP(first)) == TRIE &&
8493 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8495 /* this can happen only on restudy
8496 * Search for "restudy" in this file to find
8497 * a comment with details. */
8498 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8501 else if (REGNODE_SIMPLE(OP(first)))
8502 RExC_rxi->regstclass = first;
8503 else if (REGNODE_TYPE(OP(first)) == BOUND ||
8504 REGNODE_TYPE(OP(first)) == NBOUND)
8505 RExC_rxi->regstclass = first;
8506 else if (REGNODE_TYPE(OP(first)) == BOL) {
8507 RExC_rx->intflags |= (OP(first) == MBOL
8510 first = REGNODE_AFTER(first);
8513 else if (OP(first) == GPOS) {
8514 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8515 first = REGNODE_AFTER_type(first,tregnode_GPOS);
8518 else if ((!sawopen || !RExC_sawback) &&
8520 (OP(first) == STAR &&
8521 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
8522 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8524 /* turn .* into ^.* with an implied $*=1 */
8526 (OP(REGNODE_AFTER(first)) == REG_ANY)
8529 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8530 first = REGNODE_AFTER(first);
8533 if (sawplus && !sawminmod && !sawlookahead
8534 && (!sawopen || !RExC_sawback)
8535 && !pRExC_state->code_blocks) /* May examine pos and $& */
8536 /* x+ must match at the 1st pos of run of x's */
8537 RExC_rx->intflags |= PREGf_SKIP;
8539 /* Scan is after the zeroth branch, first is atomic matcher. */
8540 #ifdef TRIE_STUDY_OPT
8541 /* search for "restudy" in this file for a detailed explanation */
8544 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8545 (IV)(first - scan + 1))
8549 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8550 (IV)(first - scan + 1))
8556 * If there's something expensive in the r.e., find the
8557 * longest literal string that must appear and make it the
8558 * regmust. Resolve ties in favor of later strings, since
8559 * the regstart check works with the beginning of the r.e.
8560 * and avoiding duplication strengthens checking. Not a
8561 * strong reason, but sufficient in the absence of others.
8562 * [Now we resolve ties in favor of the earlier string if
8563 * it happens that c_offset_min has been invalidated, since the
8564 * earlier string may buy us something the later one won't.]
8567 data.substrs[0].str = newSVpvs("");
8568 data.substrs[1].str = newSVpvs("");
8569 data.last_found = newSVpvs("");
8570 data.cur_is_floating = 0; /* initially any found substring is fixed */
8571 ENTER_with_name("study_chunk");
8572 SAVEFREESV(data.substrs[0].str);
8573 SAVEFREESV(data.substrs[1].str);
8574 SAVEFREESV(data.last_found);
8576 if (!RExC_rxi->regstclass) {
8577 ssc_init(pRExC_state, &ch_class);
8578 data.start_class = &ch_class;
8579 stclass_flag = SCF_DO_STCLASS_AND;
8580 } else /* XXXX Check for BOUND? */
8582 data.last_closep = &last_close;
8583 data.last_close_opp = &last_close_op;
8587 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8588 * (NO top level branches)
8590 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
8591 scan + RExC_size, /* Up to end */
8593 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8594 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8596 /* search for "restudy" in this file for a detailed explanation
8597 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
8600 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8603 if ( RExC_total_parens == 1 && !data.cur_is_floating
8604 && data.last_start_min == 0 && data.last_end > 0
8605 && !RExC_seen_zerolen
8606 && !(RExC_seen & REG_VERBARG_SEEN)
8607 && !(RExC_seen & REG_GPOS_SEEN)
8609 RExC_rx->extflags |= RXf_CHECK_ALL;
8611 scan_commit(pRExC_state, &data,&minlen, 0);
8614 /* XXX this is done in reverse order because that's the way the
8615 * code was before it was parameterised. Don't know whether it
8616 * actually needs doing in reverse order. DAPM */
8617 for (i = 1; i >= 0; i--) {
8618 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8621 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8622 && data.substrs[0].min_offset
8623 == data.substrs[1].min_offset
8624 && SvCUR(data.substrs[0].str)
8625 == SvCUR(data.substrs[1].str)
8627 && S_setup_longest (aTHX_ pRExC_state,
8628 &(RExC_rx->substrs->data[i]),
8632 RExC_rx->substrs->data[i].min_offset =
8633 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8635 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8636 /* Don't offset infinity */
8637 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8638 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8639 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8642 RExC_rx->substrs->data[i].substr = NULL;
8643 RExC_rx->substrs->data[i].utf8_substr = NULL;
8644 longest_length[i] = 0;
8648 LEAVE_with_name("study_chunk");
8650 if (RExC_rxi->regstclass
8651 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8652 RExC_rxi->regstclass = NULL;
8654 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8655 || RExC_rx->substrs->data[0].min_offset)
8657 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8658 && is_ssc_worth_it(pRExC_state, data.start_class))
8660 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8662 ssc_finalize(pRExC_state, data.start_class);
8664 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8665 StructCopy(data.start_class,
8666 (regnode_ssc*)RExC_rxi->data->data[n],
8668 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8669 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8670 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8671 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8672 Perl_re_printf( aTHX_
8673 "synthetic stclass \"%s\".\n",
8674 SvPVX_const(sv));});
8675 data.start_class = NULL;
8678 /* A temporary algorithm prefers floated substr to fixed one of
8679 * same length to dig more info. */
8680 i = (longest_length[0] <= longest_length[1]);
8681 RExC_rx->substrs->check_ix = i;
8682 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8683 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8684 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8685 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8686 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8687 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8688 RExC_rx->intflags |= PREGf_NOSCAN;
8690 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8691 RExC_rx->extflags |= RXf_USE_INTUIT;
8692 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8693 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8696 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8697 if ( (STRLEN)minlen < longest_length[1] )
8698 minlen= longest_length[1];
8699 if ( (STRLEN)minlen < longest_length[0] )
8700 minlen= longest_length[0];
8704 /* Several toplevels. Best we can is to set minlen. */
8705 SSize_t fake_deltap;
8706 regnode_ssc ch_class;
8707 SSize_t last_close = 0;
8708 regnode *last_close_op = NULL;
8710 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8712 scan = RExC_rxi->program + 1;
8713 ssc_init(pRExC_state, &ch_class);
8714 data.start_class = &ch_class;
8715 data.last_closep = &last_close;
8716 data.last_close_opp = &last_close_op;
8720 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8721 * (patterns WITH top level branches)
8723 minlen = study_chunk(pRExC_state,
8724 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
8725 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8726 ? SCF_TRIE_DOING_RESTUDY
8729 /* search for "restudy" in this file for a detailed explanation
8730 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
8732 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8734 RExC_rx->check_substr = NULL;
8735 RExC_rx->check_utf8 = NULL;
8736 RExC_rx->substrs->data[0].substr = NULL;
8737 RExC_rx->substrs->data[0].utf8_substr = NULL;
8738 RExC_rx->substrs->data[1].substr = NULL;
8739 RExC_rx->substrs->data[1].utf8_substr = NULL;
8741 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8742 && is_ssc_worth_it(pRExC_state, data.start_class))
8744 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8746 ssc_finalize(pRExC_state, data.start_class);
8748 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8749 StructCopy(data.start_class,
8750 (regnode_ssc*)RExC_rxi->data->data[n],
8752 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8753 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8754 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8755 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8756 Perl_re_printf( aTHX_
8757 "synthetic stclass \"%s\".\n",
8758 SvPVX_const(sv));});
8759 data.start_class = NULL;
8763 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8764 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8765 RExC_rx->maxlen = REG_INFTY;
8768 RExC_rx->maxlen = RExC_maxlen;
8771 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8772 the "real" pattern. */
8774 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8775 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8777 RExC_rx->minlenret = minlen;
8778 if (RExC_rx->minlen < minlen)
8779 RExC_rx->minlen = minlen;
8781 if (RExC_seen & REG_RECURSE_SEEN ) {
8782 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8783 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8785 if (RExC_seen & REG_GPOS_SEEN)
8786 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8787 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8788 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8790 if (pRExC_state->code_blocks)
8791 RExC_rx->extflags |= RXf_EVAL_SEEN;
8792 if (RExC_seen & REG_VERBARG_SEEN)
8794 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8795 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8797 if (RExC_seen & REG_CUTGROUP_SEEN)
8798 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8799 if (pm_flags & PMf_USE_RE_EVAL)
8800 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8801 if (RExC_paren_names)
8802 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8804 RXp_PAREN_NAMES(RExC_rx) = NULL;
8806 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8807 * so it can be used in pp.c */
8808 if (RExC_rx->intflags & PREGf_ANCH)
8809 RExC_rx->extflags |= RXf_IS_ANCHORED;
8813 /* this is used to identify "special" patterns that might result
8814 * in Perl NOT calling the regex engine and instead doing the match "itself",
8815 * particularly special cases in split//. By having the regex compiler
8816 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8817 * we avoid weird issues with equivalent patterns resulting in different behavior,
8818 * AND we allow non Perl engines to get the same optimizations by the setting the
8819 * flags appropriately - Yves */
8820 regnode *first = RExC_rxi->program + 1;
8822 regnode *next = NULL;
8824 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
8825 next = REGNODE_AFTER(first);
8828 /* It's safe to read through *next only if OP(first) is a regop of
8829 * the right type (not EXACT, for example).
8831 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
8832 RExC_rx->extflags |= RXf_NULL;
8833 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8834 /* when fop is SBOL first->flags will be true only when it was
8835 * produced by parsing /\A/, and not when parsing /^/. This is
8836 * very important for the split code as there we want to
8837 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8838 * See rt #122761 for more details. -- Yves */
8839 RExC_rx->extflags |= RXf_START_ONLY;
8840 else if (fop == PLUS
8841 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
8842 && OP(regnext(first)) == END)
8843 RExC_rx->extflags |= RXf_WHITE;
8844 else if ( RExC_rx->extflags & RXf_SPLIT
8845 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
8846 && STR_LEN(first) == 1
8847 && *(STRING(first)) == ' '
8848 && OP(regnext(first)) == END )
8849 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8853 if (RExC_contains_locale) {
8854 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8858 if (RExC_paren_names) {
8859 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8860 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8861 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8864 RExC_rxi->name_list_idx = 0;
8866 while ( RExC_recurse_count > 0 ) {
8867 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8869 * This data structure is set up in study_chunk() and is used
8870 * to calculate the distance between a GOSUB regopcode and
8871 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8874 * If for some reason someone writes code that optimises
8875 * away a GOSUB opcode then the assert should be changed to
8876 * an if(scan) to guard the ARG2L_SET() - Yves
8879 assert(scan && OP(scan) == GOSUB);
8880 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8883 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8884 /* assume we don't need to swap parens around before we match */
8886 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8887 (unsigned long)RExC_study_chunk_recursed_count);
8891 Perl_re_printf( aTHX_ "Final program:\n");
8895 if (RExC_open_parens) {
8896 Safefree(RExC_open_parens);
8897 RExC_open_parens = NULL;
8899 if (RExC_close_parens) {
8900 Safefree(RExC_close_parens);
8901 RExC_close_parens = NULL;
8905 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8906 * by setting the regexp SV to readonly-only instead. If the
8907 * pattern's been recompiled, the USEDness should remain. */
8908 if (old_re && SvREADONLY(old_re))
8916 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8919 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8921 PERL_UNUSED_ARG(value);
8923 if (flags & RXapif_FETCH) {
8924 return reg_named_buff_fetch(rx, key, flags);
8925 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8926 Perl_croak_no_modify();
8928 } else if (flags & RXapif_EXISTS) {
8929 return reg_named_buff_exists(rx, key, flags)
8932 } else if (flags & RXapif_REGNAMES) {
8933 return reg_named_buff_all(rx, flags);
8934 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8935 return reg_named_buff_scalar(rx, flags);
8937 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8943 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8946 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8947 PERL_UNUSED_ARG(lastkey);
8949 if (flags & RXapif_FIRSTKEY)
8950 return reg_named_buff_firstkey(rx, flags);
8951 else if (flags & RXapif_NEXTKEY)
8952 return reg_named_buff_nextkey(rx, flags);
8954 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8961 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8965 struct regexp *const rx = ReANY(r);
8967 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8969 if (rx && RXp_PAREN_NAMES(rx)) {
8970 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8973 SV* sv_dat=HeVAL(he_str);
8974 I32 *nums=(I32*)SvPVX(sv_dat);
8975 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
8976 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8977 if ((I32)(rx->nparens) >= nums[i]
8978 && rx->offs[nums[i]].start != -1
8979 && rx->offs[nums[i]].end != -1)
8982 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8987 ret = newSV_type(SVt_NULL);
8990 av_push_simple(retarray, ret);
8993 return newRV_noinc(MUTABLE_SV(retarray));
9000 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
9003 struct regexp *const rx = ReANY(r);
9005 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
9007 if (rx && RXp_PAREN_NAMES(rx)) {
9008 if (flags & RXapif_ALL) {
9009 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
9011 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
9013 SvREFCNT_dec_NN(sv);
9025 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
9027 struct regexp *const rx = ReANY(r);
9029 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
9031 if ( rx && RXp_PAREN_NAMES(rx) ) {
9032 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
9034 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
9041 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
9043 struct regexp *const rx = ReANY(r);
9044 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9046 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
9048 if (rx && RXp_PAREN_NAMES(rx)) {
9049 HV *hv = RXp_PAREN_NAMES(rx);
9051 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
9054 SV* sv_dat = HeVAL(temphe);
9055 I32 *nums = (I32*)SvPVX(sv_dat);
9056 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
9057 if ((I32)(rx->lastparen) >= nums[i] &&
9058 rx->offs[nums[i]].start != -1 &&
9059 rx->offs[nums[i]].end != -1)
9065 if (parno || flags & RXapif_ALL) {
9066 return newSVhek(HeKEY_hek(temphe));
9074 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
9079 struct regexp *const rx = ReANY(r);
9081 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
9083 if (rx && RXp_PAREN_NAMES(rx)) {
9084 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
9085 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
9086 } else if (flags & RXapif_ONE) {
9087 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
9088 av = MUTABLE_AV(SvRV(ret));
9089 length = av_count(av);
9090 SvREFCNT_dec_NN(ret);
9091 return newSViv(length);
9093 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
9098 return &PL_sv_undef;
9102 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
9104 struct regexp *const rx = ReANY(r);
9107 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
9109 if (rx && RXp_PAREN_NAMES(rx)) {
9110 HV *hv= RXp_PAREN_NAMES(rx);
9112 (void)hv_iterinit(hv);
9113 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
9116 SV* sv_dat = HeVAL(temphe);
9117 I32 *nums = (I32*)SvPVX(sv_dat);
9118 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
9119 if ((I32)(rx->lastparen) >= nums[i] &&
9120 rx->offs[nums[i]].start != -1 &&
9121 rx->offs[nums[i]].end != -1)
9127 if (parno || flags & RXapif_ALL) {
9128 av_push(av, newSVhek(HeKEY_hek(temphe)));
9133 return newRV_noinc(MUTABLE_SV(av));
9137 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
9140 struct regexp *const rx = ReANY(r);
9146 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
9148 if ( n == RX_BUFF_IDX_CARET_PREMATCH
9149 || n == RX_BUFF_IDX_CARET_FULLMATCH
9150 || n == RX_BUFF_IDX_CARET_POSTMATCH
9153 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
9155 /* on something like
9158 * the KEEPCOPY is set on the PMOP rather than the regex */
9159 if (PL_curpm && r == PM_GETRE(PL_curpm))
9160 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
9169 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
9170 /* no need to distinguish between them any more */
9171 n = RX_BUFF_IDX_FULLMATCH;
9173 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
9174 && rx->offs[0].start != -1)
9176 /* $`, ${^PREMATCH} */
9177 i = rx->offs[0].start;
9181 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
9182 && rx->offs[0].end != -1)
9184 /* $', ${^POSTMATCH} */
9185 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
9186 i = rx->sublen + rx->suboffset - rx->offs[0].end;
9189 if (inRANGE(n, 0, (I32)rx->nparens) &&
9190 (s1 = rx->offs[n].start) != -1 &&
9191 (t1 = rx->offs[n].end) != -1)
9193 /* $&, ${^MATCH}, $1 ... */
9195 s = rx->subbeg + s1 - rx->suboffset;
9200 assert(s >= rx->subbeg);
9201 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
9203 #ifdef NO_TAINT_SUPPORT
9204 sv_setpvn(sv, s, i);
9206 const int oldtainted = TAINT_get;
9208 sv_setpvn(sv, s, i);
9209 TAINT_set(oldtainted);
9211 if (RXp_MATCH_UTF8(rx))
9216 if (RXp_MATCH_TAINTED(rx)) {
9217 if (SvTYPE(sv) >= SVt_PVMG) {
9218 MAGIC* const mg = SvMAGIC(sv);
9221 SvMAGIC_set(sv, mg->mg_moremagic);
9223 if ((mgt = SvMAGIC(sv))) {
9224 mg->mg_moremagic = mgt;
9225 SvMAGIC_set(sv, mg);
9242 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
9243 SV const * const value)
9245 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
9247 PERL_UNUSED_ARG(rx);
9248 PERL_UNUSED_ARG(paren);
9249 PERL_UNUSED_ARG(value);
9252 Perl_croak_no_modify();
9256 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
9259 struct regexp *const rx = ReANY(r);
9263 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
9265 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
9266 || paren == RX_BUFF_IDX_CARET_FULLMATCH
9267 || paren == RX_BUFF_IDX_CARET_POSTMATCH
9270 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
9272 /* on something like
9275 * the KEEPCOPY is set on the PMOP rather than the regex */
9276 if (PL_curpm && r == PM_GETRE(PL_curpm))
9277 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
9283 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
9285 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
9286 case RX_BUFF_IDX_PREMATCH: /* $` */
9287 if (rx->offs[0].start != -1) {
9288 i = rx->offs[0].start;
9297 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
9298 case RX_BUFF_IDX_POSTMATCH: /* $' */
9299 if (rx->offs[0].end != -1) {
9300 i = rx->sublen - rx->offs[0].end;
9302 s1 = rx->offs[0].end;
9309 default: /* $& / ${^MATCH}, $1, $2, ... */
9310 if (paren <= (I32)rx->nparens &&
9311 (s1 = rx->offs[paren].start) != -1 &&
9312 (t1 = rx->offs[paren].end) != -1)
9318 if (ckWARN(WARN_UNINITIALIZED))
9319 report_uninit((const SV *)sv);
9324 if (i > 0 && RXp_MATCH_UTF8(rx)) {
9325 const char * const s = rx->subbeg - rx->suboffset + s1;
9330 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
9337 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
9339 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
9340 PERL_UNUSED_ARG(rx);
9344 return newSVpvs("Regexp");
9347 /* Scans the name of a named buffer from the pattern.
9348 * If flags is REG_RSN_RETURN_NULL returns null.
9349 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
9350 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
9351 * to the parsed name as looked up in the RExC_paren_names hash.
9352 * If there is an error throws a vFAIL().. type exception.
9355 #define REG_RSN_RETURN_NULL 0
9356 #define REG_RSN_RETURN_NAME 1
9357 #define REG_RSN_RETURN_DATA 2
9360 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
9362 char *name_start = RExC_parse;
9365 PERL_ARGS_ASSERT_REG_SCAN_NAME;
9367 assert (RExC_parse <= RExC_end);
9368 if (RExC_parse == RExC_end) NOOP;
9369 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
9370 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
9371 * using do...while */
9374 RExC_parse_inc_utf8();
9375 } while ( RExC_parse < RExC_end
9376 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9379 RExC_parse_inc_by(1);
9380 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9382 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
9384 vFAIL("Group name must start with a non-digit word character");
9386 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9387 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9388 if ( flags == REG_RSN_RETURN_NAME)
9390 else if (flags==REG_RSN_RETURN_DATA) {
9393 if ( ! sv_name ) /* should not happen*/
9394 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9395 if (RExC_paren_names)
9396 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9398 sv_dat = HeVAL(he_str);
9399 if ( ! sv_dat ) { /* Didn't find group */
9401 /* It might be a forward reference; we can't fail until we
9402 * know, by completing the parse to get all the groups, and
9404 if (ALL_PARENS_COUNTED) {
9405 vFAIL("Reference to nonexistent named group");
9408 REQUIRE_PARENS_PASS;
9414 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9415 (unsigned long) flags);
9418 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9419 if (RExC_lastparse!=RExC_parse) { \
9420 Perl_re_printf( aTHX_ "%s", \
9421 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9422 RExC_end - RExC_parse, 16, \
9424 PERL_PV_ESCAPE_UNI_DETECT | \
9425 PERL_PV_PRETTY_ELLIPSES | \
9426 PERL_PV_PRETTY_LTGT | \
9427 PERL_PV_ESCAPE_RE | \
9428 PERL_PV_PRETTY_EXACTSIZE \
9432 Perl_re_printf( aTHX_ "%16s",""); \
9434 if (RExC_lastnum!=RExC_emit) \
9435 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9437 Perl_re_printf( aTHX_ "|%4s",""); \
9438 Perl_re_printf( aTHX_ "|%*s%-4s", \
9439 (int)((depth*2)), "", \
9442 RExC_lastnum=RExC_emit; \
9443 RExC_lastparse=RExC_parse; \
9448 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9449 DEBUG_PARSE_MSG((funcname)); \
9450 Perl_re_printf( aTHX_ "%4s","\n"); \
9452 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9453 DEBUG_PARSE_MSG((funcname)); \
9454 Perl_re_printf( aTHX_ fmt "\n",args); \
9457 /* This section of code defines the inversion list object and its methods. The
9458 * interfaces are highly subject to change, so as much as possible is static to
9459 * this file. An inversion list is here implemented as a malloc'd C UV array
9460 * as an SVt_INVLIST scalar.
9462 * An inversion list for Unicode is an array of code points, sorted by ordinal
9463 * number. Each element gives the code point that begins a range that extends
9464 * up-to but not including the code point given by the next element. The final
9465 * element gives the first code point of a range that extends to the platform's
9466 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9467 * ...) give ranges whose code points are all in the inversion list. We say
9468 * that those ranges are in the set. The odd-numbered elements give ranges
9469 * whose code points are not in the inversion list, and hence not in the set.
9470 * Thus, element [0] is the first code point in the list. Element [1]
9471 * is the first code point beyond that not in the list; and element [2] is the
9472 * first code point beyond that that is in the list. In other words, the first
9473 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9474 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9475 * all code points in that range are not in the inversion list. The third
9476 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9477 * list, and so forth. Thus every element whose index is divisible by two
9478 * gives the beginning of a range that is in the list, and every element whose
9479 * index is not divisible by two gives the beginning of a range not in the
9480 * list. If the final element's index is divisible by two, the inversion list
9481 * extends to the platform's infinity; otherwise the highest code point in the
9482 * inversion list is the contents of that element minus 1.
9484 * A range that contains just a single code point N will look like
9486 * invlist[i+1] == N+1
9488 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9489 * impossible to represent, so element [i+1] is omitted. The single element
9491 * invlist[0] == UV_MAX
9492 * contains just UV_MAX, but is interpreted as matching to infinity.
9494 * Taking the complement (inverting) an inversion list is quite simple, if the
9495 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9496 * This implementation reserves an element at the beginning of each inversion
9497 * list to always contain 0; there is an additional flag in the header which
9498 * indicates if the list begins at the 0, or is offset to begin at the next
9499 * element. This means that the inversion list can be inverted without any
9500 * copying; just flip the flag.
9502 * More about inversion lists can be found in "Unicode Demystified"
9503 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9505 * The inversion list data structure is currently implemented as an SV pointing
9506 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9507 * array of UV whose memory management is automatically handled by the existing
9508 * facilities for SV's.
9510 * Some of the methods should always be private to the implementation, and some
9511 * should eventually be made public */
9513 /* The header definitions are in F<invlist_inline.h> */
9515 #ifndef PERL_IN_XSUB_RE
9517 PERL_STATIC_INLINE UV*
9518 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9520 /* Returns a pointer to the first element in the inversion list's array.
9521 * This is called upon initialization of an inversion list. Where the
9522 * array begins depends on whether the list has the code point U+0000 in it
9523 * or not. The other parameter tells it whether the code that follows this
9524 * call is about to put a 0 in the inversion list or not. The first
9525 * element is either the element reserved for 0, if TRUE, or the element
9526 * after it, if FALSE */
9528 bool* offset = get_invlist_offset_addr(invlist);
9529 UV* zero_addr = (UV *) SvPVX(invlist);
9531 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9534 assert(! _invlist_len(invlist));
9538 /* 1^1 = 0; 1^0 = 1 */
9539 *offset = 1 ^ will_have_0;
9540 return zero_addr + *offset;
9544 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9546 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9547 * steals the list from 'src', so 'src' is made to have a NULL list. This
9548 * is similar to what SvSetMagicSV() would do, if it were implemented on
9549 * inversion lists, though this routine avoids a copy */
9551 const UV src_len = _invlist_len(src);
9552 const bool src_offset = *get_invlist_offset_addr(src);
9553 const STRLEN src_byte_len = SvLEN(src);
9554 char * array = SvPVX(src);
9556 #ifndef NO_TAINT_SUPPORT
9557 const int oldtainted = TAINT_get;
9560 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9562 assert(is_invlist(src));
9563 assert(is_invlist(dest));
9564 assert(! invlist_is_iterating(src));
9565 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9567 /* Make sure it ends in the right place with a NUL, as our inversion list
9568 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9570 array[src_byte_len - 1] = '\0';
9572 TAINT_NOT; /* Otherwise it breaks */
9573 sv_usepvn_flags(dest,
9577 /* This flag is documented to cause a copy to be avoided */
9578 SV_HAS_TRAILING_NUL);
9579 TAINT_set(oldtainted);
9584 /* Finish up copying over the other fields in an inversion list */
9585 *get_invlist_offset_addr(dest) = src_offset;
9586 invlist_set_len(dest, src_len, src_offset);
9587 *get_invlist_previous_index_addr(dest) = 0;
9588 invlist_iterfinish(dest);
9591 PERL_STATIC_INLINE IV*
9592 S_get_invlist_previous_index_addr(SV* invlist)
9594 /* Return the address of the IV that is reserved to hold the cached index
9596 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9598 assert(is_invlist(invlist));
9600 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9603 PERL_STATIC_INLINE IV
9604 S_invlist_previous_index(SV* const invlist)
9606 /* Returns cached index of previous search */
9608 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9610 return *get_invlist_previous_index_addr(invlist);
9613 PERL_STATIC_INLINE void
9614 S_invlist_set_previous_index(SV* const invlist, const IV index)
9616 /* Caches <index> for later retrieval */
9618 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9620 assert(index == 0 || index < (int) _invlist_len(invlist));
9622 *get_invlist_previous_index_addr(invlist) = index;
9625 PERL_STATIC_INLINE void
9626 S_invlist_trim(SV* invlist)
9628 /* Free the not currently-being-used space in an inversion list */
9630 /* But don't free up the space needed for the 0 UV that is always at the
9631 * beginning of the list, nor the trailing NUL */
9632 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9634 PERL_ARGS_ASSERT_INVLIST_TRIM;
9636 assert(is_invlist(invlist));
9638 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9641 PERL_STATIC_INLINE void
9642 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9644 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9646 assert(is_invlist(invlist));
9648 invlist_set_len(invlist, 0, 0);
9649 invlist_trim(invlist);
9652 #endif /* ifndef PERL_IN_XSUB_RE */
9654 PERL_STATIC_INLINE bool
9655 S_invlist_is_iterating(const SV* const invlist)
9657 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9659 /* get_invlist_iter_addr()'s sv is non-const only because it returns a
9660 * value that can be used to modify the invlist, it doesn't modify the
9662 return *(get_invlist_iter_addr((SV*)invlist)) < (STRLEN) UV_MAX;
9665 #ifndef PERL_IN_XSUB_RE
9667 PERL_STATIC_INLINE UV
9668 S_invlist_max(const SV* const invlist)
9670 /* Returns the maximum number of elements storable in the inversion list's
9671 * array, without having to realloc() */
9673 PERL_ARGS_ASSERT_INVLIST_MAX;
9675 assert(is_invlist(invlist));
9677 /* Assumes worst case, in which the 0 element is not counted in the
9678 * inversion list, so subtracts 1 for that */
9679 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9680 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9681 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9685 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9687 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9689 /* First 1 is in case the zero element isn't in the list; second 1 is for
9691 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9692 invlist_set_len(invlist, 0, 0);
9694 /* Force iterinit() to be used to get iteration to work */
9695 invlist_iterfinish(invlist);
9697 *get_invlist_previous_index_addr(invlist) = 0;
9698 SvPOK_on(invlist); /* This allows B to extract the PV */
9702 Perl__new_invlist(pTHX_ IV initial_size)
9705 /* Return a pointer to a newly constructed inversion list, with enough
9706 * space to store 'initial_size' elements. If that number is negative, a
9707 * system default is used instead */
9711 if (initial_size < 0) {
9715 new_list = newSV_type(SVt_INVLIST);
9716 initialize_invlist_guts(new_list, initial_size);
9722 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9724 /* Return a pointer to a newly constructed inversion list, initialized to
9725 * point to <list>, which has to be in the exact correct inversion list
9726 * form, including internal fields. Thus this is a dangerous routine that
9727 * should not be used in the wrong hands. The passed in 'list' contains
9728 * several header fields at the beginning that are not part of the
9729 * inversion list body proper */
9731 const STRLEN length = (STRLEN) list[0];
9732 const UV version_id = list[1];
9733 const bool offset = cBOOL(list[2]);
9734 #define HEADER_LENGTH 3
9735 /* If any of the above changes in any way, you must change HEADER_LENGTH
9736 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9737 * perl -E 'say int(rand 2**31-1)'
9739 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9740 data structure type, so that one being
9741 passed in can be validated to be an
9742 inversion list of the correct vintage.
9745 SV* invlist = newSV_type(SVt_INVLIST);
9747 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9749 if (version_id != INVLIST_VERSION_ID) {
9750 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9753 /* The generated array passed in includes header elements that aren't part
9754 * of the list proper, so start it just after them */
9755 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9757 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9758 shouldn't touch it */
9760 *(get_invlist_offset_addr(invlist)) = offset;
9762 /* The 'length' passed to us is the physical number of elements in the
9763 * inversion list. But if there is an offset the logical number is one
9765 invlist_set_len(invlist, length - offset, offset);
9767 invlist_set_previous_index(invlist, 0);
9769 /* Initialize the iteration pointer. */
9770 invlist_iterfinish(invlist);
9772 SvREADONLY_on(invlist);
9779 S__append_range_to_invlist(pTHX_ SV* const invlist,
9780 const UV start, const UV end)
9782 /* Subject to change or removal. Append the range from 'start' to 'end' at
9783 * the end of the inversion list. The range must be above any existing
9787 UV max = invlist_max(invlist);
9788 UV len = _invlist_len(invlist);
9791 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9793 if (len == 0) { /* Empty lists must be initialized */
9794 offset = start != 0;
9795 array = _invlist_array_init(invlist, ! offset);
9798 /* Here, the existing list is non-empty. The current max entry in the
9799 * list is generally the first value not in the set, except when the
9800 * set extends to the end of permissible values, in which case it is
9801 * the first entry in that final set, and so this call is an attempt to
9802 * append out-of-order */
9804 UV final_element = len - 1;
9805 array = invlist_array(invlist);
9806 if ( array[final_element] > start
9807 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9809 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9810 array[final_element], start,
9811 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9814 /* Here, it is a legal append. If the new range begins 1 above the end
9815 * of the range below it, it is extending the range below it, so the
9816 * new first value not in the set is one greater than the newly
9817 * extended range. */
9818 offset = *get_invlist_offset_addr(invlist);
9819 if (array[final_element] == start) {
9820 if (end != UV_MAX) {
9821 array[final_element] = end + 1;
9824 /* But if the end is the maximum representable on the machine,
9825 * assume that infinity was actually what was meant. Just let
9826 * the range that this would extend to have no end */
9827 invlist_set_len(invlist, len - 1, offset);
9833 /* Here the new range doesn't extend any existing set. Add it */
9835 len += 2; /* Includes an element each for the start and end of range */
9837 /* If wll overflow the existing space, extend, which may cause the array to
9840 invlist_extend(invlist, len);
9842 /* Have to set len here to avoid assert failure in invlist_array() */
9843 invlist_set_len(invlist, len, offset);
9845 array = invlist_array(invlist);
9848 invlist_set_len(invlist, len, offset);
9851 /* The next item on the list starts the range, the one after that is
9852 * one past the new range. */
9853 array[len - 2] = start;
9854 if (end != UV_MAX) {
9855 array[len - 1] = end + 1;
9858 /* But if the end is the maximum representable on the machine, just let
9859 * the range have no end */
9860 invlist_set_len(invlist, len - 1, offset);
9865 Perl__invlist_search(SV* const invlist, const UV cp)
9867 /* Searches the inversion list for the entry that contains the input code
9868 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9869 * return value is the index into the list's array of the range that
9870 * contains <cp>, that is, 'i' such that
9871 * array[i] <= cp < array[i+1]
9876 IV high = _invlist_len(invlist);
9877 const IV highest_element = high - 1;
9880 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9882 /* If list is empty, return failure. */
9883 if (UNLIKELY(high == 0)) {
9887 /* (We can't get the array unless we know the list is non-empty) */
9888 array = invlist_array(invlist);
9890 mid = invlist_previous_index(invlist);
9892 if (UNLIKELY(mid > highest_element)) {
9893 mid = highest_element;
9896 /* <mid> contains the cache of the result of the previous call to this
9897 * function (0 the first time). See if this call is for the same result,
9898 * or if it is for mid-1. This is under the theory that calls to this
9899 * function will often be for related code points that are near each other.
9900 * And benchmarks show that caching gives better results. We also test
9901 * here if the code point is within the bounds of the list. These tests
9902 * replace others that would have had to be made anyway to make sure that
9903 * the array bounds were not exceeded, and these give us extra information
9904 * at the same time */
9905 if (cp >= array[mid]) {
9906 if (cp >= array[highest_element]) {
9907 return highest_element;
9910 /* Here, array[mid] <= cp < array[highest_element]. This means that
9911 * the final element is not the answer, so can exclude it; it also
9912 * means that <mid> is not the final element, so can refer to 'mid + 1'
9914 if (cp < array[mid + 1]) {
9920 else { /* cp < aray[mid] */
9921 if (cp < array[0]) { /* Fail if outside the array */
9925 if (cp >= array[mid - 1]) {
9930 /* Binary search. What we are looking for is <i> such that
9931 * array[i] <= cp < array[i+1]
9932 * The loop below converges on the i+1. Note that there may not be an
9933 * (i+1)th element in the array, and things work nonetheless */
9934 while (low < high) {
9935 mid = (low + high) / 2;
9936 assert(mid <= highest_element);
9937 if (array[mid] <= cp) { /* cp >= array[mid] */
9940 /* We could do this extra test to exit the loop early.
9941 if (cp < array[low]) {
9946 else { /* cp < array[mid] */
9953 invlist_set_previous_index(invlist, high);
9958 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9959 const bool complement_b, SV** output)
9961 /* Take the union of two inversion lists and point '*output' to it. On
9962 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9963 * even 'a' or 'b'). If to an inversion list, the contents of the original
9964 * list will be replaced by the union. The first list, 'a', may be
9965 * NULL, in which case a copy of the second list is placed in '*output'.
9966 * If 'complement_b' is TRUE, the union is taken of the complement
9967 * (inversion) of 'b' instead of b itself.
9969 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9970 * Richard Gillam, published by Addison-Wesley, and explained at some
9971 * length there. The preface says to incorporate its examples into your
9972 * code at your own risk.
9974 * The algorithm is like a merge sort. */
9976 const UV* array_a; /* a's array */
9978 UV len_a; /* length of a's array */
9981 SV* u; /* the resulting union */
9985 UV i_a = 0; /* current index into a's array */
9989 /* running count, as explained in the algorithm source book; items are
9990 * stopped accumulating and are output when the count changes to/from 0.
9991 * The count is incremented when we start a range that's in an input's set,
9992 * and decremented when we start a range that's not in a set. So this
9993 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9994 * and hence nothing goes into the union; 1, just one of the inputs is in
9995 * its set (and its current range gets added to the union); and 2 when both
9996 * inputs are in their sets. */
9999 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
10001 assert(*output == NULL || is_invlist(*output));
10003 len_b = _invlist_len(b);
10006 /* Here, 'b' is empty, hence it's complement is all possible code
10007 * points. So if the union includes the complement of 'b', it includes
10008 * everything, and we need not even look at 'a'. It's easiest to
10009 * create a new inversion list that matches everything. */
10010 if (complement_b) {
10011 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
10013 if (*output == NULL) { /* If the output didn't exist, just point it
10015 *output = everything;
10017 else { /* Otherwise, replace its contents with the new list */
10018 invlist_replace_list_destroys_src(*output, everything);
10019 SvREFCNT_dec_NN(everything);
10025 /* Here, we don't want the complement of 'b', and since 'b' is empty,
10026 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
10027 * output will be empty */
10029 if (a == NULL || _invlist_len(a) == 0) {
10030 if (*output == NULL) {
10031 *output = _new_invlist(0);
10034 invlist_clear(*output);
10039 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
10040 * union. We can just return a copy of 'a' if '*output' doesn't point
10041 * to an existing list */
10042 if (*output == NULL) {
10043 *output = invlist_clone(a, NULL);
10047 /* If the output is to overwrite 'a', we have a no-op, as it's
10048 * already in 'a' */
10049 if (*output == a) {
10053 /* Here, '*output' is to be overwritten by 'a' */
10054 u = invlist_clone(a, NULL);
10055 invlist_replace_list_destroys_src(*output, u);
10056 SvREFCNT_dec_NN(u);
10061 /* Here 'b' is not empty. See about 'a' */
10063 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
10065 /* Here, 'a' is empty (and b is not). That means the union will come
10066 * entirely from 'b'. If '*output' is NULL, we can directly return a
10067 * clone of 'b'. Otherwise, we replace the contents of '*output' with
10070 SV ** dest = (*output == NULL) ? output : &u;
10071 *dest = invlist_clone(b, NULL);
10072 if (complement_b) {
10073 _invlist_invert(*dest);
10077 invlist_replace_list_destroys_src(*output, u);
10078 SvREFCNT_dec_NN(u);
10084 /* Here both lists exist and are non-empty */
10085 array_a = invlist_array(a);
10086 array_b = invlist_array(b);
10088 /* If are to take the union of 'a' with the complement of b, set it
10089 * up so are looking at b's complement. */
10090 if (complement_b) {
10092 /* To complement, we invert: if the first element is 0, remove it. To
10093 * do this, we just pretend the array starts one later */
10094 if (array_b[0] == 0) {
10100 /* But if the first element is not zero, we pretend the list starts
10101 * at the 0 that is always stored immediately before the array. */
10107 /* Size the union for the worst case: that the sets are completely
10109 u = _new_invlist(len_a + len_b);
10111 /* Will contain U+0000 if either component does */
10112 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
10113 || (len_b > 0 && array_b[0] == 0));
10115 /* Go through each input list item by item, stopping when have exhausted
10117 while (i_a < len_a && i_b < len_b) {
10118 UV cp; /* The element to potentially add to the union's array */
10119 bool cp_in_set; /* is it in the input list's set or not */
10121 /* We need to take one or the other of the two inputs for the union.
10122 * Since we are merging two sorted lists, we take the smaller of the
10123 * next items. In case of a tie, we take first the one that is in its
10124 * set. If we first took the one not in its set, it would decrement
10125 * the count, possibly to 0 which would cause it to be output as ending
10126 * the range, and the next time through we would take the same number,
10127 * and output it again as beginning the next range. By doing it the
10128 * opposite way, there is no possibility that the count will be
10129 * momentarily decremented to 0, and thus the two adjoining ranges will
10130 * be seamlessly merged. (In a tie and both are in the set or both not
10131 * in the set, it doesn't matter which we take first.) */
10132 if ( array_a[i_a] < array_b[i_b]
10133 || ( array_a[i_a] == array_b[i_b]
10134 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
10136 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
10137 cp = array_a[i_a++];
10140 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10141 cp = array_b[i_b++];
10144 /* Here, have chosen which of the two inputs to look at. Only output
10145 * if the running count changes to/from 0, which marks the
10146 * beginning/end of a range that's in the set */
10149 array_u[i_u++] = cp;
10156 array_u[i_u++] = cp;
10162 /* The loop above increments the index into exactly one of the input lists
10163 * each iteration, and ends when either index gets to its list end. That
10164 * means the other index is lower than its end, and so something is
10165 * remaining in that one. We decrement 'count', as explained below, if
10166 * that list is in its set. (i_a and i_b each currently index the element
10167 * beyond the one we care about.) */
10168 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10169 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10174 /* Above we decremented 'count' if the list that had unexamined elements in
10175 * it was in its set. This has made it so that 'count' being non-zero
10176 * means there isn't anything left to output; and 'count' equal to 0 means
10177 * that what is left to output is precisely that which is left in the
10178 * non-exhausted input list.
10180 * To see why, note first that the exhausted input obviously has nothing
10181 * left to add to the union. If it was in its set at its end, that means
10182 * the set extends from here to the platform's infinity, and hence so does
10183 * the union and the non-exhausted set is irrelevant. The exhausted set
10184 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
10185 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
10186 * 'count' remains at 1. This is consistent with the decremented 'count'
10187 * != 0 meaning there's nothing left to add to the union.
10189 * But if the exhausted input wasn't in its set, it contributed 0 to
10190 * 'count', and the rest of the union will be whatever the other input is.
10191 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
10192 * otherwise it gets decremented to 0. This is consistent with 'count'
10193 * == 0 meaning the remainder of the union is whatever is left in the
10194 * non-exhausted list. */
10199 IV copy_count = len_a - i_a;
10200 if (copy_count > 0) { /* The non-exhausted input is 'a' */
10201 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
10203 else { /* The non-exhausted input is b */
10204 copy_count = len_b - i_b;
10205 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
10207 len_u = i_u + copy_count;
10210 /* Set the result to the final length, which can change the pointer to
10211 * array_u, so re-find it. (Note that it is unlikely that this will
10212 * change, as we are shrinking the space, not enlarging it) */
10213 if (len_u != _invlist_len(u)) {
10214 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
10216 array_u = invlist_array(u);
10219 if (*output == NULL) { /* Simply return the new inversion list */
10223 /* Otherwise, overwrite the inversion list that was in '*output'. We
10224 * could instead free '*output', and then set it to 'u', but experience
10225 * has shown [perl #127392] that if the input is a mortal, we can get a
10226 * huge build-up of these during regex compilation before they get
10228 invlist_replace_list_destroys_src(*output, u);
10229 SvREFCNT_dec_NN(u);
10236 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
10237 const bool complement_b, SV** i)
10239 /* Take the intersection of two inversion lists and point '*i' to it. On
10240 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
10241 * even 'a' or 'b'). If to an inversion list, the contents of the original
10242 * list will be replaced by the intersection. The first list, 'a', may be
10243 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
10244 * TRUE, the result will be the intersection of 'a' and the complement (or
10245 * inversion) of 'b' instead of 'b' directly.
10247 * The basis for this comes from "Unicode Demystified" Chapter 13 by
10248 * Richard Gillam, published by Addison-Wesley, and explained at some
10249 * length there. The preface says to incorporate its examples into your
10250 * code at your own risk. In fact, it had bugs
10252 * The algorithm is like a merge sort, and is essentially the same as the
10256 const UV* array_a; /* a's array */
10258 UV len_a; /* length of a's array */
10261 SV* r; /* the resulting intersection */
10265 UV i_a = 0; /* current index into a's array */
10269 /* running count of how many of the two inputs are postitioned at ranges
10270 * that are in their sets. As explained in the algorithm source book,
10271 * items are stopped accumulating and are output when the count changes
10272 * to/from 2. The count is incremented when we start a range that's in an
10273 * input's set, and decremented when we start a range that's not in a set.
10274 * Only when it is 2 are we in the intersection. */
10277 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
10279 assert(*i == NULL || is_invlist(*i));
10281 /* Special case if either one is empty */
10282 len_a = (a == NULL) ? 0 : _invlist_len(a);
10283 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
10284 if (len_a != 0 && complement_b) {
10286 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
10287 * must be empty. Here, also we are using 'b's complement, which
10288 * hence must be every possible code point. Thus the intersection
10289 * is simply 'a'. */
10291 if (*i == a) { /* No-op */
10296 *i = invlist_clone(a, NULL);
10300 r = invlist_clone(a, NULL);
10301 invlist_replace_list_destroys_src(*i, r);
10302 SvREFCNT_dec_NN(r);
10306 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
10307 * intersection must be empty */
10309 *i = _new_invlist(0);
10317 /* Here both lists exist and are non-empty */
10318 array_a = invlist_array(a);
10319 array_b = invlist_array(b);
10321 /* If are to take the intersection of 'a' with the complement of b, set it
10322 * up so are looking at b's complement. */
10323 if (complement_b) {
10325 /* To complement, we invert: if the first element is 0, remove it. To
10326 * do this, we just pretend the array starts one later */
10327 if (array_b[0] == 0) {
10333 /* But if the first element is not zero, we pretend the list starts
10334 * at the 0 that is always stored immediately before the array. */
10340 /* Size the intersection for the worst case: that the intersection ends up
10341 * fragmenting everything to be completely disjoint */
10342 r= _new_invlist(len_a + len_b);
10344 /* Will contain U+0000 iff both components do */
10345 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
10346 && len_b > 0 && array_b[0] == 0);
10348 /* Go through each list item by item, stopping when have exhausted one of
10350 while (i_a < len_a && i_b < len_b) {
10351 UV cp; /* The element to potentially add to the intersection's
10353 bool cp_in_set; /* Is it in the input list's set or not */
10355 /* We need to take one or the other of the two inputs for the
10356 * intersection. Since we are merging two sorted lists, we take the
10357 * smaller of the next items. In case of a tie, we take first the one
10358 * that is not in its set (a difference from the union algorithm). If
10359 * we first took the one in its set, it would increment the count,
10360 * possibly to 2 which would cause it to be output as starting a range
10361 * in the intersection, and the next time through we would take that
10362 * same number, and output it again as ending the set. By doing the
10363 * opposite of this, there is no possibility that the count will be
10364 * momentarily incremented to 2. (In a tie and both are in the set or
10365 * both not in the set, it doesn't matter which we take first.) */
10366 if ( array_a[i_a] < array_b[i_b]
10367 || ( array_a[i_a] == array_b[i_b]
10368 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
10370 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
10371 cp = array_a[i_a++];
10374 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10375 cp= array_b[i_b++];
10378 /* Here, have chosen which of the two inputs to look at. Only output
10379 * if the running count changes to/from 2, which marks the
10380 * beginning/end of a range that's in the intersection */
10384 array_r[i_r++] = cp;
10389 array_r[i_r++] = cp;
10396 /* The loop above increments the index into exactly one of the input lists
10397 * each iteration, and ends when either index gets to its list end. That
10398 * means the other index is lower than its end, and so something is
10399 * remaining in that one. We increment 'count', as explained below, if the
10400 * exhausted list was in its set. (i_a and i_b each currently index the
10401 * element beyond the one we care about.) */
10402 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10403 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10408 /* Above we incremented 'count' if the exhausted list was in its set. This
10409 * has made it so that 'count' being below 2 means there is nothing left to
10410 * output; otheriwse what's left to add to the intersection is precisely
10411 * that which is left in the non-exhausted input list.
10413 * To see why, note first that the exhausted input obviously has nothing
10414 * left to affect the intersection. If it was in its set at its end, that
10415 * means the set extends from here to the platform's infinity, and hence
10416 * anything in the non-exhausted's list will be in the intersection, and
10417 * anything not in it won't be. Hence, the rest of the intersection is
10418 * precisely what's in the non-exhausted list The exhausted set also
10419 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10420 * it means 'count' is now at least 2. This is consistent with the
10421 * incremented 'count' being >= 2 means to add the non-exhausted list to
10422 * the intersection.
10424 * But if the exhausted input wasn't in its set, it contributed 0 to
10425 * 'count', and the intersection can't include anything further; the
10426 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10427 * incremented. This is consistent with 'count' being < 2 meaning nothing
10428 * further to add to the intersection. */
10429 if (count < 2) { /* Nothing left to put in the intersection. */
10432 else { /* copy the non-exhausted list, unchanged. */
10433 IV copy_count = len_a - i_a;
10434 if (copy_count > 0) { /* a is the one with stuff left */
10435 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10437 else { /* b is the one with stuff left */
10438 copy_count = len_b - i_b;
10439 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10441 len_r = i_r + copy_count;
10444 /* Set the result to the final length, which can change the pointer to
10445 * array_r, so re-find it. (Note that it is unlikely that this will
10446 * change, as we are shrinking the space, not enlarging it) */
10447 if (len_r != _invlist_len(r)) {
10448 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10450 array_r = invlist_array(r);
10453 if (*i == NULL) { /* Simply return the calculated intersection */
10456 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10457 instead free '*i', and then set it to 'r', but experience has
10458 shown [perl #127392] that if the input is a mortal, we can get a
10459 huge build-up of these during regex compilation before they get
10462 invlist_replace_list_destroys_src(*i, r);
10467 SvREFCNT_dec_NN(r);
10474 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10476 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10477 * set. A pointer to the inversion list is returned. This may actually be
10478 * a new list, in which case the passed in one has been destroyed. The
10479 * passed-in inversion list can be NULL, in which case a new one is created
10480 * with just the one range in it. The new list is not necessarily
10481 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10482 * result of this function. The gain would not be large, and in many
10483 * cases, this is called multiple times on a single inversion list, so
10484 * anything freed may almost immediately be needed again.
10486 * This used to mostly call the 'union' routine, but that is much more
10487 * heavyweight than really needed for a single range addition */
10489 UV* array; /* The array implementing the inversion list */
10490 UV len; /* How many elements in 'array' */
10491 SSize_t i_s; /* index into the invlist array where 'start'
10493 SSize_t i_e = 0; /* And the index where 'end' should go */
10494 UV cur_highest; /* The highest code point in the inversion list
10495 upon entry to this function */
10497 /* This range becomes the whole inversion list if none already existed */
10498 if (invlist == NULL) {
10499 invlist = _new_invlist(2);
10500 _append_range_to_invlist(invlist, start, end);
10504 /* Likewise, if the inversion list is currently empty */
10505 len = _invlist_len(invlist);
10507 _append_range_to_invlist(invlist, start, end);
10511 /* Starting here, we have to know the internals of the list */
10512 array = invlist_array(invlist);
10514 /* If the new range ends higher than the current highest ... */
10515 cur_highest = invlist_highest(invlist);
10516 if (end > cur_highest) {
10518 /* If the whole range is higher, we can just append it */
10519 if (start > cur_highest) {
10520 _append_range_to_invlist(invlist, start, end);
10524 /* Otherwise, add the portion that is higher ... */
10525 _append_range_to_invlist(invlist, cur_highest + 1, end);
10527 /* ... and continue on below to handle the rest. As a result of the
10528 * above append, we know that the index of the end of the range is the
10529 * final even numbered one of the array. Recall that the final element
10530 * always starts a range that extends to infinity. If that range is in
10531 * the set (meaning the set goes from here to infinity), it will be an
10532 * even index, but if it isn't in the set, it's odd, and the final
10533 * range in the set is one less, which is even. */
10534 if (end == UV_MAX) {
10542 /* We have dealt with appending, now see about prepending. If the new
10543 * range starts lower than the current lowest ... */
10544 if (start < array[0]) {
10546 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10547 * Let the union code handle it, rather than having to know the
10548 * trickiness in two code places. */
10549 if (UNLIKELY(start == 0)) {
10552 range_invlist = _new_invlist(2);
10553 _append_range_to_invlist(range_invlist, start, end);
10555 _invlist_union(invlist, range_invlist, &invlist);
10557 SvREFCNT_dec_NN(range_invlist);
10562 /* If the whole new range comes before the first entry, and doesn't
10563 * extend it, we have to insert it as an additional range */
10564 if (end < array[0] - 1) {
10566 goto splice_in_new_range;
10569 /* Here the new range adjoins the existing first range, extending it
10573 /* And continue on below to handle the rest. We know that the index of
10574 * the beginning of the range is the first one of the array */
10577 else { /* Not prepending any part of the new range to the existing list.
10578 * Find where in the list it should go. This finds i_s, such that:
10579 * invlist[i_s] <= start < array[i_s+1]
10581 i_s = _invlist_search(invlist, start);
10584 /* At this point, any extending before the beginning of the inversion list
10585 * and/or after the end has been done. This has made it so that, in the
10586 * code below, each endpoint of the new range is either in a range that is
10587 * in the set, or is in a gap between two ranges that are. This means we
10588 * don't have to worry about exceeding the array bounds.
10590 * Find where in the list the new range ends (but we can skip this if we
10591 * have already determined what it is, or if it will be the same as i_s,
10592 * which we already have computed) */
10594 i_e = (start == end)
10596 : _invlist_search(invlist, end);
10599 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10600 * is a range that goes to infinity there is no element at invlist[i_e+1],
10601 * so only the first relation holds. */
10603 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10605 /* Here, the ranges on either side of the beginning of the new range
10606 * are in the set, and this range starts in the gap between them.
10608 * The new range extends the range above it downwards if the new range
10609 * ends at or above that range's start */
10610 const bool extends_the_range_above = ( end == UV_MAX
10611 || end + 1 >= array[i_s+1]);
10613 /* The new range extends the range below it upwards if it begins just
10614 * after where that range ends */
10615 if (start == array[i_s]) {
10617 /* If the new range fills the entire gap between the other ranges,
10618 * they will get merged together. Other ranges may also get
10619 * merged, depending on how many of them the new range spans. In
10620 * the general case, we do the merge later, just once, after we
10621 * figure out how many to merge. But in the case where the new
10622 * range exactly spans just this one gap (possibly extending into
10623 * the one above), we do the merge here, and an early exit. This
10624 * is done here to avoid having to special case later. */
10625 if (i_e - i_s <= 1) {
10627 /* If i_e - i_s == 1, it means that the new range terminates
10628 * within the range above, and hence 'extends_the_range_above'
10629 * must be true. (If the range above it extends to infinity,
10630 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10631 * will be 0, so no harm done.) */
10632 if (extends_the_range_above) {
10633 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10634 invlist_set_len(invlist,
10636 *(get_invlist_offset_addr(invlist)));
10640 /* Here, i_e must == i_s. We keep them in sync, as they apply
10641 * to the same range, and below we are about to decrement i_s
10646 /* Here, the new range is adjacent to the one below. (It may also
10647 * span beyond the range above, but that will get resolved later.)
10648 * Extend the range below to include this one. */
10649 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10651 start = array[i_s];
10653 else if (extends_the_range_above) {
10655 /* Here the new range only extends the range above it, but not the
10656 * one below. It merges with the one above. Again, we keep i_e
10657 * and i_s in sync if they point to the same range */
10662 array[i_s] = start;
10666 /* Here, we've dealt with the new range start extending any adjoining
10669 * If the new range extends to infinity, it is now the final one,
10670 * regardless of what was there before */
10671 if (UNLIKELY(end == UV_MAX)) {
10672 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10676 /* If i_e started as == i_s, it has also been dealt with,
10677 * and been updated to the new i_s, which will fail the following if */
10678 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10680 /* Here, the ranges on either side of the end of the new range are in
10681 * the set, and this range ends in the gap between them.
10683 * If this range is adjacent to (hence extends) the range above it, it
10684 * becomes part of that range; likewise if it extends the range below,
10685 * it becomes part of that range */
10686 if (end + 1 == array[i_e+1]) {
10688 array[i_e] = start;
10690 else if (start <= array[i_e]) {
10691 array[i_e] = end + 1;
10698 /* If the range fits entirely in an existing range (as possibly already
10699 * extended above), it doesn't add anything new */
10700 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10704 /* Here, no part of the range is in the list. Must add it. It will
10705 * occupy 2 more slots */
10706 splice_in_new_range:
10708 invlist_extend(invlist, len + 2);
10709 array = invlist_array(invlist);
10710 /* Move the rest of the array down two slots. Don't include any
10712 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10714 /* Do the actual splice */
10715 array[i_e+1] = start;
10716 array[i_e+2] = end + 1;
10717 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10721 /* Here the new range crossed the boundaries of a pre-existing range. The
10722 * code above has adjusted things so that both ends are in ranges that are
10723 * in the set. This means everything in between must also be in the set.
10724 * Just squash things together */
10725 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10726 invlist_set_len(invlist,
10728 *(get_invlist_offset_addr(invlist)));
10734 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10735 UV** other_elements_ptr)
10737 /* Create and return an inversion list whose contents are to be populated
10738 * by the caller. The caller gives the number of elements (in 'size') and
10739 * the very first element ('element0'). This function will set
10740 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10741 * are to be placed.
10743 * Obviously there is some trust involved that the caller will properly
10744 * fill in the other elements of the array.
10746 * (The first element needs to be passed in, as the underlying code does
10747 * things differently depending on whether it is zero or non-zero) */
10749 SV* invlist = _new_invlist(size);
10752 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10754 invlist = add_cp_to_invlist(invlist, element0);
10755 offset = *get_invlist_offset_addr(invlist);
10757 invlist_set_len(invlist, size, offset);
10758 *other_elements_ptr = invlist_array(invlist) + 1;
10764 #ifndef PERL_IN_XSUB_RE
10766 Perl__invlist_invert(pTHX_ SV* const invlist)
10768 /* Complement the input inversion list. This adds a 0 if the list didn't
10769 * have a zero; removes it otherwise. As described above, the data
10770 * structure is set up so that this is very efficient */
10772 PERL_ARGS_ASSERT__INVLIST_INVERT;
10774 assert(! invlist_is_iterating(invlist));
10776 /* The inverse of matching nothing is matching everything */
10777 if (_invlist_len(invlist) == 0) {
10778 _append_range_to_invlist(invlist, 0, UV_MAX);
10782 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10786 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10788 /* Return a new inversion list that is a copy of the input one, which is
10789 * unchanged. The new list will not be mortal even if the old one was. */
10791 const STRLEN nominal_length = _invlist_len(invlist);
10792 const STRLEN physical_length = SvCUR(invlist);
10793 const bool offset = *(get_invlist_offset_addr(invlist));
10795 PERL_ARGS_ASSERT_INVLIST_CLONE;
10797 if (new_invlist == NULL) {
10798 new_invlist = _new_invlist(nominal_length);
10801 sv_upgrade(new_invlist, SVt_INVLIST);
10802 initialize_invlist_guts(new_invlist, nominal_length);
10805 *(get_invlist_offset_addr(new_invlist)) = offset;
10806 invlist_set_len(new_invlist, nominal_length, offset);
10807 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10809 return new_invlist;
10814 PERL_STATIC_INLINE UV
10815 S_invlist_lowest(SV* const invlist)
10817 /* Returns the lowest code point that matches an inversion list. This API
10818 * has an ambiguity, as it returns 0 under either the lowest is actually
10819 * 0, or if the list is empty. If this distinction matters to you, check
10820 * for emptiness before calling this function */
10822 UV len = _invlist_len(invlist);
10825 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10831 array = invlist_array(invlist);
10837 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10839 /* Get the contents of an inversion list into a string SV so that they can
10840 * be printed out. If 'traditional_style' is TRUE, it uses the format
10841 * traditionally done for debug tracing; otherwise it uses a format
10842 * suitable for just copying to the output, with blanks between ranges and
10843 * a dash between range components */
10847 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10848 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10850 if (traditional_style) {
10851 output = newSVpvs("\n");
10854 output = newSVpvs("");
10857 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10859 assert(! invlist_is_iterating(invlist));
10861 invlist_iterinit(invlist);
10862 while (invlist_iternext(invlist, &start, &end)) {
10863 if (end == UV_MAX) {
10864 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10865 start, intra_range_delimiter,
10866 inter_range_delimiter);
10868 else if (end != start) {
10869 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10871 intra_range_delimiter,
10872 end, inter_range_delimiter);
10875 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10876 start, inter_range_delimiter);
10880 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10881 SvCUR_set(output, SvCUR(output) - 1);
10887 #ifndef PERL_IN_XSUB_RE
10889 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10890 const char * const indent, SV* const invlist)
10892 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10893 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10894 * the string 'indent'. The output looks like this:
10895 [0] 0x000A .. 0x000D
10897 [4] 0x2028 .. 0x2029
10898 [6] 0x3104 .. INFTY
10899 * This means that the first range of code points matched by the list are
10900 * 0xA through 0xD; the second range contains only the single code point
10901 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10902 * are used to define each range (except if the final range extends to
10903 * infinity, only a single element is needed). The array index of the
10904 * first element for the corresponding range is given in brackets. */
10909 PERL_ARGS_ASSERT__INVLIST_DUMP;
10911 if (invlist_is_iterating(invlist)) {
10912 Perl_dump_indent(aTHX_ level, file,
10913 "%sCan't dump inversion list because is in middle of iterating\n",
10918 invlist_iterinit(invlist);
10919 while (invlist_iternext(invlist, &start, &end)) {
10920 if (end == UV_MAX) {
10921 Perl_dump_indent(aTHX_ level, file,
10922 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10923 indent, (UV)count, start);
10925 else if (end != start) {
10926 Perl_dump_indent(aTHX_ level, file,
10927 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10928 indent, (UV)count, start, end);
10931 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10932 indent, (UV)count, start);
10940 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10942 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10944 /* Return a boolean as to if the two passed in inversion lists are
10945 * identical. The final argument, if TRUE, says to take the complement of
10946 * the second inversion list before doing the comparison */
10948 const UV len_a = _invlist_len(a);
10949 UV len_b = _invlist_len(b);
10951 const UV* array_a = NULL;
10952 const UV* array_b = NULL;
10954 PERL_ARGS_ASSERT__INVLISTEQ;
10956 /* This code avoids accessing the arrays unless it knows the length is
10961 return ! complement_b;
10965 array_a = invlist_array(a);
10969 array_b = invlist_array(b);
10972 /* If are to compare 'a' with the complement of b, set it
10973 * up so are looking at b's complement. */
10974 if (complement_b) {
10976 /* The complement of nothing is everything, so <a> would have to have
10977 * just one element, starting at zero (ending at infinity) */
10979 return (len_a == 1 && array_a[0] == 0);
10981 if (array_b[0] == 0) {
10983 /* Otherwise, to complement, we invert. Here, the first element is
10984 * 0, just remove it. To do this, we just pretend the array starts
10992 /* But if the first element is not zero, we pretend the list starts
10993 * at the 0 that is always stored immediately before the array. */
10999 return len_a == len_b
11000 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
11006 * As best we can, determine the characters that can match the start of
11007 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
11008 * can be false positive matches
11010 * Returns the invlist as a new SV*; it is the caller's responsibility to
11011 * call SvREFCNT_dec() when done with it.
11014 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
11016 const U8 * s = (U8*)STRING(node);
11017 SSize_t bytelen = STR_LEN(node);
11019 /* Start out big enough for 2 separate code points */
11020 SV* invlist = _new_invlist(4);
11022 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
11027 /* We punt and assume can match anything if the node begins
11028 * with a multi-character fold. Things are complicated. For
11029 * example, /ffi/i could match any of:
11030 * "\N{LATIN SMALL LIGATURE FFI}"
11031 * "\N{LATIN SMALL LIGATURE FF}I"
11032 * "F\N{LATIN SMALL LIGATURE FI}"
11033 * plus several other things; and making sure we have all the
11034 * possibilities is hard. */
11035 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
11036 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
11039 /* Any Latin1 range character can potentially match any
11040 * other depending on the locale, and in Turkic locales, 'I' and
11041 * 'i' can match U+130 and U+131 */
11042 if (OP(node) == EXACTFL) {
11043 _invlist_union(invlist, PL_Latin1, &invlist);
11044 if (isALPHA_FOLD_EQ(uc, 'I')) {
11045 invlist = add_cp_to_invlist(invlist,
11046 LATIN_SMALL_LETTER_DOTLESS_I);
11047 invlist = add_cp_to_invlist(invlist,
11048 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
11052 /* But otherwise, it matches at least itself. We can
11053 * quickly tell if it has a distinct fold, and if so,
11054 * it matches that as well */
11055 invlist = add_cp_to_invlist(invlist, uc);
11056 if (IS_IN_SOME_FOLD_L1(uc))
11057 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
11060 /* Some characters match above-Latin1 ones under /i. This
11061 * is true of EXACTFL ones when the locale is UTF-8 */
11062 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
11063 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
11064 EXACTFAA_NO_TRIE)))
11066 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
11070 else { /* Pattern is UTF-8 */
11071 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
11072 const U8* e = s + bytelen;
11075 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
11077 /* The only code points that aren't folded in a UTF EXACTFish
11078 * node are the problematic ones in EXACTFL nodes */
11079 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
11080 /* We need to check for the possibility that this EXACTFL
11081 * node begins with a multi-char fold. Therefore we fold
11082 * the first few characters of it so that we can make that
11088 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
11090 *(d++) = (U8) toFOLD(*s);
11091 if (fc < 0) { /* Save the first fold */
11098 UV fold = toFOLD_utf8_safe(s, e, d, &len);
11099 if (fc < 0) { /* Save the first fold */
11107 /* And set up so the code below that looks in this folded
11108 * buffer instead of the node's string */
11113 /* When we reach here 's' points to the fold of the first
11114 * character(s) of the node; and 'e' points to far enough along
11115 * the folded string to be just past any possible multi-char
11118 * Like the non-UTF case above, we punt if the node begins with a
11119 * multi-char fold */
11121 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
11122 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
11124 else { /* Single char fold */
11127 const U32 * remaining_folds;
11128 Size_t folds_count;
11130 /* It matches itself */
11131 invlist = add_cp_to_invlist(invlist, fc);
11133 /* ... plus all the things that fold to it, which are found in
11134 * PL_utf8_foldclosures */
11135 folds_count = _inverse_folds(fc, &first_fold,
11137 for (k = 0; k < folds_count; k++) {
11138 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
11140 /* /aa doesn't allow folds between ASCII and non- */
11141 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
11142 && isASCII(c) != isASCII(fc))
11147 invlist = add_cp_to_invlist(invlist, c);
11150 if (OP(node) == EXACTFL) {
11152 /* If either [iI] are present in an EXACTFL node the above code
11153 * should have added its normal case pair, but under a Turkish
11154 * locale they could match instead the case pairs from it. Add
11155 * those as potential matches as well */
11156 if (isALPHA_FOLD_EQ(fc, 'I')) {
11157 invlist = add_cp_to_invlist(invlist,
11158 LATIN_SMALL_LETTER_DOTLESS_I);
11159 invlist = add_cp_to_invlist(invlist,
11160 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
11162 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
11163 invlist = add_cp_to_invlist(invlist, 'I');
11165 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11166 invlist = add_cp_to_invlist(invlist, 'i');
11175 #undef HEADER_LENGTH
11176 #undef TO_INTERNAL_SIZE
11177 #undef FROM_INTERNAL_SIZE
11178 #undef INVLIST_VERSION_ID
11180 /* End of inversion list object */
11183 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
11185 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
11186 * constructs, and updates RExC_flags with them. On input, RExC_parse
11187 * should point to the first flag; it is updated on output to point to the
11188 * final ')' or ':'. There needs to be at least one flag, or this will
11191 /* for (?g), (?gc), and (?o) warnings; warning
11192 about (?c) will warn about (?g) -- japhy */
11194 #define WASTED_O 0x01
11195 #define WASTED_G 0x02
11196 #define WASTED_C 0x04
11197 #define WASTED_GC (WASTED_G|WASTED_C)
11198 I32 wastedflags = 0x00;
11199 U32 posflags = 0, negflags = 0;
11200 U32 *flagsp = &posflags;
11201 char has_charset_modifier = '\0';
11203 bool has_use_defaults = FALSE;
11204 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
11205 int x_mod_count = 0;
11207 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
11209 /* '^' as an initial flag sets certain defaults */
11210 if (UCHARAT(RExC_parse) == '^') {
11211 RExC_parse_inc_by(1);
11212 has_use_defaults = TRUE;
11213 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
11214 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
11215 ? REGEX_UNICODE_CHARSET
11216 : REGEX_DEPENDS_CHARSET;
11217 set_regex_charset(&RExC_flags, cs);
11220 cs = get_regex_charset(RExC_flags);
11221 if ( cs == REGEX_DEPENDS_CHARSET
11222 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
11224 cs = REGEX_UNICODE_CHARSET;
11228 while (RExC_parse < RExC_end) {
11229 /* && memCHRs("iogcmsx", *RExC_parse) */
11230 /* (?g), (?gc) and (?o) are useless here
11231 and must be globally applied -- japhy */
11232 if ((RExC_pm_flags & PMf_WILDCARD)) {
11233 if (flagsp == & negflags) {
11234 if (*RExC_parse == 'm') {
11235 RExC_parse_inc_by(1);
11236 /* diag_listed_as: Use of %s is not allowed in Unicode
11237 property wildcard subpatterns in regex; marked by <--
11239 vFAIL("Use of modifier '-m' is not allowed in Unicode"
11240 " property wildcard subpatterns");
11244 if (*RExC_parse == 's') {
11245 goto modifier_illegal_in_wildcard;
11250 switch (*RExC_parse) {
11252 /* Code for the imsxn flags */
11253 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
11255 case LOCALE_PAT_MOD:
11256 if (has_charset_modifier) {
11257 goto excess_modifier;
11259 else if (flagsp == &negflags) {
11262 cs = REGEX_LOCALE_CHARSET;
11263 has_charset_modifier = LOCALE_PAT_MOD;
11265 case UNICODE_PAT_MOD:
11266 if (has_charset_modifier) {
11267 goto excess_modifier;
11269 else if (flagsp == &negflags) {
11272 cs = REGEX_UNICODE_CHARSET;
11273 has_charset_modifier = UNICODE_PAT_MOD;
11275 case ASCII_RESTRICT_PAT_MOD:
11276 if (flagsp == &negflags) {
11279 if (has_charset_modifier) {
11280 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
11281 goto excess_modifier;
11283 /* Doubled modifier implies more restricted */
11284 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
11287 cs = REGEX_ASCII_RESTRICTED_CHARSET;
11289 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
11291 case DEPENDS_PAT_MOD:
11292 if (has_use_defaults) {
11293 goto fail_modifiers;
11295 else if (flagsp == &negflags) {
11298 else if (has_charset_modifier) {
11299 goto excess_modifier;
11302 /* The dual charset means unicode semantics if the
11303 * pattern (or target, not known until runtime) are
11304 * utf8, or something in the pattern indicates unicode
11306 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
11307 ? REGEX_UNICODE_CHARSET
11308 : REGEX_DEPENDS_CHARSET;
11309 has_charset_modifier = DEPENDS_PAT_MOD;
11312 RExC_parse_inc_by(1);
11313 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
11314 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
11316 else if (has_charset_modifier == *(RExC_parse - 1)) {
11317 vFAIL2("Regexp modifier \"%c\" may not appear twice",
11318 *(RExC_parse - 1));
11321 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
11323 NOT_REACHED; /*NOTREACHED*/
11325 RExC_parse_inc_by(1);
11326 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
11327 *(RExC_parse - 1));
11328 NOT_REACHED; /*NOTREACHED*/
11329 case GLOBAL_PAT_MOD: /* 'g' */
11330 if (RExC_pm_flags & PMf_WILDCARD) {
11331 goto modifier_illegal_in_wildcard;
11334 case ONCE_PAT_MOD: /* 'o' */
11335 if (ckWARN(WARN_REGEXP)) {
11336 const I32 wflagbit = *RExC_parse == 'o'
11339 if (! (wastedflags & wflagbit) ) {
11340 wastedflags |= wflagbit;
11341 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
11344 "Useless (%s%c) - %suse /%c modifier",
11345 flagsp == &negflags ? "?-" : "?",
11347 flagsp == &negflags ? "don't " : "",
11354 case CONTINUE_PAT_MOD: /* 'c' */
11355 if (RExC_pm_flags & PMf_WILDCARD) {
11356 goto modifier_illegal_in_wildcard;
11358 if (ckWARN(WARN_REGEXP)) {
11359 if (! (wastedflags & WASTED_C) ) {
11360 wastedflags |= WASTED_GC;
11361 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
11364 "Useless (%sc) - %suse /gc modifier",
11365 flagsp == &negflags ? "?-" : "?",
11366 flagsp == &negflags ? "don't " : ""
11371 case KEEPCOPY_PAT_MOD: /* 'p' */
11372 if (RExC_pm_flags & PMf_WILDCARD) {
11373 goto modifier_illegal_in_wildcard;
11375 if (flagsp == &negflags) {
11376 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11378 *flagsp |= RXf_PMf_KEEPCOPY;
11382 /* A flag is a default iff it is following a minus, so
11383 * if there is a minus, it means will be trying to
11384 * re-specify a default which is an error */
11385 if (has_use_defaults || flagsp == &negflags) {
11386 goto fail_modifiers;
11388 flagsp = &negflags;
11389 wastedflags = 0; /* reset so (?g-c) warns twice */
11395 if ( (RExC_pm_flags & PMf_WILDCARD)
11396 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11398 RExC_parse_inc_by(1);
11399 /* diag_listed_as: Use of %s is not allowed in Unicode
11400 property wildcard subpatterns in regex; marked by <--
11402 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11403 " property wildcard subpatterns",
11404 has_charset_modifier);
11407 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11408 negflags |= RXf_PMf_EXTENDED_MORE;
11410 RExC_flags |= posflags;
11412 if (negflags & RXf_PMf_EXTENDED) {
11413 negflags |= RXf_PMf_EXTENDED_MORE;
11415 RExC_flags &= ~negflags;
11416 set_regex_charset(&RExC_flags, cs);
11421 RExC_parse_inc_if_char();
11422 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11423 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11424 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11425 NOT_REACHED; /*NOTREACHED*/
11431 vFAIL("Sequence (?... not terminated");
11433 modifier_illegal_in_wildcard:
11434 RExC_parse_inc_by(1);
11435 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11436 subpatterns in regex; marked by <-- HERE in m/%s/ */
11437 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11438 " subpatterns", *(RExC_parse - 1));
11442 - reg - regular expression, i.e. main body or parenthesized thing
11444 * Caller must absorb opening parenthesis.
11446 * Combining parenthesis handling with the base level of regular expression
11447 * is a trifle forced, but the need to tie the tails of the branches to what
11448 * follows makes it hard to avoid.
11450 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11452 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11454 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11457 STATIC regnode_offset
11458 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11460 char * backref_parse_start,
11464 regnode_offset ret;
11465 char* name_start = RExC_parse;
11467 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11468 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11470 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11472 if (RExC_parse != name_start && ch == '}') {
11473 while (isBLANK(*RExC_parse)) {
11474 RExC_parse_inc_by(1);
11477 if (RExC_parse == name_start || *RExC_parse != ch) {
11478 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11479 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
11483 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11484 RExC_rxi->data->data[num]=(void*)sv_dat;
11485 SvREFCNT_inc_simple_void_NN(sv_dat);
11488 ret = reganode(pRExC_state,
11491 : (ASCII_FOLD_RESTRICTED)
11493 : (AT_LEAST_UNI_SEMANTICS)
11499 *flagp |= HASWIDTH;
11501 nextchar(pRExC_state);
11505 /* reg_la_NOTHING()
11507 * Maybe parse a parenthezised lookaround construct that is equivalent to a
11508 * NOTHING regop when the construct is empty.
11510 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
11512 * Checks for unterminated constructs and throws a "not terminated" error
11513 * with the appropriate type if necessary
11515 * Assuming it does not throw an exception increments RExC_seen_zerolen.
11517 * If the construct is empty generates a NOTHING op and returns its
11518 * regnode_offset, which the caller would then return to its caller.
11520 * If the construct is not empty increments RExC_in_lookaround, and turns
11521 * on any flags provided in RExC_seen, and then returns 0 to signify
11522 * that parsing should continue.
11524 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
11525 * any use of it would have had to be broken onto multiple lines, hence
11526 * the abbreviation.
11528 STATIC regnode_offset
11529 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
11533 PERL_ARGS_ASSERT_REG_LA_NOTHING;
11535 /* false below so we do not force /x */
11536 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
11538 if (RExC_parse >= RExC_end)
11539 vFAIL2("Sequence (%s... not terminated", type);
11541 /* Always increment as NOTHING regops are zerolen */
11542 RExC_seen_zerolen++;
11544 if (*RExC_parse == ')') {
11545 regnode_offset ret= reg_node(pRExC_state, NOTHING);
11546 nextchar(pRExC_state);
11550 RExC_seen |= flags;
11551 RExC_in_lookaround++;
11552 return 0; /* keep parsing! */
11557 * Maybe parse a parenthezised lookaround construct that is equivalent to a
11558 * OPFAIL regop when the construct is empty.
11560 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
11562 * Checks for unterminated constructs and throws a "not terminated" error
11565 * If the construct is empty generates an OPFAIL op and returns its
11566 * regnode_offset which the caller should then return to its caller.
11568 * If the construct is not empty increments RExC_in_lookaround, and also
11569 * increments RExC_seen_zerolen, and turns on the flags provided in
11570 * RExC_seen, and then returns 0 to signify that parsing should continue.
11572 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
11573 * any use of it would have had to be broken onto multiple lines, hence
11574 * the abbreviation.
11577 STATIC regnode_offset
11578 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
11582 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
11584 /* FALSE so we don't force to /x below */;
11585 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
11587 if (RExC_parse >= RExC_end)
11588 vFAIL2("Sequence (%s... not terminated", type);
11590 if (*RExC_parse == ')') {
11591 regnode_offset ret= reganode(pRExC_state, OPFAIL, 0);
11592 nextchar(pRExC_state);
11593 return ret; /* return produced regop */
11596 /* only increment zerolen *after* we check if we produce an OPFAIL
11597 * as an OPFAIL does not match a zero length construct, as it
11598 * does not match ever. */
11599 RExC_seen_zerolen++;
11600 RExC_seen |= flags;
11601 RExC_in_lookaround++;
11602 return 0; /* keep parsing! */
11605 /* Below are the main parsing routines.
11607 * S_reg() parses a whole pattern or subpattern. It itself handles things
11608 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
11609 * alternation '|' in the '...' pattern.
11610 * S_regbranch() effectively implements the concatenation operator, handling
11611 * one alternative of '|', repeatedly calling S_regpiece on each
11612 * segment of the input.
11613 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
11614 * and then adds any quantifier for that chunk.
11615 * S_regatom() parses the next chunk of the input, returning when it
11616 * determines it has found a complete atomic chunk. The chunk may
11617 * be a nested subpattern, in which case S_reg is called
11620 * The functions generate regnodes as they go along, appending each to the
11621 * pattern data structure so far. They return the offset of the current final
11622 * node into that structure, or 0 on failure.
11624 * There are three parameters common to all of them:
11625 * pRExC_state is a structure with much information about the current
11626 * state of the parse. It's easy to add new elements to
11627 * convey new information, but beware that an error return may
11628 * require clearing the element.
11629 * flagp is a pointer to bit flags set in a lower level to pass up
11630 * to higher levels information, such as the cause of a
11631 * failure, or some characteristic about the generated node
11632 * depth is roughly the recursion depth, mostly unused except for
11633 * pretty printing debugging info.
11635 * There are ancillary functions that these may farm work out to, using the
11638 * The protocol for handling flags is that each function will, before
11639 * returning, add into *flagp the flags it needs to pass up. Each function has
11640 * a second flags variable, typically named 'flags', which it sets and clears
11641 * at will. Flag bits in it are used in that function, and it calls the next
11642 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
11643 * 'flags' will contain whatever it had before the call, plus whatever that
11644 * function passed up. If it wants to pass any of these up to its caller, it
11645 * has to add them to its *flagp. This means that it takes extra steps to keep
11646 * passing a flag upwards, and otherwise the flag bit is cleared for higher
11650 /* On success, returns the offset at which any next node should be placed into
11651 * the regex engine program being compiled.
11653 * Returns 0 otherwise, with *flagp set to indicate why:
11654 * TRYAGAIN at the end of (?) that only sets flags.
11655 * RESTART_PARSE if the parse needs to be restarted, or'd with
11656 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11657 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11659 STATIC regnode_offset
11660 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11661 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11662 * 2 is like 1, but indicates that nextchar() has been called to advance
11663 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11664 * this flag alerts us to the need to check for that */
11666 regnode_offset ret = 0; /* Will be the head of the group. */
11668 regnode_offset lastbr;
11669 regnode_offset ender = 0;
11672 U32 oregflags = RExC_flags;
11673 bool have_branch = 0;
11675 I32 freeze_paren = 0;
11676 I32 after_freeze = 0;
11677 I32 num; /* numeric backreferences */
11678 SV * max_open; /* Max number of unclosed parens */
11679 I32 was_in_lookaround = RExC_in_lookaround;
11681 /* The difference between the following variables can be seen with *
11682 * the broken pattern /(?:foo/ where segment_parse_start will point *
11683 * at the 'f', and reg_parse_start will point at the '(' */
11685 /* the following is used for unmatched '(' errors */
11686 char * const reg_parse_start = RExC_parse;
11688 /* the following is used to track where various segments of
11689 * the pattern that we parse out started. */
11690 char * segment_parse_start = RExC_parse;
11692 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11694 PERL_ARGS_ASSERT_REG;
11695 DEBUG_PARSE("reg ");
11697 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11699 if (!SvIOK(max_open)) {
11700 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11702 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11704 vFAIL("Too many nested open parens");
11707 *flagp = 0; /* Initialize. */
11709 /* Having this true makes it feasible to have a lot fewer tests for the
11710 * parse pointer being in scope. For example, we can write
11711 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
11713 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
11715 assert(*RExC_end == '\0');
11717 /* Make an OPEN node, if parenthesized. */
11720 /* Under /x, space and comments can be gobbled up between the '(' and
11721 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11722 * intervening space, as the sequence is a token, and a token should be
11724 bool has_intervening_patws = (paren == 2)
11725 && *(RExC_parse - 1) != '(';
11727 if (RExC_parse >= RExC_end) {
11728 vFAIL("Unmatched (");
11731 if (paren == 'r') { /* Atomic script run */
11735 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11736 char *start_verb = RExC_parse + 1;
11738 char *start_arg = NULL;
11739 unsigned char op = 0;
11740 int arg_required = 0;
11741 int internal_argval = -1; /* if > -1 no argument allowed */
11742 bool has_upper = FALSE;
11743 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
11745 if (has_intervening_patws) {
11746 RExC_parse_inc_by(1); /* past the '*' */
11748 /* For strict backwards compatibility, don't change the message
11749 * now that we also have lowercase operands */
11750 if (isUPPER(*RExC_parse)) {
11751 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11754 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11757 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11758 if ( *RExC_parse == ':' ) {
11759 start_arg = RExC_parse + 1;
11763 if (isUPPER(*RExC_parse)) {
11766 RExC_parse_inc_by(1);
11769 RExC_parse_inc_utf8();
11772 verb_len = RExC_parse - start_verb;
11774 if (RExC_parse >= RExC_end) {
11775 goto unterminated_verb_pattern;
11779 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11782 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11783 unterminated_verb_pattern:
11785 vFAIL("Unterminated verb pattern argument");
11788 vFAIL("Unterminated '(*...' argument");
11792 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11794 vFAIL("Unterminated verb pattern");
11797 vFAIL("Unterminated '(*...' construct");
11802 /* Here, we know that RExC_parse < RExC_end */
11804 switch ( *start_verb ) {
11805 case 'A': /* (*ACCEPT) */
11806 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11808 internal_argval = RExC_nestroot;
11811 case 'C': /* (*COMMIT) */
11812 if ( memEQs(start_verb, verb_len,"COMMIT") )
11815 case 'F': /* (*FAIL) */
11816 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11820 case ':': /* (*:NAME) */
11821 case 'M': /* (*MARK:NAME) */
11822 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11827 case 'P': /* (*PRUNE) */
11828 if ( memEQs(start_verb, verb_len,"PRUNE") )
11831 case 'S': /* (*SKIP) */
11832 if ( memEQs(start_verb, verb_len,"SKIP") )
11835 case 'T': /* (*THEN) */
11836 /* [19:06] <TimToady> :: is then */
11837 if ( memEQs(start_verb, verb_len,"THEN") ) {
11839 RExC_seen |= REG_CUTGROUP_SEEN;
11843 if ( memEQs(start_verb, verb_len, "asr")
11844 || memEQs(start_verb, verb_len, "atomic_script_run"))
11846 paren = 'r'; /* Mnemonic: recursed run */
11849 else if (memEQs(start_verb, verb_len, "atomic")) {
11850 paren = 't'; /* AtOMIC */
11851 goto alpha_assertions;
11855 if ( memEQs(start_verb, verb_len, "plb")
11856 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11859 goto lookbehind_alpha_assertions;
11861 else if ( memEQs(start_verb, verb_len, "pla")
11862 || memEQs(start_verb, verb_len, "positive_lookahead"))
11865 goto alpha_assertions;
11869 if ( memEQs(start_verb, verb_len, "nlb")
11870 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11873 goto lookbehind_alpha_assertions;
11875 else if ( memEQs(start_verb, verb_len, "nla")
11876 || memEQs(start_verb, verb_len, "negative_lookahead"))
11879 goto alpha_assertions;
11883 if ( memEQs(start_verb, verb_len, "sr")
11884 || memEQs(start_verb, verb_len, "script_run"))
11886 regnode_offset atomic;
11892 /* This indicates Unicode rules. */
11893 REQUIRE_UNI_RULES(flagp, 0);
11899 RExC_parse_set(start_arg);
11901 if (RExC_in_script_run) {
11903 /* Nested script runs are treated as no-ops, because
11904 * if the nested one fails, the outer one must as
11905 * well. It could fail sooner, and avoid (??{} with
11906 * side effects, but that is explicitly documented as
11907 * undefined behavior. */
11911 if (paren == 's') {
11916 /* But, the atomic part of a nested atomic script run
11917 * isn't a no-op, but can be treated just like a '(?>'
11923 if (paren == 's') {
11924 /* Here, we're starting a new regular script run */
11925 ret = reg_node(pRExC_state, SROPEN);
11926 RExC_in_script_run = 1;
11931 /* Here, we are starting an atomic script run. This is
11932 * handled by recursing to deal with the atomic portion
11933 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11935 ret = reg_node(pRExC_state, SROPEN);
11937 RExC_in_script_run = 1;
11939 atomic = reg(pRExC_state, 'r', &flags, depth);
11940 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11941 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11945 if (! REGTAIL(pRExC_state, ret, atomic)) {
11946 REQUIRE_BRANCHJ(flagp, 0);
11949 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11952 REQUIRE_BRANCHJ(flagp, 0);
11955 RExC_in_script_run = 0;
11961 lookbehind_alpha_assertions:
11962 seen_flag_set = REG_LOOKBEHIND_SEEN;
11967 if ( !start_arg ) {
11971 if ( RExC_parse == start_arg ) {
11972 if ( paren == 'A' || paren == 'B' ) {
11973 /* An empty negative lookaround assertion is failure.
11974 * See also: S_reg_la_OPFAIL() */
11976 /* Note: OPFAIL is *not* zerolen. */
11977 ret = reganode(pRExC_state, OPFAIL, 0);
11978 nextchar(pRExC_state);
11982 if ( paren == 'a' || paren == 'b' ) {
11983 /* An empty positive lookaround assertion is success.
11984 * See also: S_reg_la_NOTHING() */
11986 /* Note: NOTHING is zerolen, so increment here */
11987 RExC_seen_zerolen++;
11988 ret = reg_node(pRExC_state, NOTHING);
11989 nextchar(pRExC_state);
11994 RExC_seen_zerolen++;
11995 RExC_in_lookaround++;
11996 RExC_seen |= seen_flag_set;
11998 RExC_parse_set(start_arg);
12002 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
12003 UTF8fARG(UTF, verb_len, start_verb));
12004 NOT_REACHED; /*NOTREACHED*/
12006 } /* End of switch */
12008 RExC_parse_inc_safe();
12009 if (has_upper || verb_len == 0) {
12010 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
12011 UTF8fARG(UTF, verb_len, start_verb));
12014 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
12015 UTF8fARG(UTF, verb_len, start_verb));
12018 if ( RExC_parse == start_arg ) {
12021 if ( arg_required && !start_arg ) {
12022 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
12023 (int) verb_len, start_verb);
12025 if (internal_argval == -1) {
12026 ret = reganode(pRExC_state, op, 0);
12028 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
12030 RExC_seen |= REG_VERBARG_SEEN;
12032 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
12033 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
12034 STR_WITH_LEN("S"));
12035 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
12036 FLAGS(REGNODE_p(ret)) = 1;
12038 FLAGS(REGNODE_p(ret)) = 0;
12040 if ( internal_argval != -1 )
12041 ARG2L_SET(REGNODE_p(ret), internal_argval);
12042 nextchar(pRExC_state);
12045 else if (*RExC_parse == '?') { /* (?...) */
12046 bool is_logical = 0;
12047 const char * const seqstart = RExC_parse;
12048 const char * endptr;
12049 const char non_existent_group_msg[]
12050 = "Reference to nonexistent group";
12051 const char impossible_group[] = "Invalid reference to group";
12053 if (has_intervening_patws) {
12054 RExC_parse_inc_by(1);
12055 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
12058 RExC_parse_inc_by(1); /* past the '?' */
12059 paren = *RExC_parse; /* might be a trailing NUL, if not
12062 if (RExC_parse > RExC_end) {
12065 ret = 0; /* For look-ahead/behind. */
12068 case 'P': /* (?P...) variants for those used to PCRE/Python */
12069 paren = *RExC_parse;
12070 if ( paren == '<') { /* (?P<...>) named capture */
12071 RExC_parse_inc_by(1);
12072 if (RExC_parse >= RExC_end) {
12073 vFAIL("Sequence (?P<... not terminated");
12075 goto named_capture;
12077 else if (paren == '>') { /* (?P>name) named recursion */
12078 RExC_parse_inc_by(1);
12079 if (RExC_parse >= RExC_end) {
12080 vFAIL("Sequence (?P>... not terminated");
12082 goto named_recursion;
12084 else if (paren == '=') { /* (?P=...) named backref */
12085 RExC_parse_inc_by(1);
12086 return handle_named_backref(pRExC_state, flagp,
12087 segment_parse_start, ')');
12089 RExC_parse_inc_if_char();
12090 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
12091 vFAIL3("Sequence (%.*s...) not recognized",
12092 (int) (RExC_parse - seqstart), seqstart);
12093 NOT_REACHED; /*NOTREACHED*/
12094 case '<': /* (?<...) */
12095 /* If you want to support (?<*...), first reconcile with GH #17363 */
12096 if (*RExC_parse == '!') {
12097 paren = ','; /* negative lookbehind (?<! ... ) */
12098 RExC_parse_inc_by(1);
12099 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
12104 if (*RExC_parse == '=') {
12105 /* paren = '<' - negative lookahead (?<= ... ) */
12106 RExC_parse_inc_by(1);
12107 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
12118 case '\'': /* (?'...') */
12119 name_start = RExC_parse;
12120 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
12121 if ( RExC_parse == name_start
12122 || RExC_parse >= RExC_end
12123 || *RExC_parse != paren)
12125 vFAIL2("Sequence (?%c... not terminated",
12126 paren=='>' ? '<' : (char) paren);
12131 if (!svname) /* shouldn't happen */
12133 "panic: reg_scan_name returned NULL");
12134 if (!RExC_paren_names) {
12135 RExC_paren_names= newHV();
12136 sv_2mortal(MUTABLE_SV(RExC_paren_names));
12138 RExC_paren_name_list= newAV();
12139 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
12142 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
12144 sv_dat = HeVAL(he_str);
12146 /* croak baby croak */
12148 "panic: paren_name hash element allocation failed");
12149 } else if ( SvPOK(sv_dat) ) {
12150 /* (?|...) can mean we have dupes so scan to check
12151 its already been stored. Maybe a flag indicating
12152 we are inside such a construct would be useful,
12153 but the arrays are likely to be quite small, so
12154 for now we punt -- dmq */
12155 IV count = SvIV(sv_dat);
12156 I32 *pv = (I32*)SvPVX(sv_dat);
12158 for ( i = 0 ; i < count ; i++ ) {
12159 if ( pv[i] == RExC_npar ) {
12165 pv = (I32*)SvGROW(sv_dat,
12166 SvCUR(sv_dat) + sizeof(I32)+1);
12167 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
12168 pv[count] = RExC_npar;
12169 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
12172 (void)SvUPGRADE(sv_dat, SVt_PVNV);
12173 sv_setpvn(sv_dat, (char *)&(RExC_npar),
12176 SvIV_set(sv_dat, 1);
12179 /* Yes this does cause a memory leak in debugging Perls
12181 if (!av_store(RExC_paren_name_list,
12182 RExC_npar, SvREFCNT_inc_NN(svname)))
12183 SvREFCNT_dec_NN(svname);
12186 /*sv_dump(sv_dat);*/
12188 nextchar(pRExC_state);
12190 goto capturing_parens;
12192 NOT_REACHED; /*NOTREACHED*/
12193 case '=': /* (?=...) */
12194 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
12197 case '!': /* (?!...) */
12198 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
12201 case '|': /* (?|...) */
12202 /* branch reset, behave like a (?:...) except that
12203 buffers in alternations share the same numbers */
12205 after_freeze = freeze_paren = RExC_npar;
12207 /* XXX This construct currently requires an extra pass.
12208 * Investigation would be required to see if that could be
12210 REQUIRE_PARENS_PASS;
12212 case ':': /* (?:...) */
12213 case '>': /* (?>...) */
12215 case '$': /* (?$...) */
12216 case '@': /* (?@...) */
12217 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
12219 case '0' : /* (?0) */
12220 case 'R' : /* (?R) */
12221 if (RExC_parse == RExC_end || *RExC_parse != ')')
12222 FAIL("Sequence (?R) not terminated");
12224 RExC_seen |= REG_RECURSE_SEEN;
12226 /* XXX These constructs currently require an extra pass.
12227 * It probably could be changed */
12228 REQUIRE_PARENS_PASS;
12230 *flagp |= POSTPONED;
12231 goto gen_recurse_regop;
12233 /* named and numeric backreferences */
12234 case '&': /* (?&NAME) */
12235 segment_parse_start = RExC_parse - 1;
12238 SV *sv_dat = reg_scan_name(pRExC_state,
12239 REG_RSN_RETURN_DATA);
12240 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
12242 if (RExC_parse >= RExC_end || *RExC_parse != ')')
12243 vFAIL("Sequence (?&... not terminated");
12244 goto gen_recurse_regop;
12247 if (! inRANGE(RExC_parse[0], '1', '9')) {
12248 RExC_parse_inc_by(1);
12249 vFAIL("Illegal pattern");
12251 goto parse_recursion;
12253 case '-': /* (?-1) */
12254 if (! inRANGE(RExC_parse[0], '1', '9')) {
12255 RExC_parse--; /* rewind to let it be handled later */
12259 case '1': case '2': case '3': case '4': /* (?1) */
12260 case '5': case '6': case '7': case '8': case '9':
12261 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
12264 bool is_neg = FALSE;
12266 segment_parse_start = RExC_parse - 1;
12267 if (*RExC_parse == '-') {
12268 RExC_parse_inc_by(1);
12272 if (grok_atoUV(RExC_parse, &unum, &endptr)
12276 RExC_parse_set((char*)endptr);
12278 else { /* Overflow, or something like that. Position
12279 beyond all digits for the message */
12280 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
12281 RExC_parse_inc_by(1);
12283 vFAIL(impossible_group);
12286 /* -num is always representable on 1 and 2's complement
12291 if (*RExC_parse!=')')
12292 vFAIL("Expecting close bracket");
12295 if (paren == '-' || paren == '+') {
12297 /* Don't overflow */
12298 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
12299 RExC_parse_inc_by(1);
12300 vFAIL(impossible_group);
12304 Diagram of capture buffer numbering.
12305 Top line is the normal capture buffer numbers
12306 Bottom line is the negative indexing as from
12310 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
12311 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
12312 - 5 4 3 2 1 X Y x x
12314 Resolve to absolute group. Recall that RExC_npar is +1 of
12315 the actual parenthesis group number. For lookahead, we
12316 have to compensate for that. Using the above example, when
12317 we get to Y in the parse, num is 2 and RExC_npar is 6. We
12318 want 7 for +2, and 4 for -2.
12320 if ( paren == '+' ) {
12326 if (paren == '-' && num < 1) {
12327 RExC_parse_inc_by(1);
12328 vFAIL(non_existent_group_msg);
12332 if (num >= RExC_npar) {
12334 /* It might be a forward reference; we can't fail until we
12335 * know, by completing the parse to get all the groups, and
12336 * then reparsing */
12337 if (ALL_PARENS_COUNTED) {
12338 if (num >= RExC_total_parens) {
12339 RExC_parse_inc_by(1);
12340 vFAIL(non_existent_group_msg);
12344 REQUIRE_PARENS_PASS;
12348 /* We keep track how many GOSUB items we have produced.
12349 To start off the ARG2L() of the GOSUB holds its "id",
12350 which is used later in conjunction with RExC_recurse
12351 to calculate the offset we need to jump for the GOSUB,
12352 which it will store in the final representation.
12353 We have to defer the actual calculation until much later
12354 as the regop may move.
12356 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
12357 RExC_recurse_count++;
12358 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12359 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
12360 22, "| |", (int)(depth * 2 + 1), "",
12361 (UV)ARG(REGNODE_p(ret)),
12362 (IV)ARG2L(REGNODE_p(ret))));
12363 RExC_seen |= REG_RECURSE_SEEN;
12365 *flagp |= POSTPONED;
12366 assert(*RExC_parse == ')');
12367 nextchar(pRExC_state);
12372 case '?': /* (??...) */
12374 if (*RExC_parse != '{') {
12375 RExC_parse_inc_if_char();
12376 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
12378 "Sequence (%" UTF8f "...) not recognized",
12379 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
12380 NOT_REACHED; /*NOTREACHED*/
12382 *flagp |= POSTPONED;
12384 RExC_parse_inc_by(1);
12386 case '{': /* (?{...}) */
12389 struct reg_code_block *cb;
12392 RExC_seen_zerolen++;
12394 if ( !pRExC_state->code_blocks
12395 || pRExC_state->code_index
12396 >= pRExC_state->code_blocks->count
12397 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
12398 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
12401 if (RExC_pm_flags & PMf_USE_RE_EVAL)
12402 FAIL("panic: Sequence (?{...}): no code block found\n");
12403 FAIL("Eval-group not allowed at runtime, use re 'eval'");
12405 /* this is a pre-compiled code block (?{...}) */
12406 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
12407 RExC_parse_set(RExC_start + cb->end);
12409 if (cb->src_regex) {
12410 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
12411 RExC_rxi->data->data[n] =
12412 (void*)SvREFCNT_inc((SV*)cb->src_regex);
12413 RExC_rxi->data->data[n+1] = (void*)o;
12416 n = add_data(pRExC_state,
12417 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
12418 RExC_rxi->data->data[n] = (void*)o;
12420 pRExC_state->code_index++;
12421 nextchar(pRExC_state);
12424 regnode_offset eval;
12425 ret = reg_node(pRExC_state, LOGICAL);
12427 eval = reg2Lanode(pRExC_state, EVAL,
12430 /* for later propagation into (??{})
12432 RExC_flags & RXf_PMf_COMPILETIME
12434 FLAGS(REGNODE_p(ret)) = 2;
12435 if (! REGTAIL(pRExC_state, ret, eval)) {
12436 REQUIRE_BRANCHJ(flagp, 0);
12440 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
12443 case '(': /* (?(?{...})...) and (?(?=...)...) */
12446 const int DEFINE_len = sizeof("DEFINE") - 1;
12447 if ( RExC_parse < RExC_end - 1
12448 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
12449 && ( RExC_parse[1] == '='
12450 || RExC_parse[1] == '!'
12451 || RExC_parse[1] == '<'
12452 || RExC_parse[1] == '{'))
12453 || ( RExC_parse[0] == '*' /* (?(*...)) */
12454 && ( memBEGINs(RExC_parse + 1,
12455 (Size_t) (RExC_end - (RExC_parse + 1)),
12457 || memBEGINs(RExC_parse + 1,
12458 (Size_t) (RExC_end - (RExC_parse + 1)),
12460 || memBEGINs(RExC_parse + 1,
12461 (Size_t) (RExC_end - (RExC_parse + 1)),
12463 || memBEGINs(RExC_parse + 1,
12464 (Size_t) (RExC_end - (RExC_parse + 1)),
12466 || memBEGINs(RExC_parse + 1,
12467 (Size_t) (RExC_end - (RExC_parse + 1)),
12468 "positive_lookahead:")
12469 || memBEGINs(RExC_parse + 1,
12470 (Size_t) (RExC_end - (RExC_parse + 1)),
12471 "positive_lookbehind:")
12472 || memBEGINs(RExC_parse + 1,
12473 (Size_t) (RExC_end - (RExC_parse + 1)),
12474 "negative_lookahead:")
12475 || memBEGINs(RExC_parse + 1,
12476 (Size_t) (RExC_end - (RExC_parse + 1)),
12477 "negative_lookbehind:"))))
12478 ) { /* Lookahead or eval. */
12480 regnode_offset tail;
12482 ret = reg_node(pRExC_state, LOGICAL);
12483 FLAGS(REGNODE_p(ret)) = 1;
12485 tail = reg(pRExC_state, 1, &flag, depth+1);
12486 RETURN_FAIL_ON_RESTART(flag, flagp);
12487 if (! REGTAIL(pRExC_state, ret, tail)) {
12488 REQUIRE_BRANCHJ(flagp, 0);
12492 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
12493 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
12495 char ch = RExC_parse[0] == '<' ? '>' : '\'';
12496 char *name_start= RExC_parse;
12497 RExC_parse_inc_by(1);
12499 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
12500 if ( RExC_parse == name_start
12501 || RExC_parse >= RExC_end
12502 || *RExC_parse != ch)
12504 vFAIL2("Sequence (?(%c... not terminated",
12505 (ch == '>' ? '<' : ch));
12507 RExC_parse_inc_by(1);
12509 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12510 RExC_rxi->data->data[num]=(void*)sv_dat;
12511 SvREFCNT_inc_simple_void_NN(sv_dat);
12513 ret = reganode(pRExC_state, GROUPPN, num);
12514 goto insert_if_check_paren;
12516 else if (memBEGINs(RExC_parse,
12517 (STRLEN) (RExC_end - RExC_parse),
12520 ret = reganode(pRExC_state, DEFINEP, 0);
12521 RExC_parse_inc_by(DEFINE_len);
12523 goto insert_if_check_paren;
12525 else if (RExC_parse[0] == 'R') {
12526 RExC_parse_inc_by(1);
12527 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
12528 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12529 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12532 if (RExC_parse[0] == '0') {
12534 RExC_parse_inc_by(1);
12536 else if (inRANGE(RExC_parse[0], '1', '9')) {
12539 if (grok_atoUV(RExC_parse, &uv, &endptr)
12542 parno = (I32)uv + 1;
12543 RExC_parse_set((char*)endptr);
12545 /* else "Switch condition not recognized" below */
12546 } else if (RExC_parse[0] == '&') {
12548 RExC_parse_inc_by(1);
12549 sv_dat = reg_scan_name(pRExC_state,
12550 REG_RSN_RETURN_DATA);
12552 parno = 1 + *((I32 *)SvPVX(sv_dat));
12554 ret = reganode(pRExC_state, INSUBP, parno);
12555 goto insert_if_check_paren;
12557 else if (inRANGE(RExC_parse[0], '1', '9')) {
12562 if (grok_atoUV(RExC_parse, &uv, &endptr)
12566 RExC_parse_set((char*)endptr);
12569 vFAIL("panic: grok_atoUV returned FALSE");
12571 ret = reganode(pRExC_state, GROUPP, parno);
12573 insert_if_check_paren:
12574 if (UCHARAT(RExC_parse) != ')') {
12575 RExC_parse_inc_safe();
12576 vFAIL("Switch condition not recognized");
12578 nextchar(pRExC_state);
12580 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12583 REQUIRE_BRANCHJ(flagp, 0);
12585 br = regbranch(pRExC_state, &flags, 1, depth+1);
12587 RETURN_FAIL_ON_RESTART(flags,flagp);
12588 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12591 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12594 REQUIRE_BRANCHJ(flagp, 0);
12596 c = UCHARAT(RExC_parse);
12597 nextchar(pRExC_state);
12598 if (flags&HASWIDTH)
12599 *flagp |= HASWIDTH;
12602 vFAIL("(?(DEFINE)....) does not allow branches");
12604 /* Fake one for optimizer. */
12605 lastbr = reganode(pRExC_state, IFTHEN, 0);
12607 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12608 RETURN_FAIL_ON_RESTART(flags, flagp);
12609 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12612 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12613 REQUIRE_BRANCHJ(flagp, 0);
12615 if (flags&HASWIDTH)
12616 *flagp |= HASWIDTH;
12617 c = UCHARAT(RExC_parse);
12618 nextchar(pRExC_state);
12623 if (RExC_parse >= RExC_end)
12624 vFAIL("Switch (?(condition)... not terminated");
12626 vFAIL("Switch (?(condition)... contains too many branches");
12628 ender = reg_node(pRExC_state, TAIL);
12629 if (! REGTAIL(pRExC_state, br, ender)) {
12630 REQUIRE_BRANCHJ(flagp, 0);
12633 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12634 REQUIRE_BRANCHJ(flagp, 0);
12636 if (! REGTAIL(pRExC_state,
12638 REGNODE_AFTER(REGNODE_p(lastbr))),
12641 REQUIRE_BRANCHJ(flagp, 0);
12645 if (! REGTAIL(pRExC_state, ret, ender)) {
12646 REQUIRE_BRANCHJ(flagp, 0);
12648 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12649 RExC_size++; /* XXX WHY do we need this?!!
12650 For large programs it seems to be required
12651 but I can't figure out why. -- dmq*/
12655 RExC_parse_inc_safe();
12656 vFAIL("Unknown switch condition (?(...))");
12658 case '[': /* (?[ ... ]) */
12659 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
12660 case 0: /* A NUL */
12661 RExC_parse--; /* for vFAIL to print correctly */
12662 vFAIL("Sequence (? incomplete");
12666 if (RExC_strict) { /* [perl #132851] */
12667 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12670 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12672 default: /* e.g., (?i) */
12673 RExC_parse_set((char *) seqstart + 1);
12675 parse_lparen_question_flags(pRExC_state);
12676 if (UCHARAT(RExC_parse) != ':') {
12677 if (RExC_parse < RExC_end)
12678 nextchar(pRExC_state);
12683 nextchar(pRExC_state);
12688 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12692 if (! ALL_PARENS_COUNTED) {
12693 /* If we are in our first pass through (and maybe only pass),
12694 * we need to allocate memory for the capturing parentheses
12698 if (!RExC_parens_buf_size) {
12699 /* first guess at number of parens we might encounter */
12700 RExC_parens_buf_size = 10;
12702 /* setup RExC_open_parens, which holds the address of each
12703 * OPEN tag, and to make things simpler for the 0 index the
12704 * start of the program - this is used later for offsets */
12705 Newxz(RExC_open_parens, RExC_parens_buf_size,
12707 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12709 /* setup RExC_close_parens, which holds the address of each
12710 * CLOSE tag, and to make things simpler for the 0 index
12711 * the end of the program - this is used later for offsets
12713 Newxz(RExC_close_parens, RExC_parens_buf_size,
12715 /* we dont know where end op starts yet, so we dont need to
12716 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12719 else if (RExC_npar > RExC_parens_buf_size) {
12720 I32 old_size = RExC_parens_buf_size;
12722 RExC_parens_buf_size *= 2;
12724 Renew(RExC_open_parens, RExC_parens_buf_size,
12726 Zero(RExC_open_parens + old_size,
12727 RExC_parens_buf_size - old_size, regnode_offset);
12729 Renew(RExC_close_parens, RExC_parens_buf_size,
12731 Zero(RExC_close_parens + old_size,
12732 RExC_parens_buf_size - old_size, regnode_offset);
12736 ret = reganode(pRExC_state, OPEN, parno);
12737 if (!RExC_nestroot)
12738 RExC_nestroot = parno;
12739 if (RExC_open_parens && !RExC_open_parens[parno])
12741 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12742 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12743 22, "| |", (int)(depth * 2 + 1), "",
12745 RExC_open_parens[parno]= ret;
12750 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12759 /* Pick up the branches, linking them together. */
12760 segment_parse_start = RExC_parse;
12761 br = regbranch(pRExC_state, &flags, 1, depth+1);
12763 /* branch_len = (paren != 0); */
12766 RETURN_FAIL_ON_RESTART(flags, flagp);
12767 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12769 if (*RExC_parse == '|') {
12770 if (RExC_use_BRANCHJ) {
12771 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12774 reginsert(pRExC_state, BRANCH, br, depth+1);
12778 else if (paren == ':') {
12779 *flagp |= flags&SIMPLE;
12781 if (is_open) { /* Starts with OPEN. */
12782 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12783 REQUIRE_BRANCHJ(flagp, 0);
12786 else if (paren != '?') /* Not Conditional */
12788 *flagp |= flags & (HASWIDTH | POSTPONED);
12790 while (*RExC_parse == '|') {
12791 if (RExC_use_BRANCHJ) {
12794 ender = reganode(pRExC_state, LONGJMP, 0);
12796 /* Append to the previous. */
12797 shut_gcc_up = REGTAIL(pRExC_state,
12798 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
12800 PERL_UNUSED_VAR(shut_gcc_up);
12802 nextchar(pRExC_state);
12803 if (freeze_paren) {
12804 if (RExC_npar > after_freeze)
12805 after_freeze = RExC_npar;
12806 RExC_npar = freeze_paren;
12808 br = regbranch(pRExC_state, &flags, 0, depth+1);
12811 RETURN_FAIL_ON_RESTART(flags, flagp);
12812 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12814 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12815 REQUIRE_BRANCHJ(flagp, 0);
12818 *flagp |= flags & (HASWIDTH | POSTPONED);
12821 if (have_branch || paren != ':') {
12824 /* Make a closing node, and hook it on the end. */
12827 ender = reg_node(pRExC_state, TAIL);
12830 ender = reganode(pRExC_state, CLOSE, parno);
12831 if ( RExC_close_parens ) {
12832 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12833 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12834 22, "| |", (int)(depth * 2 + 1), "",
12835 (IV)parno, ender));
12836 RExC_close_parens[parno]= ender;
12837 if (RExC_nestroot == parno)
12842 ender = reg_node(pRExC_state, SRCLOSE);
12843 RExC_in_script_run = 0;
12845 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
12846 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
12847 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
12848 case '<': /* (?<= ... ) */
12849 case ',': /* (?<! ... ) */
12850 *flagp &= ~HASWIDTH;
12851 ender = reg_node(pRExC_state, LOOKBEHIND_END);
12853 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
12858 *flagp &= ~HASWIDTH;
12860 case 't': /* aTomic */
12862 ender = reg_node(pRExC_state, SUCCEED);
12865 ender = reg_node(pRExC_state, END);
12866 assert(!RExC_end_op); /* there can only be one! */
12867 RExC_end_op = REGNODE_p(ender);
12868 if (RExC_close_parens) {
12869 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12870 "%*s%*s Setting close paren #0 (END) to %zu\n",
12871 22, "| |", (int)(depth * 2 + 1), "",
12874 RExC_close_parens[0]= ender;
12879 DEBUG_PARSE_MSG("lsbr");
12880 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12881 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12882 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12883 SvPV_nolen_const(RExC_mysv1),
12885 SvPV_nolen_const(RExC_mysv2),
12887 (IV)(ender - lastbr)
12890 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12891 REQUIRE_BRANCHJ(flagp, 0);
12895 char is_nothing= 1;
12897 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12899 /* Hook the tails of the branches to the closing node. */
12900 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12901 const U8 op = REGNODE_TYPE(OP(br));
12902 regnode *nextoper = REGNODE_AFTER(br);
12903 if (op == BRANCH) {
12904 if (! REGTAIL_STUDY(pRExC_state,
12905 REGNODE_OFFSET(nextoper),
12908 REQUIRE_BRANCHJ(flagp, 0);
12910 if ( OP(nextoper) != NOTHING
12911 || regnext(nextoper) != REGNODE_p(ender))
12914 else if (op == BRANCHJ) {
12915 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12916 REGNODE_OFFSET(nextoper),
12918 PERL_UNUSED_VAR(shut_gcc_up);
12919 /* for now we always disable this optimisation * /
12920 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
12921 if ( OP(nopr) != NOTHING
12922 || regnext(nopr) != REGNODE_p(ender))
12928 regnode * ret_as_regnode = REGNODE_p(ret);
12929 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
12930 ? regnext(ret_as_regnode)
12933 DEBUG_PARSE_MSG("NADA");
12934 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12935 NULL, pRExC_state);
12936 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12937 NULL, pRExC_state);
12938 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12939 SvPV_nolen_const(RExC_mysv1),
12940 (IV)REG_NODE_NUM(ret_as_regnode),
12941 SvPV_nolen_const(RExC_mysv2),
12947 if (OP(REGNODE_p(ender)) == TAIL) {
12949 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
12952 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12953 OP(opt)= OPTIMIZED;
12954 NEXT_OFF(br)= REGNODE_p(ender) - br;
12962 /* Even/odd or x=don't care: 010101x10x */
12963 static const char parens[] = "=!aA<,>Bbt";
12964 /* flag below is set to 0 up through 'A'; 1 for larger */
12966 if (paren && (p = strchr(parens, paren))) {
12967 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12968 int flag = (p - parens) > 3;
12970 if (paren == '>' || paren == 't') {
12971 node = SUSPEND, flag = 0;
12974 reginsert(pRExC_state, node, ret, depth+1);
12975 FLAGS(REGNODE_p(ret)) = flag;
12976 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12978 REQUIRE_BRANCHJ(flagp, 0);
12983 /* Check for proper termination. */
12985 /* restore original flags, but keep (?p) and, if we've encountered
12986 * something in the parse that changes /d rules into /u, keep the /u */
12987 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12988 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12989 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12991 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12992 RExC_parse_set(reg_parse_start);
12993 vFAIL("Unmatched (");
12995 nextchar(pRExC_state);
12997 else if (!paren && RExC_parse < RExC_end) {
12998 if (*RExC_parse == ')') {
12999 RExC_parse_inc_by(1);
13000 vFAIL("Unmatched )");
13003 FAIL("Junk on end of regexp"); /* "Can't happen". */
13004 NOT_REACHED; /* NOTREACHED */
13007 if (after_freeze > RExC_npar)
13008 RExC_npar = after_freeze;
13010 RExC_in_lookaround = was_in_lookaround;
13016 - regbranch - one alternative of an | operator
13018 * Implements the concatenation operator.
13020 * On success, returns the offset at which any next node should be placed into
13021 * the regex engine program being compiled.
13023 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
13024 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
13027 STATIC regnode_offset
13028 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
13030 regnode_offset ret;
13031 regnode_offset chain = 0;
13032 regnode_offset latest;
13033 I32 flags = 0, c = 0;
13034 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13036 PERL_ARGS_ASSERT_REGBRANCH;
13038 DEBUG_PARSE("brnc");
13043 if (RExC_use_BRANCHJ)
13044 ret = reganode(pRExC_state, BRANCHJ, 0);
13046 ret = reg_node(pRExC_state, BRANCH);
13050 *flagp = 0; /* Initialize. */
13052 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13053 FALSE /* Don't force to /x */ );
13054 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
13055 flags &= ~TRYAGAIN;
13056 latest = regpiece(pRExC_state, &flags, depth+1);
13058 if (flags & TRYAGAIN)
13060 RETURN_FAIL_ON_RESTART(flags, flagp);
13061 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
13065 *flagp |= flags&(HASWIDTH|POSTPONED);
13067 /* FIXME adding one for every branch after the first is probably
13068 * excessive now we have TRIE support. (hv) */
13070 if (! REGTAIL(pRExC_state, chain, latest)) {
13071 /* XXX We could just redo this branch, but figuring out what
13072 * bookkeeping needs to be reset is a pain, and it's likely
13073 * that other branches that goto END will also be too large */
13074 REQUIRE_BRANCHJ(flagp, 0);
13080 if (chain == 0) { /* Loop ran zero times. */
13081 chain = reg_node(pRExC_state, NOTHING);
13086 *flagp |= flags&SIMPLE;
13098 #ifndef PERL_IN_XSUB_RE
13100 Perl_regcurly(const char *s, const char *e, const char * result[5])
13102 /* This function matches a {m,n} quantifier. When called with a NULL final
13103 * argument, it simply parses the input from 's' up through 'e-1', and
13104 * returns a boolean as to whether or not this input is syntactically a
13105 * {m,n} quantifier.
13107 * When called with a non-NULL final parameter, and when the function
13108 * returns TRUE, it additionally stores information into the array
13109 * specified by that parameter about what it found in the parse. The
13110 * parameter must be a pointer into a 5 element array of 'const char *'
13111 * elements. The returned information is as follows:
13112 * result[RBRACE] points to the closing brace
13113 * result[MIN_S] points to the first byte of the lower bound
13114 * result[MIN_E] points to one beyond the final byte of the lower bound
13115 * result[MAX_S] points to the first byte of the upper bound
13116 * result[MAX_E] points to one beyond the final byte of the upper bound
13118 * If the quantifier is of the form {m,} (meaning an infinite upper
13119 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
13120 * to is irrelevant, just that it's the same place
13122 * If instead the quantifier is of the form {m} there is actually only
13123 * one bound, and both the upper and lower result[] elements are set to
13126 * This function checks only for syntactic validity; it leaves checking for
13127 * semantic validity and raising any diagnostics to the caller. This
13128 * function is called in multiple places to check for syntax, but only from
13129 * one for semantics. It makes it as simple as possible for the
13130 * syntax-only callers, while furnishing just enough information for the
13134 const char * min_start = NULL;
13135 const char * max_start = NULL;
13136 const char * min_end = NULL;
13137 const char * max_end = NULL;
13139 bool has_comma = FALSE;
13141 PERL_ARGS_ASSERT_REGCURLY;
13143 if (s >= e || *s++ != '{')
13146 while (s < e && isBLANK(*s)) {
13154 } while (s < e && isDIGIT(*s));
13158 while (s < e && isBLANK(*s)) {
13166 while (s < e && isBLANK(*s)) {
13174 } while (s < e && isDIGIT(*s));
13179 while (s < e && isBLANK(*s)) {
13182 /* Need at least one number */
13183 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
13189 result[RBRACE] = s;
13191 result[MIN_S] = min_start;
13192 result[MIN_E] = min_end;
13195 result[MAX_S] = max_start;
13196 result[MAX_E] = max_end;
13199 /* Having no value after the comma is signalled by setting
13200 * start and end to the same value. What that value is isn't
13201 * relevant; NULL is chosen simply because it will fail if the
13202 * caller mistakenly uses it */
13203 result[MAX_S] = result[MAX_E] = NULL;
13206 else { /* No comma means lower and upper bounds are the same */
13207 result[MAX_S] = min_start;
13208 result[MAX_E] = min_end;
13217 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
13218 const char * start, const char * end)
13220 /* This is a helper function for regpiece() to compute, given the
13221 * quantifier {m,n}, the value of either m or n, based on the starting
13222 * position 'start' in the string, through the byte 'end-1', returning it
13223 * if valid, and failing appropriately if not. It knows the restrictions
13224 * imposed on quantifier values */
13227 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
13229 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
13231 if (grok_atoUV(start, &uv, &end)) {
13232 if (uv < REG_INFTY) { /* A valid, small-enough number */
13236 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
13237 leading zeros or overflow */
13238 RExC_parse_set((char * ) end);
13240 /* Perhaps too generic a msg for what is only failure from having
13241 * leading zeros, but this is how it's always behaved. */
13242 vFAIL("Invalid quantifier in {,}");
13243 NOT_REACHED; /*NOTREACHED*/
13246 /* Here, found a quantifier, but was too large; either it overflowed or was
13247 * too big a legal number */
13248 RExC_parse_set((char * ) end);
13249 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
13251 NOT_REACHED; /*NOTREACHED*/
13252 return U32_MAX; /* Perhaps some compilers will be expecting a return */
13256 - regpiece - something followed by possible quantifier * + ? {n,m}
13258 * Note that the branching code sequences used for ? and the general cases
13259 * of * and + are somewhat optimized: they use the same NOTHING node as
13260 * both the endmarker for their branch list and the body of the last branch.
13261 * It might seem that this node could be dispensed with entirely, but the
13262 * endmarker role is not redundant.
13264 * On success, returns the offset at which any next node should be placed into
13265 * the regex engine program being compiled.
13267 * Returns 0 otherwise, with *flagp set to indicate why:
13268 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
13269 * RESTART_PARSE if the parse needs to be restarted, or'd with
13270 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
13272 STATIC regnode_offset
13273 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13275 regnode_offset ret;
13278 const char * const origparse = RExC_parse;
13280 I32 max = REG_INFTY;
13282 /* Save the original in case we change the emitted regop to a FAIL. */
13283 const regnode_offset orig_emit = RExC_emit;
13285 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13287 PERL_ARGS_ASSERT_REGPIECE;
13289 DEBUG_PARSE("piec");
13291 ret = regatom(pRExC_state, &flags, depth+1);
13293 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
13294 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
13299 const char * regcurly_return[5];
13302 nextchar(pRExC_state);
13307 nextchar(pRExC_state);
13312 nextchar(pRExC_state);
13316 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
13317 to determine which */
13318 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
13319 const char * min_start = regcurly_return[MIN_S];
13320 const char * min_end = regcurly_return[MIN_E];
13321 const char * max_start = regcurly_return[MAX_S];
13322 const char * max_end = regcurly_return[MAX_E];
13325 min = get_quantifier_value(pRExC_state, min_start, min_end);
13331 if (max_start == max_end) { /* Was of the form {m,} */
13334 else if (max_start == min_start) { /* Was of the form {m} */
13337 else { /* Was of the form {m,n} */
13338 assert(max_end >= max_start);
13340 max = get_quantifier_value(pRExC_state, max_start, max_end);
13343 RExC_parse_set((char *) regcurly_return[RBRACE]);
13344 nextchar(pRExC_state);
13346 if (max < min) { /* If can't match, warn and optimize to fail
13348 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
13349 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
13350 NEXT_OFF(REGNODE_p(orig_emit)) =
13351 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
13354 else if (min == max && *RExC_parse == '?') {
13355 ckWARN2reg(RExC_parse + 1,
13356 "Useless use of greediness modifier '%c'",
13361 } /* End of is {m,n} */
13363 /* Here was a '{', but what followed it didn't form a quantifier. */
13369 NOT_REACHED; /*NOTREACHED*/
13372 /* Here we have a quantifier, and have calculated 'min' and 'max'.
13374 * Check and possibly adjust a zero width operand */
13375 if (! (flags & (HASWIDTH|POSTPONED))) {
13376 if (max > REG_INFTY/3) {
13377 ckWARN2reg(RExC_parse,
13378 "%" UTF8f " matches null string many times",
13379 UTF8fARG(UTF, (RExC_parse >= origparse
13380 ? RExC_parse - origparse
13385 /* There's no point in trying to match something 0 length more than
13386 * once except for extra side effects, which we don't have here since
13396 /* If this is a code block pass it up */
13397 *flagp |= (flags & POSTPONED);
13400 *flagp |= (flags & HASWIDTH);
13401 if (max == REG_INFTY)
13402 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
13405 /* 'SIMPLE' operands don't require full generality */
13406 if ((flags&SIMPLE)) {
13407 if (max == REG_INFTY) {
13409 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
13410 goto min0_maxINF_wildcard_forbidden;
13413 reginsert(pRExC_state, STAR, ret, depth+1);
13417 else if (min == 1) {
13418 reginsert(pRExC_state, PLUS, ret, depth+1);
13424 /* Here, SIMPLE, but not the '*' and '+' special cases */
13426 MARK_NAUGHTY_EXP(2, 2);
13427 reginsert(pRExC_state, CURLY, ret, depth+1);
13429 else { /* not SIMPLE */
13430 const regnode_offset w = reg_node(pRExC_state, WHILEM);
13432 FLAGS(REGNODE_p(w)) = 0;
13433 if (! REGTAIL(pRExC_state, ret, w)) {
13434 REQUIRE_BRANCHJ(flagp, 0);
13436 if (RExC_use_BRANCHJ) {
13437 reginsert(pRExC_state, LONGJMP, ret, depth+1);
13438 reginsert(pRExC_state, NOTHING, ret, depth+1);
13439 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
13441 reginsert(pRExC_state, CURLYX, ret, depth+1);
13443 if (RExC_use_BRANCHJ)
13444 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
13446 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
13449 REQUIRE_BRANCHJ(flagp, 0);
13451 RExC_whilem_seen++;
13452 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
13455 /* Finish up the CURLY/CURLYX case */
13456 FLAGS(REGNODE_p(ret)) = 0;
13458 ARG1_SET(REGNODE_p(ret), (U16)min);
13459 ARG2_SET(REGNODE_p(ret), (U16)max);
13463 /* Process any greediness modifiers */
13464 if (*RExC_parse == '?') {
13465 nextchar(pRExC_state);
13466 reginsert(pRExC_state, MINMOD, ret, depth+1);
13467 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
13468 REQUIRE_BRANCHJ(flagp, 0);
13471 else if (*RExC_parse == '+') {
13472 regnode_offset ender;
13473 nextchar(pRExC_state);
13474 ender = reg_node(pRExC_state, SUCCEED);
13475 if (! REGTAIL(pRExC_state, ret, ender)) {
13476 REQUIRE_BRANCHJ(flagp, 0);
13478 reginsert(pRExC_state, SUSPEND, ret, depth+1);
13479 ender = reg_node(pRExC_state, TAIL);
13480 if (! REGTAIL(pRExC_state, ret, ender)) {
13481 REQUIRE_BRANCHJ(flagp, 0);
13485 /* Forbid extra quantifiers */
13486 if (isQUANTIFIER(RExC_parse, RExC_end)) {
13487 RExC_parse_inc_by(1);
13488 vFAIL("Nested quantifiers");
13493 min0_maxINF_wildcard_forbidden:
13495 /* Here we are in a wildcard match, and the minimum match length is 0, and
13496 * the max could be infinity. This is currently forbidden. The only
13497 * reason is to make it harder to write patterns that take a long long time
13498 * to halt, and because the use of this construct isn't necessary in
13499 * matching Unicode property values */
13500 RExC_parse_inc_by(1);
13501 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
13502 subpatterns in regex; marked by <-- HERE in m/%s/
13504 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
13507 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
13508 * legal at all in wildcards, so can't get this far */
13510 NOT_REACHED; /*NOTREACHED*/
13514 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
13515 regnode_offset * node_p,
13523 /* This routine teases apart the various meanings of \N and returns
13524 * accordingly. The input parameters constrain which meaning(s) is/are valid
13525 * in the current context.
13527 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
13529 * If <code_point_p> is not NULL, the context is expecting the result to be a
13530 * single code point. If this \N instance turns out to a single code point,
13531 * the function returns TRUE and sets *code_point_p to that code point.
13533 * If <node_p> is not NULL, the context is expecting the result to be one of
13534 * the things representable by a regnode. If this \N instance turns out to be
13535 * one such, the function generates the regnode, returns TRUE and sets *node_p
13536 * to point to the offset of that regnode into the regex engine program being
13539 * If this instance of \N isn't legal in any context, this function will
13540 * generate a fatal error and not return.
13542 * On input, RExC_parse should point to the first char following the \N at the
13543 * time of the call. On successful return, RExC_parse will have been updated
13544 * to point to just after the sequence identified by this routine. Also
13545 * *flagp has been updated as needed.
13547 * When there is some problem with the current context and this \N instance,
13548 * the function returns FALSE, without advancing RExC_parse, nor setting
13549 * *node_p, nor *code_point_p, nor *flagp.
13551 * If <cp_count> is not NULL, the caller wants to know the length (in code
13552 * points) that this \N sequence matches. This is set, and the input is
13553 * parsed for errors, even if the function returns FALSE, as detailed below.
13555 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
13557 * Probably the most common case is for the \N to specify a single code point.
13558 * *cp_count will be set to 1, and *code_point_p will be set to that code
13561 * Another possibility is for the input to be an empty \N{}. This is no
13562 * longer accepted, and will generate a fatal error.
13564 * Another possibility is for a custom charnames handler to be in effect which
13565 * translates the input name to an empty string. *cp_count will be set to 0.
13566 * *node_p will be set to a generated NOTHING node.
13568 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
13569 * set to 0. *node_p will be set to a generated REG_ANY node.
13571 * The fifth possibility is that \N resolves to a sequence of more than one
13572 * code points. *cp_count will be set to the number of code points in the
13573 * sequence. *node_p will be set to a generated node returned by this
13574 * function calling S_reg().
13576 * The sixth and final possibility is that it is premature to be calling this
13577 * function; the parse needs to be restarted. This can happen when this
13578 * changes from /d to /u rules, or when the pattern needs to be upgraded to
13579 * UTF-8. The latter occurs only when the fifth possibility would otherwise
13580 * be in effect, and is because one of those code points requires the pattern
13581 * to be recompiled as UTF-8. The function returns FALSE, and sets the
13582 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
13583 * happens, the caller needs to desist from continuing parsing, and return
13584 * this information to its caller. This is not set for when there is only one
13585 * code point, as this can be called as part of an ANYOF node, and they can
13586 * store above-Latin1 code points without the pattern having to be in UTF-8.
13588 * For non-single-quoted regexes, the tokenizer has resolved character and
13589 * sequence names inside \N{...} into their Unicode values, normalizing the
13590 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
13591 * hex-represented code points in the sequence. This is done there because
13592 * the names can vary based on what charnames pragma is in scope at the time,
13593 * so we need a way to take a snapshot of what they resolve to at the time of
13594 * the original parse. [perl #56444].
13596 * That parsing is skipped for single-quoted regexes, so here we may get
13597 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
13598 * like '\N{U+41}', that code point is Unicode, and has to be translated into
13599 * the native character set for non-ASCII platforms. The other possibilities
13600 * are already native, so no translation is done. */
13602 char * endbrace; /* points to '}' following the name */
13603 char * e; /* points to final non-blank before endbrace */
13604 char* p = RExC_parse; /* Temporary */
13606 SV * substitute_parse = NULL;
13611 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13613 PERL_ARGS_ASSERT_GROK_BSLASH_N;
13615 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
13616 assert(! (node_p && cp_count)); /* At most 1 should be set */
13618 if (cp_count) { /* Initialize return for the most common case */
13622 /* The [^\n] meaning of \N ignores spaces and comments under the /x
13623 * modifier. The other meanings do not (except blanks adjacent to and
13624 * within the braces), so use a temporary until we find out which we are
13625 * being called with */
13626 skip_to_be_ignored_text(pRExC_state, &p,
13627 FALSE /* Don't force to /x */ );
13629 /* Disambiguate between \N meaning a named character versus \N meaning
13630 * [^\n]. The latter is assumed when the {...} following the \N is a legal
13631 * quantifier, or if there is no '{' at all */
13632 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
13642 *node_p = reg_node(pRExC_state, REG_ANY);
13643 *flagp |= HASWIDTH|SIMPLE;
13648 /* The test above made sure that the next real character is a '{', but
13649 * under the /x modifier, it could be separated by space (or a comment and
13650 * \n) and this is not allowed (for consistency with \x{...} and the
13651 * tokenizer handling of \N{NAME}). */
13652 if (*RExC_parse != '{') {
13653 vFAIL("Missing braces on \\N{}");
13656 RExC_parse_inc_by(1); /* Skip past the '{' */
13658 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13659 if (! endbrace) { /* no trailing brace */
13660 vFAIL2("Missing right brace on \\%c{}", 'N');
13663 /* Here, we have decided it should be a named character or sequence. These
13664 * imply Unicode semantics */
13665 REQUIRE_UNI_RULES(flagp, FALSE);
13667 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13668 * nothing at all (not allowed under strict) */
13669 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13670 RExC_parse_set(endbrace);
13672 RExC_parse_inc_by(1); /* Position after the "}" */
13673 vFAIL("Zero length \\N{}");
13679 nextchar(pRExC_state);
13684 *node_p = reg_node(pRExC_state, NOTHING);
13688 while (isBLANK(*RExC_parse)) {
13689 RExC_parse_inc_by(1);
13693 while (RExC_parse < e && isBLANK(*(e-1))) {
13697 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13699 /* Here, the name isn't of the form U+.... This can happen if the
13700 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13701 * is the time to find out what the name means */
13703 const STRLEN name_len = e - RExC_parse;
13704 SV * value_sv; /* What does this name evaluate to */
13706 const U8 * value; /* string of name's value */
13707 STRLEN value_len; /* and its length */
13709 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13710 * toke.c, and their values. Make sure is initialized */
13711 if (! RExC_unlexed_names) {
13712 RExC_unlexed_names = newHV();
13715 /* If we have already seen this name in this pattern, use that. This
13716 * allows us to only call the charnames handler once per name per
13717 * pattern. A broken or malicious handler could return something
13718 * different each time, which could cause the results to vary depending
13719 * on if something gets added or subtracted from the pattern that
13720 * causes the number of passes to change, for example */
13721 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13724 value_sv = *value_svp;
13726 else { /* Otherwise we have to go out and get the name */
13727 const char * error_msg = NULL;
13728 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
13732 RExC_parse_set(endbrace);
13736 /* If no error message, should have gotten a valid return */
13739 /* Save the name's meaning for later use */
13740 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13743 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13747 /* Here, we have the value the name evaluates to in 'value_sv' */
13748 value = (U8 *) SvPV(value_sv, value_len);
13750 /* See if the result is one code point vs 0 or multiple */
13751 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13755 /* Here, exactly one code point. If that isn't what is wanted,
13757 if (! code_point_p) {
13762 /* Convert from string to numeric code point */
13763 *code_point_p = (SvUTF8(value_sv))
13764 ? valid_utf8_to_uvchr(value, NULL)
13767 /* Have parsed this entire single code point \N{...}. *cp_count
13768 * has already been set to 1, so don't do it again. */
13769 RExC_parse_set(endbrace);
13770 nextchar(pRExC_state);
13772 } /* End of is a single code point */
13774 /* Count the code points, if caller desires. The API says to do this
13775 * even if we will later return FALSE */
13779 *cp_count = (SvUTF8(value_sv))
13780 ? utf8_length(value, value + value_len)
13784 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13785 * But don't back the pointer up if the caller wants to know how many
13786 * code points there are (they need to handle it themselves in this
13795 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13796 * reg recursively to parse it. That way, it retains its atomicness,
13797 * while not having to worry about any special handling that some code
13798 * points may have. */
13800 substitute_parse = newSVpvs("?:");
13801 sv_catsv(substitute_parse, value_sv);
13802 sv_catpv(substitute_parse, ")");
13804 /* The value should already be native, so no need to convert on EBCDIC
13806 assert(! RExC_recode_x_to_native);
13809 else { /* \N{U+...} */
13810 Size_t count = 0; /* code point count kept internally */
13812 /* We can get to here when the input is \N{U+...} or when toke.c has
13813 * converted a name to the \N{U+...} form. This include changing a
13814 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13816 RExC_parse_inc_by(2); /* Skip past the 'U+' */
13818 /* Code points are separated by dots. The '}' terminates the whole
13821 do { /* Loop until the ending brace */
13822 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13823 | PERL_SCAN_SILENT_ILLDIGIT
13824 | PERL_SCAN_NOTIFY_ILLDIGIT
13825 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13826 | PERL_SCAN_DISALLOW_PREFIX;
13827 STRLEN len = e - RExC_parse;
13829 char * start_digit = RExC_parse;
13830 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13833 RExC_parse_inc_by(1);
13835 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13838 RExC_parse_inc_by(len);
13840 if (cp > MAX_LEGAL_CP) {
13841 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13844 if (RExC_parse >= e) { /* Got to the closing '}' */
13849 /* Here, is a single code point; fail if doesn't want that */
13850 if (! code_point_p) {
13855 /* A single code point is easy to handle; just return it */
13856 *code_point_p = UNI_TO_NATIVE(cp);
13857 RExC_parse_set(endbrace);
13858 nextchar(pRExC_state);
13862 /* Here, the parse stopped bfore the ending brace. This is legal
13863 * only if that character is a dot separating code points, like a
13864 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13865 * So the next character must be a dot (and the one after that
13866 * can't be the ending brace, or we'd have something like
13869 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
13870 /*point to after 1st invalid */
13871 RExC_parse_incf(RExC_orig_utf8);
13872 /*Guard against malformed utf8*/
13873 RExC_parse_set(MIN(e, RExC_parse));
13877 /* Here, looks like its really a multiple character sequence. Fail
13878 * if that's not what the caller wants. But continue with counting
13879 * and error checking if they still want a count */
13880 if (! node_p && ! cp_count) {
13884 /* What is done here is to convert this to a sub-pattern of the
13885 * form \x{char1}\x{char2}... and then call reg recursively to
13886 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13887 * atomicness, while not having to worry about special handling
13888 * that some code points may have. We don't create a subpattern,
13889 * but go through the motions of code point counting and error
13890 * checking, if the caller doesn't want a node returned. */
13892 if (node_p && ! substitute_parse) {
13893 substitute_parse = newSVpvs("?:");
13899 /* Convert to notation the rest of the code understands */
13900 sv_catpvs(substitute_parse, "\\x{");
13901 sv_catpvn(substitute_parse, start_digit,
13902 RExC_parse - start_digit);
13903 sv_catpvs(substitute_parse, "}");
13906 /* Move to after the dot (or ending brace the final time through.)
13908 RExC_parse_inc_by(1);
13911 } while (RExC_parse < e);
13913 if (! node_p) { /* Doesn't want the node */
13920 sv_catpvs(substitute_parse, ")");
13922 /* The values are Unicode, and therefore have to be converted to native
13923 * on a non-Unicode (meaning non-ASCII) platform. */
13924 SET_recode_x_to_native(1);
13927 /* Here, we have the string the name evaluates to, ready to be parsed,
13928 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13929 * constructs. This can be called from within a substitute parse already.
13930 * The error reporting mechanism doesn't work for 2 levels of this, but the
13931 * code above has validated this new construct, so there should be no
13932 * errors generated by the below. And this isn't an exact copy, so the
13933 * mechanism to seamlessly deal with this won't work, so turn off warnings
13935 save_start = RExC_start;
13936 orig_end = RExC_end;
13938 RExC_start = SvPVX(substitute_parse);
13939 RExC_parse_set(RExC_start);
13940 RExC_end = RExC_parse + SvCUR(substitute_parse);
13941 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13943 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13945 /* Restore the saved values */
13947 RExC_start = save_start;
13948 RExC_parse_set(endbrace);
13949 RExC_end = orig_end;
13950 SET_recode_x_to_native(0);
13952 SvREFCNT_dec_NN(substitute_parse);
13955 RETURN_FAIL_ON_RESTART(flags, flagp);
13956 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13959 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13961 nextchar(pRExC_state);
13968 S_compute_EXACTish(RExC_state_t *pRExC_state)
13972 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13980 op = get_regex_charset(RExC_flags);
13981 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13982 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13983 been, so there is no hole */
13986 return op + EXACTF;
13989 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13990 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13993 S_backref_value(char *p, char *e)
13995 const char* endptr = e;
13997 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
14003 #define REGNODE_GUTS(state,op,extra_size) \
14004 regnode_guts_debug(state,op,extra_size)
14006 #define REGNODE_GUTS(state,op,extra_size) \
14007 regnode_guts(state,extra_size)
14012 - regatom - the lowest level
14014 Try to identify anything special at the start of the current parse position.
14015 If there is, then handle it as required. This may involve generating a
14016 single regop, such as for an assertion; or it may involve recursing, such as
14017 to handle a () structure.
14019 If the string doesn't start with something special then we gobble up
14020 as much literal text as we can. If we encounter a quantifier, we have to
14021 back off the final literal character, as that quantifier applies to just it
14022 and not to the whole string of literals.
14024 Once we have been able to handle whatever type of thing started the
14025 sequence, we return the offset into the regex engine program being compiled
14026 at which any next regnode should be placed.
14028 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
14029 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
14030 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
14031 Otherwise does not return 0.
14033 Note: we have to be careful with escapes, as they can be both literal
14034 and special, and in the case of \10 and friends, context determines which.
14036 A summary of the code structure is:
14038 switch (first_byte) {
14039 cases for each special:
14040 handle this special;
14043 switch (2nd byte) {
14044 cases for each unambiguous special:
14045 handle this special;
14047 cases for each ambigous special/literal:
14049 if (special) handle here
14051 default: // unambiguously literal:
14054 default: // is a literal char
14057 create EXACTish node for literal;
14058 while (more input and node isn't full) {
14059 switch (input_byte) {
14060 cases for each special;
14061 make sure parse pointer is set so that the next call to
14062 regatom will see this special first
14063 goto loopdone; // EXACTish node terminated by prev. char
14065 append char to EXACTISH node;
14067 get next input byte;
14071 return the generated node;
14073 Specifically there are two separate switches for handling
14074 escape sequences, with the one for handling literal escapes requiring
14075 a dummy entry for all of the special escapes that are actually handled
14080 STATIC regnode_offset
14081 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
14083 regnode_offset ret = 0;
14085 char *atom_parse_start;
14089 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14091 *flagp = 0; /* Initialize. */
14093 DEBUG_PARSE("atom");
14095 PERL_ARGS_ASSERT_REGATOM;
14098 atom_parse_start = RExC_parse;
14099 assert(RExC_parse < RExC_end);
14100 switch ((U8)*RExC_parse) {
14102 RExC_seen_zerolen++;
14103 nextchar(pRExC_state);
14104 if (RExC_flags & RXf_PMf_MULTILINE)
14105 ret = reg_node(pRExC_state, MBOL);
14107 ret = reg_node(pRExC_state, SBOL);
14110 nextchar(pRExC_state);
14112 RExC_seen_zerolen++;
14113 if (RExC_flags & RXf_PMf_MULTILINE)
14114 ret = reg_node(pRExC_state, MEOL);
14116 ret = reg_node(pRExC_state, SEOL);
14119 nextchar(pRExC_state);
14120 if (RExC_flags & RXf_PMf_SINGLELINE)
14121 ret = reg_node(pRExC_state, SANY);
14123 ret = reg_node(pRExC_state, REG_ANY);
14124 *flagp |= HASWIDTH|SIMPLE;
14129 char * const cc_parse_start = ++RExC_parse;
14130 ret = regclass(pRExC_state, flagp, depth+1,
14131 FALSE, /* means parse the whole char class */
14132 TRUE, /* allow multi-char folds */
14133 FALSE, /* don't silence non-portable warnings. */
14134 (bool) RExC_strict,
14135 TRUE, /* Allow an optimized regnode result */
14138 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14139 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
14142 if (*RExC_parse != ']') {
14143 RExC_parse_set(cc_parse_start);
14144 vFAIL("Unmatched [");
14146 nextchar(pRExC_state);
14150 nextchar(pRExC_state);
14151 ret = reg(pRExC_state, 2, &flags, depth+1);
14153 if (flags & TRYAGAIN) {
14154 if (RExC_parse >= RExC_end) {
14155 /* Make parent create an empty node if needed. */
14156 *flagp |= TRYAGAIN;
14161 RETURN_FAIL_ON_RESTART(flags, flagp);
14162 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
14165 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
14169 if (flags & TRYAGAIN) {
14170 *flagp |= TRYAGAIN;
14173 vFAIL("Internal urp");
14174 /* Supposed to be caught earlier. */
14179 RExC_parse_inc_by(1);
14180 vFAIL("Quantifier follows nothing");
14185 This switch handles escape sequences that resolve to some kind
14186 of special regop and not to literal text. Escape sequences that
14187 resolve to literal text are handled below in the switch marked
14190 Every entry in this switch *must* have a corresponding entry
14191 in the literal escape switch. However, the opposite is not
14192 required, as the default for this switch is to jump to the
14193 literal text handling code.
14195 RExC_parse_inc_by(1);
14196 switch ((U8)*RExC_parse) {
14197 /* Special Escapes */
14199 RExC_seen_zerolen++;
14200 /* Under wildcards, this is changed to match \n; should be
14201 * invisible to the user, as they have to compile under /m */
14202 if (RExC_pm_flags & PMf_WILDCARD) {
14203 ret = reg_node(pRExC_state, MBOL);
14206 ret = reg_node(pRExC_state, SBOL);
14207 /* SBOL is shared with /^/ so we set the flags so we can tell
14208 * /\A/ from /^/ in split. */
14209 FLAGS(REGNODE_p(ret)) = 1;
14211 goto finish_meta_pat;
14213 if (RExC_pm_flags & PMf_WILDCARD) {
14214 RExC_parse_inc_by(1);
14215 /* diag_listed_as: Use of %s is not allowed in Unicode property
14216 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
14218 vFAIL("Use of '\\G' is not allowed in Unicode property"
14219 " wildcard subpatterns");
14221 ret = reg_node(pRExC_state, GPOS);
14222 RExC_seen |= REG_GPOS_SEEN;
14223 goto finish_meta_pat;
14225 if (!RExC_in_lookaround) {
14226 RExC_seen_zerolen++;
14227 ret = reg_node(pRExC_state, KEEPS);
14228 /* XXX:dmq : disabling in-place substitution seems to
14229 * be necessary here to avoid cases of memory corruption, as
14230 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
14232 RExC_seen |= REG_LOOKBEHIND_SEEN;
14233 goto finish_meta_pat;
14236 ++RExC_parse; /* advance past the 'K' */
14237 vFAIL("\\K not permitted in lookahead/lookbehind");
14240 if (RExC_pm_flags & PMf_WILDCARD) {
14241 /* See comment under \A above */
14242 ret = reg_node(pRExC_state, MEOL);
14245 ret = reg_node(pRExC_state, SEOL);
14247 RExC_seen_zerolen++; /* Do not optimize RE away */
14248 goto finish_meta_pat;
14250 if (RExC_pm_flags & PMf_WILDCARD) {
14251 /* See comment under \A above */
14252 ret = reg_node(pRExC_state, MEOL);
14255 ret = reg_node(pRExC_state, EOS);
14257 RExC_seen_zerolen++; /* Do not optimize RE away */
14258 goto finish_meta_pat;
14260 vFAIL("\\C no longer supported");
14262 ret = reg_node(pRExC_state, CLUMP);
14263 *flagp |= HASWIDTH;
14264 goto finish_meta_pat;
14272 regex_charset charset = get_regex_charset(RExC_flags);
14274 RExC_seen_zerolen++;
14275 RExC_seen |= REG_LOOKBEHIND_SEEN;
14276 op = BOUND + charset;
14278 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
14279 flags = TRADITIONAL_BOUND;
14280 if (op > BOUNDA) { /* /aa is same as /a */
14286 char name = *RExC_parse;
14287 char * endbrace = (char *) memchr(RExC_parse, '}',
14288 RExC_end - RExC_parse);
14289 char * e = endbrace;
14291 RExC_parse_inc_by(2);
14294 vFAIL2("Missing right brace on \\%c{}", name);
14297 while (isBLANK(*RExC_parse)) {
14298 RExC_parse_inc_by(1);
14301 while (RExC_parse < e && isBLANK(*(e - 1))) {
14305 if (e == RExC_parse) {
14306 RExC_parse_set(endbrace + 1); /* After the '}' */
14307 vFAIL2("Empty \\%c{}", name);
14310 length = e - RExC_parse;
14312 switch (*RExC_parse) {
14315 && (memNEs(RExC_parse + 1, length - 1, "cb")))
14317 goto bad_bound_type;
14322 if (length != 2 || *(RExC_parse + 1) != 'b') {
14323 goto bad_bound_type;
14328 if (length != 2 || *(RExC_parse + 1) != 'b') {
14329 goto bad_bound_type;
14334 if (length != 2 || *(RExC_parse + 1) != 'b') {
14335 goto bad_bound_type;
14343 "'%" UTF8f "' is an unknown bound type",
14344 UTF8fARG(UTF, length, e - length));
14345 NOT_REACHED; /*NOTREACHED*/
14347 RExC_parse_set(endbrace);
14348 REQUIRE_UNI_RULES(flagp, 0);
14353 else if (op >= BOUNDA) { /* /aa is same as /a */
14357 /* Don't have to worry about UTF-8, in this message because
14358 * to get here the contents of the \b must be ASCII */
14359 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
14360 "Using /u for '%.*s' instead of /%s",
14362 endbrace - length + 1,
14363 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
14364 ? ASCII_RESTRICT_PAT_MODS
14365 : ASCII_MORE_RESTRICT_PAT_MODS);
14370 RExC_seen_d_op = TRUE;
14372 else if (op == BOUNDL) {
14373 RExC_contains_locale = 1;
14377 op += NBOUND - BOUND;
14380 ret = reg_node(pRExC_state, op);
14381 FLAGS(REGNODE_p(ret)) = flags;
14383 goto finish_meta_pat;
14387 ret = reg_node(pRExC_state, LNBREAK);
14388 *flagp |= HASWIDTH|SIMPLE;
14389 goto finish_meta_pat;
14403 /* These all have the same meaning inside [brackets], and it knows
14404 * how to do the best optimizations for them. So, pretend we found
14405 * these within brackets, and let it do the work */
14408 ret = regclass(pRExC_state, flagp, depth+1,
14409 TRUE, /* means just parse this element */
14410 FALSE, /* don't allow multi-char folds */
14411 FALSE, /* don't silence non-portable warnings. It
14412 would be a bug if these returned
14414 (bool) RExC_strict,
14415 TRUE, /* Allow an optimized regnode result */
14417 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14418 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
14419 * multi-char folds are allowed. */
14421 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
14424 RExC_parse--; /* regclass() leaves this one too far ahead */
14427 /* The escapes above that don't take a parameter can't be
14428 * followed by a '{'. But 'pX', 'p{foo}' and
14429 * correspondingly 'P' can be */
14430 if ( RExC_parse - atom_parse_start == 1
14431 && UCHARAT(RExC_parse + 1) == '{'
14432 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
14434 RExC_parse_inc_by(2);
14435 vFAIL("Unescaped left brace in regex is illegal here");
14437 nextchar(pRExC_state);
14440 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
14441 * \N{...} evaluates to a sequence of more than one code points).
14442 * The function call below returns a regnode, which is our result.
14443 * The parameters cause it to fail if the \N{} evaluates to a
14444 * single code point; we handle those like any other literal. The
14445 * reason that the multicharacter case is handled here and not as
14446 * part of the EXACtish code is because of quantifiers. In
14447 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
14448 * this way makes that Just Happen. dmq.
14449 * join_exact() will join this up with adjacent EXACTish nodes
14450 * later on, if appropriate. */
14452 if (grok_bslash_N(pRExC_state,
14453 &ret, /* Want a regnode returned */
14454 NULL, /* Fail if evaluates to a single code
14456 NULL, /* Don't need a count of how many code
14465 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14467 /* Here, evaluates to a single code point. Go get that */
14468 RExC_parse_set(atom_parse_start);
14471 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
14472 parse_named_seq: /* Also handle non-numeric \g{...} */
14475 if ( RExC_parse >= RExC_end - 1
14476 || (( ch = RExC_parse[1]) != '<'
14480 RExC_parse_inc_by(1);
14481 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
14482 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
14484 RExC_parse_inc_by(2);
14486 while (isBLANK(*RExC_parse)) {
14487 RExC_parse_inc_by(1);
14490 ret = handle_named_backref(pRExC_state,
14502 case '1': case '2': case '3': case '4':
14503 case '5': case '6': case '7': case '8': case '9':
14506 char * endbrace = NULL;
14507 char * s = RExC_parse;
14508 char * e = RExC_end;
14515 endbrace = (char *) memchr(s, '}', RExC_end - s);
14518 /* Missing '}'. Position after the number to give
14519 * a better indication to the user of where the
14526 /* If it looks to be a name and not a number, go
14527 * handle it there */
14528 if (! isDIGIT(*s)) {
14529 goto parse_named_seq;
14534 } while isDIGIT(*s);
14537 vFAIL("Unterminated \\g{...} pattern");
14540 s++; /* Past the '{' */
14542 while (isBLANK(*s)) {
14546 /* Ignore trailing blanks */
14548 while (s < e && isBLANK(*(e - 1))) {
14553 /* Here, have isolated the meat of the construct from any
14554 * surrounding braces */
14561 if (endbrace && !isDIGIT(*s)) {
14562 goto parse_named_seq;
14566 num = S_backref_value(RExC_parse, RExC_end);
14568 vFAIL("Reference to invalid group 0");
14569 else if (num == I32_MAX) {
14570 if (isDIGIT(*RExC_parse))
14571 vFAIL("Reference to nonexistent group");
14573 vFAIL("Unterminated \\g... pattern");
14577 num = RExC_npar - num;
14579 vFAIL("Reference to nonexistent or unclosed group");
14583 num = S_backref_value(RExC_parse, RExC_end);
14584 /* bare \NNN might be backref or octal - if it is larger
14585 * than or equal RExC_npar then it is assumed to be an
14586 * octal escape. Note RExC_npar is +1 from the actual
14587 * number of parens. */
14588 /* Note we do NOT check if num == I32_MAX here, as that is
14589 * handled by the RExC_npar check */
14591 if ( /* any numeric escape < 10 is always a backref */
14593 /* any numeric escape < RExC_npar is a backref */
14594 && num >= RExC_npar
14595 /* cannot be an octal escape if it starts with [89]
14597 && ! inRANGE(*RExC_parse, '8', '9')
14599 /* Probably not meant to be a backref, instead likely
14600 * to be an octal character escape, e.g. \35 or \777.
14601 * The above logic should make it obvious why using
14602 * octal escapes in patterns is problematic. - Yves */
14603 RExC_parse_set(atom_parse_start);
14608 /* At this point RExC_parse points at a numeric escape like
14609 * \12 or \88 or the digits in \g{34} or \g34 or something
14610 * similar, which we should NOT treat as an octal escape. It
14611 * may or may not be a valid backref escape. For instance
14612 * \88888888 is unlikely to be a valid backref.
14614 * We've already figured out what value the digits represent.
14615 * Now, move the parse to beyond them. */
14617 RExC_parse_set(endbrace + 1);
14619 else while (isDIGIT(*RExC_parse)) {
14620 RExC_parse_inc_by(1);
14623 if (num >= (I32)RExC_npar) {
14625 /* It might be a forward reference; we can't fail until we
14626 * know, by completing the parse to get all the groups, and
14627 * then reparsing */
14628 if (ALL_PARENS_COUNTED) {
14629 if (num >= RExC_total_parens) {
14630 vFAIL("Reference to nonexistent group");
14634 REQUIRE_PARENS_PASS;
14638 ret = reganode(pRExC_state,
14641 : (ASCII_FOLD_RESTRICTED)
14643 : (AT_LEAST_UNI_SEMANTICS)
14649 if (OP(REGNODE_p(ret)) == REFF) {
14650 RExC_seen_d_op = TRUE;
14652 *flagp |= HASWIDTH;
14654 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14655 FALSE /* Don't force to /x */ );
14659 if (RExC_parse >= RExC_end)
14660 FAIL("Trailing \\");
14663 /* Do not generate "unrecognized" warnings here, we fall
14664 back into the quick-grab loop below */
14665 RExC_parse_set(atom_parse_start);
14667 } /* end of switch on a \foo sequence */
14672 /* '#' comments should have been spaced over before this function was
14674 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14676 if (RExC_flags & RXf_PMf_EXTENDED) {
14677 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
14678 if (RExC_parse < RExC_end)
14688 /* Here, we have determined that the next thing is probably a
14689 * literal character. RExC_parse points to the first byte of its
14690 * definition. (It still may be an escape sequence that evaluates
14691 * to a single character) */
14696 char *s, *old_s = NULL, *old_old_s = NULL;
14698 U32 max_string_len = 255;
14700 /* We may have to reparse the node, artificially stopping filling
14701 * it early, based on info gleaned in the first parse. This
14702 * variable gives where we stop. Make it above the normal stopping
14703 * place first time through; otherwise it would stop too early */
14704 U32 upper_fill = max_string_len + 1;
14706 /* We start out as an EXACT node, even if under /i, until we find a
14707 * character which is in a fold. The algorithm now segregates into
14708 * separate nodes, characters that fold from those that don't under
14709 * /i. (This hopefully will create nodes that are fixed strings
14710 * even under /i, giving the optimizer something to grab on to.)
14711 * So, if a node has something in it and the next character is in
14712 * the opposite category, that node is closed up, and the function
14713 * returns. Then regatom is called again, and a new node is
14714 * created for the new category. */
14715 U8 node_type = EXACT;
14717 /* Assume the node will be fully used; the excess is given back at
14718 * the end. Under /i, we may need to temporarily add the fold of
14719 * an extra character or two at the end to check for splitting
14720 * multi-char folds, so allocate extra space for that. We can't
14721 * make any other length assumptions, as a byte input sequence
14722 * could shrink down. */
14723 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14727 ? UTF8_MAXBYTES_CASE
14728 /* Max non-UTF-8 expansion is 2 */ : 2)));
14730 bool next_is_quantifier;
14731 char * oldp = NULL;
14733 /* We can convert EXACTF nodes to EXACTFU if they contain only
14734 * characters that match identically regardless of the target
14735 * string's UTF8ness. The reason to do this is that EXACTF is not
14736 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14739 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14740 * contain only above-Latin1 characters (hence must be in UTF8),
14741 * which don't participate in folds with Latin1-range characters,
14742 * as the latter's folds aren't known until runtime. */
14743 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14745 /* Single-character EXACTish nodes are almost always SIMPLE. This
14746 * allows us to override this as encountered */
14747 U8 maybe_SIMPLE = SIMPLE;
14749 /* Does this node contain something that can't match unless the
14750 * target string is (also) in UTF-8 */
14751 bool requires_utf8_target = FALSE;
14753 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14754 bool has_ss = FALSE;
14756 /* So is the MICRO SIGN */
14757 bool has_micro_sign = FALSE;
14759 /* Set when we fill up the current node and there is still more
14760 * text to process */
14763 /* Allocate an EXACT node. The node_type may change below to
14764 * another EXACTish node, but since the size of the node doesn't
14765 * change, it works */
14766 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
14767 FILL_NODE(ret, node_type);
14768 RExC_emit += NODE_STEP_REGNODE;
14770 s = STRING(REGNODE_p(ret));
14781 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14782 maybe_SIMPLE = SIMPLE;
14783 requires_utf8_target = FALSE;
14785 has_micro_sign = FALSE;
14789 /* This breaks under rare circumstances. If folding, we do not
14790 * want to split a node at a character that is a non-final in a
14791 * multi-char fold, as an input string could just happen to want to
14792 * match across the node boundary. The code at the end of the loop
14793 * looks for this, and backs off until it finds not such a
14794 * character, but it is possible (though extremely, extremely
14795 * unlikely) for all characters in the node to be non-final fold
14796 * ones, in which case we just leave the node fully filled, and
14797 * hope that it doesn't match the string in just the wrong place */
14799 assert( ! UTF /* Is at the beginning of a character */
14800 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14801 || UTF8_IS_START(UCHARAT(RExC_parse)));
14803 overflowed = FALSE;
14805 /* Here, we have a literal character. Find the maximal string of
14806 * them in the input that we can fit into a single EXACTish node.
14807 * We quit at the first non-literal or when the node gets full, or
14808 * under /i the categorization of folding/non-folding character
14810 while (p < RExC_end && len < upper_fill) {
14812 /* In most cases each iteration adds one byte to the output.
14813 * The exceptions override this */
14814 Size_t added_len = 1;
14820 /* White space has already been ignored */
14821 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14822 || ! is_PATWS_safe((p), RExC_end, UTF));
14825 const char* message;
14838 /* Literal Escapes Switch
14840 This switch is meant to handle escape sequences that
14841 resolve to a literal character.
14843 Every escape sequence that represents something
14844 else, like an assertion or a char class, is handled
14845 in the switch marked 'Special Escapes' above in this
14846 routine, but also has an entry here as anything that
14847 isn't explicitly mentioned here will be treated as
14848 an unescaped equivalent literal.
14851 switch ((U8)*++p) {
14853 /* These are all the special escapes. */
14854 case 'A': /* Start assertion */
14855 case 'b': case 'B': /* Word-boundary assertion*/
14856 case 'C': /* Single char !DANGEROUS! */
14857 case 'd': case 'D': /* digit class */
14858 case 'g': case 'G': /* generic-backref, pos assertion */
14859 case 'h': case 'H': /* HORIZWS */
14860 case 'k': case 'K': /* named backref, keep marker */
14861 case 'p': case 'P': /* Unicode property */
14862 case 'R': /* LNBREAK */
14863 case 's': case 'S': /* space class */
14864 case 'v': case 'V': /* VERTWS */
14865 case 'w': case 'W': /* word class */
14866 case 'X': /* eXtended Unicode "combining
14867 character sequence" */
14868 case 'z': case 'Z': /* End of line/string assertion */
14872 /* Anything after here is an escape that resolves to a
14873 literal. (Except digits, which may or may not)
14879 case 'N': /* Handle a single-code point named character. */
14880 RExC_parse_set( p + 1 );
14881 if (! grok_bslash_N(pRExC_state,
14882 NULL, /* Fail if evaluates to
14883 anything other than a
14884 single code point */
14885 &ender, /* The returned single code
14887 NULL, /* Don't need a count of
14888 how many code points */
14893 if (*flagp & NEED_UTF8)
14894 FAIL("panic: grok_bslash_N set NEED_UTF8");
14895 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14897 /* Here, it wasn't a single code point. Go close
14898 * up this EXACTish node. The switch() prior to
14899 * this switch handles the other cases */
14905 RExC_parse_set(atom_parse_start);
14907 /* The \N{} means the pattern, if previously /d,
14908 * becomes /u. That means it can't be an EXACTF node,
14909 * but an EXACTFU */
14910 if (node_type == EXACTF) {
14911 node_type = EXACTFU;
14913 /* If the node already contains something that
14914 * differs between EXACTF and EXACTFU, reparse it
14916 if (! maybe_exactfu) {
14937 ender = ESC_NATIVE;
14945 if (! grok_bslash_o(&p,
14950 (bool) RExC_strict,
14951 FALSE, /* No illegal cp's */
14954 RExC_parse_set(p); /* going to die anyway; point to
14955 exact spot of failure */
14959 if (message && TO_OUTPUT_WARNINGS(p)) {
14960 warn_non_literal_string(p, packed_warn, message);
14964 if (! grok_bslash_x(&p,
14969 (bool) RExC_strict,
14970 FALSE, /* No illegal cp's */
14973 RExC_parse_set(p); /* going to die anyway; point
14974 to exact spot of failure */
14978 if (message && TO_OUTPUT_WARNINGS(p)) {
14979 warn_non_literal_string(p, packed_warn, message);
14983 if (ender < 0x100) {
14984 if (RExC_recode_x_to_native) {
14985 ender = LATIN1_TO_NATIVE(ender);
14992 if (! grok_bslash_c(*p, &grok_c_char,
14993 &message, &packed_warn))
14995 /* going to die anyway; point to exact spot of
14997 char *new_p= p + ((UTF)
14998 ? UTF8_SAFE_SKIP(p, RExC_end)
15000 RExC_parse_set(new_p);
15004 ender = grok_c_char;
15006 if (message && TO_OUTPUT_WARNINGS(p)) {
15007 warn_non_literal_string(p, packed_warn, message);
15011 case '8': case '9': /* must be a backreference */
15013 /* we have an escape like \8 which cannot be an octal escape
15014 * so we exit the loop, and let the outer loop handle this
15015 * escape which may or may not be a legitimate backref. */
15017 case '1': case '2': case '3':case '4':
15018 case '5': case '6': case '7':
15020 /* When we parse backslash escapes there is ambiguity
15021 * between backreferences and octal escapes. Any escape
15022 * from \1 - \9 is a backreference, any multi-digit
15023 * escape which does not start with 0 and which when
15024 * evaluated as decimal could refer to an already
15025 * parsed capture buffer is a back reference. Anything
15028 * Note this implies that \118 could be interpreted as
15029 * 118 OR as "\11" . "8" depending on whether there
15030 * were 118 capture buffers defined already in the
15033 /* NOTE, RExC_npar is 1 more than the actual number of
15034 * parens we have seen so far, hence the "<" as opposed
15036 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
15037 { /* Not to be treated as an octal constant, go
15045 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
15046 | PERL_SCAN_NOTIFY_ILLDIGIT;
15048 ender = grok_oct(p, &numlen, &flags, NULL);
15050 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
15051 && isDIGIT(*p) /* like \08, \178 */
15052 && ckWARN(WARN_REGEXP))
15054 reg_warn_non_literal_string(
15056 form_alien_digit_msg(8, numlen, p,
15057 RExC_end, UTF, FALSE));
15063 FAIL("Trailing \\");
15066 if (isALPHANUMERIC(*p)) {
15067 /* An alpha followed by '{' is going to fail next
15068 * iteration, so don't output this warning in that
15070 if (! isALPHA(*p) || *(p + 1) != '{') {
15071 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
15072 " passed through", p);
15075 goto normal_default;
15076 } /* End of switch on '\' */
15079 /* Trying to gain new uses for '{' without breaking too
15080 * much existing code is hard. The solution currently
15082 * 1) If there is no ambiguity that a '{' should always
15083 * be taken literally, at the start of a construct, we
15085 * 2) If the literal '{' conflicts with our desired use
15086 * of it as a metacharacter, we die. The deprecation
15087 * cycles for this have come and gone.
15088 * 3) If there is ambiguity, we raise a simple warning.
15089 * This could happen, for example, if the user
15090 * intended it to introduce a quantifier, but slightly
15091 * misspelled the quantifier. Without this warning,
15092 * the quantifier would silently be taken as a literal
15093 * string of characters instead of a meta construct */
15094 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
15096 || ( p > atom_parse_start + 1
15097 && isALPHA_A(*(p - 1))
15098 && *(p - 2) == '\\'))
15100 RExC_parse_set(p + 1);
15101 vFAIL("Unescaped left brace in regex is "
15104 ckWARNreg(p + 1, "Unescaped left brace in regex is"
15105 " passed through");
15107 goto normal_default;
15110 if (p > RExC_parse && RExC_strict) {
15111 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
15114 default: /* A literal character */
15116 if (! UTF8_IS_INVARIANT(*p) && UTF) {
15118 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
15119 &numlen, UTF8_ALLOW_DEFAULT);
15125 } /* End of switch on the literal */
15127 /* Here, have looked at the literal character, and <ender>
15128 * contains its ordinal; <p> points to the character after it.
15132 REQUIRE_UTF8(flagp);
15133 if ( UNICODE_IS_PERL_EXTENDED(ender)
15134 && TO_OUTPUT_WARNINGS(p))
15136 ckWARN2_non_literal_string(p,
15137 packWARN(WARN_PORTABLE),
15138 PL_extended_cp_format,
15143 /* We need to check if the next non-ignored thing is a
15144 * quantifier. Move <p> to after anything that should be
15145 * ignored, which, as a side effect, positions <p> for the next
15146 * loop iteration */
15147 skip_to_be_ignored_text(pRExC_state, &p,
15148 FALSE /* Don't force to /x */ );
15150 /* If the next thing is a quantifier, it applies to this
15151 * character only, which means that this character has to be in
15152 * its own node and can't just be appended to the string in an
15153 * existing node, so if there are already other characters in
15154 * the node, close the node with just them, and set up to do
15155 * this character again next time through, when it will be the
15156 * only thing in its new node */
15158 next_is_quantifier = LIKELY(p < RExC_end)
15159 && UNLIKELY(isQUANTIFIER(p, RExC_end));
15161 if (next_is_quantifier && LIKELY(len)) {
15166 /* Ready to add 'ender' to the node */
15168 if (! FOLD) { /* The simple case, just append the literal */
15171 /* Don't output if it would overflow */
15172 if (UNLIKELY(len > max_string_len - ((UTF)
15173 ? UVCHR_SKIP(ender)
15180 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
15181 *(s++) = (char) ender;
15184 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
15185 added_len = (char *) new_s - s;
15186 s = (char *) new_s;
15189 requires_utf8_target = TRUE;
15193 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
15195 /* Here are folding under /l, and the code point is
15196 * problematic. If this is the first character in the
15197 * node, change the node type to folding. Otherwise, if
15198 * this is the first problematic character, close up the
15199 * existing node, so can start a new node with this one */
15201 node_type = EXACTFL;
15202 RExC_contains_locale = 1;
15204 else if (node_type == EXACT) {
15209 /* This problematic code point means we can't simplify
15211 maybe_exactfu = FALSE;
15213 /* Although these two characters have folds that are
15214 * locale-problematic, they also have folds to above Latin1
15215 * that aren't a problem. Doing these now helps at
15217 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
15218 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
15223 /* Here, we are adding a problematic fold character.
15224 * "Problematic" in this context means that its fold isn't
15225 * known until runtime. (The non-problematic code points
15226 * are the above-Latin1 ones that fold to also all
15227 * above-Latin1. Their folds don't vary no matter what the
15228 * locale is.) But here we have characters whose fold
15229 * depends on the locale. We just add in the unfolded
15230 * character, and wait until runtime to fold it */
15231 goto not_fold_common;
15233 else /* regular fold; see if actually is in a fold */
15234 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
15236 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
15238 /* Here, folding, but the character isn't in a fold.
15240 * Start a new node if previous characters in the node were
15242 if (len && node_type != EXACT) {
15247 /* Here, continuing a node with non-folded characters. Add
15249 goto not_fold_common;
15251 else { /* Here, does participate in some fold */
15253 /* If this is the first character in the node, change its
15254 * type to folding. Otherwise, if this is the first
15255 * folding character in the node, close up the existing
15256 * node, so can start a new node with this one. */
15258 node_type = compute_EXACTish(pRExC_state);
15260 else if (node_type == EXACT) {
15265 if (UTF) { /* Alway use the folded value for UTF-8
15267 if (UVCHR_IS_INVARIANT(ender)) {
15268 if (UNLIKELY(len + 1 > max_string_len)) {
15273 *(s)++ = (U8) toFOLD(ender);
15279 folded = _to_uni_fold_flags(
15281 (U8 *) s, /* We have allocated extra space
15282 in 's' so can't run off the
15286 | (( ASCII_FOLD_RESTRICTED
15287 || node_type == EXACTFL)
15288 ? FOLD_FLAGS_NOMIX_ASCII
15290 if (UNLIKELY(len + added_len > max_string_len)) {
15298 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
15300 /* U+B5 folds to the MU, so its possible for a
15301 * non-UTF-8 target to match it */
15302 requires_utf8_target = TRUE;
15306 else { /* Here is non-UTF8. */
15308 /* The fold will be one or (rarely) two characters.
15309 * Check that there's room for at least a single one
15310 * before setting any flags, etc. Because otherwise an
15311 * overflowing character could cause a flag to be set
15312 * even though it doesn't end up in this node. (For
15313 * the two character fold, we check again, before
15314 * setting any flags) */
15315 if (UNLIKELY(len + 1 > max_string_len)) {
15320 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
15321 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
15322 || UNICODE_DOT_DOT_VERSION > 0)
15324 /* On non-ancient Unicodes, check for the only possible
15325 * multi-char fold */
15326 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
15328 /* This potential multi-char fold means the node
15329 * can't be simple (because it could match more
15330 * than a single char). And in some cases it will
15331 * match 'ss', so set that flag */
15335 /* It can't change to be an EXACTFU (unless already
15336 * is one). We fold it iff under /u rules. */
15337 if (node_type != EXACTFU) {
15338 maybe_exactfu = FALSE;
15341 if (UNLIKELY(len + 2 > max_string_len)) {
15350 goto done_with_this_char;
15353 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
15355 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
15357 /* Also, the sequence 'ss' is special when not
15358 * under /u. If the target string is UTF-8, it
15359 * should match SHARP S; otherwise it won't. So,
15360 * here we have to exclude the possibility of this
15361 * node moving to /u.*/
15363 maybe_exactfu = FALSE;
15366 /* Here, the fold will be a single character */
15368 if (UNLIKELY(ender == MICRO_SIGN)) {
15369 has_micro_sign = TRUE;
15371 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
15373 /* If the character's fold differs between /d and
15374 * /u, this can't change to be an EXACTFU node */
15375 maybe_exactfu = FALSE;
15378 *(s++) = (DEPENDS_SEMANTICS)
15379 ? (char) toFOLD(ender)
15381 /* Under /u, the fold of any character in
15382 * the 0-255 range happens to be its
15383 * lowercase equivalent, except for LATIN
15384 * SMALL LETTER SHARP S, which was handled
15385 * above, and the MICRO SIGN, whose fold
15386 * requires UTF-8 to represent. */
15387 : (char) toLOWER_L1(ender);
15389 } /* End of adding current character to the node */
15391 done_with_this_char:
15395 if (next_is_quantifier) {
15397 /* Here, the next input is a quantifier, and to get here,
15398 * the current character is the only one in the node. */
15402 } /* End of loop through literal characters */
15404 /* Here we have either exhausted the input or run out of room in
15405 * the node. If the former, we are done. (If we encountered a
15406 * character that can't be in the node, transfer is made directly
15407 * to <loopdone>, and so we wouldn't have fallen off the end of the
15409 if (LIKELY(! overflowed)) {
15413 /* Here we have run out of room. We can grow plain EXACT and
15414 * LEXACT nodes. If the pattern is gigantic enough, though,
15415 * eventually we'll have to artificially chunk the pattern into
15416 * multiple nodes. */
15417 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
15418 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
15419 Size_t overhead_expansion = 0;
15421 Size_t max_nodes_for_string;
15425 /* Here we couldn't fit the final character in the current
15426 * node, so it will have to be reparsed, no matter what else we
15430 /* If would have overflowed a regular EXACT node, switch
15431 * instead to an LEXACT. The code below is structured so that
15432 * the actual growing code is common to changing from an EXACT
15433 * or just increasing the LEXACT size. This means that we have
15434 * to save the string in the EXACT case before growing, and
15435 * then copy it afterwards to its new location */
15436 if (node_type == EXACT) {
15437 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
15438 RExC_emit += overhead_expansion;
15439 Copy(s0, temp, len, char);
15442 /* Ready to grow. If it was a plain EXACT, the string was
15443 * saved, and the first few bytes of it overwritten by adding
15444 * an argument field. We assume, as we do elsewhere in this
15445 * file, that one byte of remaining input will translate into
15446 * one byte of output, and if that's too small, we grow again,
15447 * if too large the excess memory is freed at the end */
15449 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
15450 achievable = MIN(max_nodes_for_string,
15451 current_string_nodes + STR_SZ(RExC_end - p));
15452 delta = achievable - current_string_nodes;
15454 /* If there is just no more room, go finish up this chunk of
15460 change_engine_size(pRExC_state, delta + overhead_expansion);
15461 current_string_nodes += delta;
15463 = sizeof(struct regnode) * current_string_nodes;
15464 upper_fill = max_string_len + 1;
15466 /* If the length was small, we know this was originally an
15467 * EXACT node now converted to LEXACT, and the string has to be
15468 * restored. Otherwise the string was untouched. 260 is just
15469 * a number safely above 255 so don't have to worry about
15470 * getting it precise */
15472 node_type = LEXACT;
15473 FILL_NODE(ret, node_type);
15474 s0 = STRING(REGNODE_p(ret));
15475 Copy(temp, s0, len, char);
15479 goto continue_parse;
15482 bool splittable = FALSE;
15483 bool backed_up = FALSE;
15484 char * e; /* should this be U8? */
15485 char * s_start; /* should this be U8? */
15487 /* Here is /i. Running out of room creates a problem if we are
15488 * folding, and the split happens in the middle of a
15489 * multi-character fold, as a match that should have occurred,
15490 * won't, due to the way nodes are matched, and our artificial
15491 * boundary. So back off until we aren't splitting such a
15492 * fold. If there is no such place to back off to, we end up
15493 * taking the entire node as-is. This can happen if the node
15494 * consists entirely of 'f' or entirely of 's' characters (or
15495 * things that fold to them) as 'ff' and 'ss' are
15496 * multi-character folds.
15498 * The Unicode standard says that multi character folds consist
15499 * of either two or three characters. That means we would be
15500 * splitting one if the final character in the node is at the
15501 * beginning of either type, or is the second of a three
15505 * ender is the code point of the character that won't fit
15507 * s points to just beyond the final byte in the node.
15508 * It's where we would place ender if there were
15509 * room, and where in fact we do place ender's fold
15510 * in the code below, as we've over-allocated space
15511 * for s0 (hence s) to allow for this
15512 * e starts at 's' and advances as we append things.
15513 * old_s is the same as 's'. (If ender had fit, 's' would
15514 * have been advanced to beyond it).
15515 * old_old_s points to the beginning byte of the final
15516 * character in the node
15517 * p points to the beginning byte in the input of the
15518 * character beyond 'ender'.
15519 * oldp points to the beginning byte in the input of
15522 * In the case of /il, we haven't folded anything that could be
15523 * affected by the locale. That means only above-Latin1
15524 * characters that fold to other above-latin1 characters get
15525 * folded at compile time. To check where a good place to
15526 * split nodes is, everything in it will have to be folded.
15527 * The boolean 'maybe_exactfu' keeps track in /il if there are
15528 * any unfolded characters in the node. */
15529 bool need_to_fold_loc = LOC && ! maybe_exactfu;
15531 /* If we do need to fold the node, we need a place to store the
15532 * folded copy, and a way to map back to the unfolded original
15534 char * locfold_buf = NULL;
15535 Size_t * loc_correspondence = NULL;
15537 if (! need_to_fold_loc) { /* The normal case. Just
15538 initialize to the actual node */
15541 s = old_old_s; /* Point to the beginning of the final char
15542 that fits in the node */
15546 /* Here, we have filled a /il node, and there are unfolded
15547 * characters in it. If the runtime locale turns out to be
15548 * UTF-8, there are possible multi-character folds, just
15549 * like when not under /l. The node hence can't terminate
15550 * in the middle of such a fold. To determine this, we
15551 * have to create a folded copy of this node. That means
15552 * reparsing the node, folding everything assuming a UTF-8
15553 * locale. (If at runtime it isn't such a locale, the
15554 * actions here wouldn't have been necessary, but we have
15555 * to assume the worst case.) If we find we need to back
15556 * off the folded string, we do so, and then map that
15557 * position back to the original unfolded node, which then
15558 * gets output, truncated at that spot */
15560 char * redo_p = RExC_parse;
15564 /* Allow enough space assuming a single byte input folds to
15565 * a single byte output, plus assume that the two unparsed
15566 * characters (that we may need) fold to the largest number
15567 * of bytes possible, plus extra for one more worst case
15568 * scenario. In the loop below, if we start eating into
15569 * that final spare space, we enlarge this initial space */
15570 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
15572 Newxz(locfold_buf, size, char);
15573 Newxz(loc_correspondence, size, Size_t);
15575 /* Redo this node's parse, folding into 'locfold_buf' */
15576 redo_p = RExC_parse;
15577 old_redo_e = redo_e = locfold_buf;
15578 while (redo_p <= oldp) {
15580 old_redo_e = redo_e;
15581 loc_correspondence[redo_e - locfold_buf]
15582 = redo_p - RExC_parse;
15587 (void) _to_utf8_fold_flags((U8 *) redo_p,
15592 redo_e += added_len;
15593 redo_p += UTF8SKIP(redo_p);
15597 /* Note that if this code is run on some ancient
15598 * Unicode versions, SHARP S doesn't fold to 'ss',
15599 * but rather than clutter the code with #ifdef's,
15600 * as is done above, we ignore that possibility.
15601 * This is ok because this code doesn't affect what
15602 * gets matched, but merely where the node gets
15604 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
15605 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
15615 /* If we're getting so close to the end that a
15616 * worst-case fold in the next character would cause us
15617 * to overflow, increase, assuming one byte output byte
15618 * per one byte input one, plus room for another worst
15620 if ( redo_p <= oldp
15621 && redo_e > locfold_buf + size
15622 - (UTF8_MAXBYTES_CASE + 1))
15624 Size_t new_size = size
15626 + UTF8_MAXBYTES_CASE + 1;
15627 Ptrdiff_t e_offset = redo_e - locfold_buf;
15629 Renew(locfold_buf, new_size, char);
15630 Renew(loc_correspondence, new_size, Size_t);
15633 redo_e = locfold_buf + e_offset;
15637 /* Set so that things are in terms of the folded, temporary
15640 s_start = locfold_buf;
15645 /* Here, we have 's', 's_start' and 'e' set up to point to the
15646 * input that goes into the node, folded.
15648 * If the final character of the node and the fold of ender
15649 * form the first two characters of a three character fold, we
15650 * need to peek ahead at the next (unparsed) character in the
15651 * input to determine if the three actually do form such a
15652 * fold. Just looking at that character is not generally
15653 * sufficient, as it could be, for example, an escape sequence
15654 * that evaluates to something else, and it needs to be folded.
15656 * khw originally thought to just go through the parse loop one
15657 * extra time, but that doesn't work easily as that iteration
15658 * could cause things to think that the parse is over and to
15659 * goto loopdone. The character could be a '$' for example, or
15660 * the character beyond could be a quantifier, and other
15661 * glitches as well.
15663 * The solution used here for peeking ahead is to look at that
15664 * next character. If it isn't ASCII punctuation, then it will
15665 * be something that would continue on in an EXACTish node if
15666 * there were space. We append the fold of it to s, having
15667 * reserved enough room in s0 for the purpose. If we can't
15668 * reasonably peek ahead, we instead assume the worst case:
15669 * that it is something that would form the completion of a
15672 * If we can't split between s and ender, we work backwards
15673 * character-by-character down to s0. At each current point
15674 * see if we are at the beginning of a multi-char fold. If so,
15675 * that means we would be splitting the fold across nodes, and
15676 * so we back up one and try again.
15678 * If we're not at the beginning, we still could be at the
15679 * final two characters of a (rare) three character fold. We
15680 * check if the sequence starting at the character before the
15681 * current position (and including the current and next
15682 * characters) is a three character fold. If not, the node can
15683 * be split here. If it is, we have to backup two characters
15686 * Otherwise, the node can be split at the current position.
15688 * The same logic is used for UTF-8 patterns and not */
15692 /* Append the fold of ender */
15693 (void) _to_uni_fold_flags(
15697 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15698 ? FOLD_FLAGS_NOMIX_ASCII
15702 /* 's' and the character folded to by ender may be the
15703 * first two of a three-character fold, in which case the
15704 * node should not be split here. That may mean examining
15705 * the so-far unparsed character starting at 'p'. But if
15706 * ender folded to more than one character, we already have
15707 * three characters to look at. Also, we first check if
15708 * the sequence consisting of s and the next character form
15709 * the first two of some three character fold. If not,
15710 * there's no need to peek ahead. */
15711 if ( added_len <= UTF8SKIP(e - added_len)
15712 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15714 /* Here, the two do form the beginning of a potential
15715 * three character fold. The unexamined character may
15716 * or may not complete it. Peek at it. It might be
15717 * something that ends the node or an escape sequence,
15718 * in which case we don't know without a lot of work
15719 * what it evaluates to, so we have to assume the worst
15720 * case: that it does complete the fold, and so we
15721 * can't split here. All such instances will have
15722 * that character be an ASCII punctuation character,
15723 * like a backslash. So, for that case, backup one and
15724 * drop down to try at that position */
15726 s = (char *) utf8_hop_back((U8 *) s, -1,
15731 /* Here, since it's not punctuation, it must be a
15732 * real character, and we can append its fold to
15733 * 'e' (having deliberately reserved enough space
15734 * for this eventuality) and drop down to check if
15735 * the three actually do form a folded sequence */
15736 (void) _to_utf8_fold_flags(
15737 (U8 *) p, (U8 *) RExC_end,
15740 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15741 ? FOLD_FLAGS_NOMIX_ASCII
15747 /* Here, we either have three characters available in
15748 * sequence starting at 's', or we have two characters and
15749 * know that the following one can't possibly be part of a
15750 * three character fold. We go through the node backwards
15751 * until we find a place where we can split it without
15752 * breaking apart a multi-character fold. At any given
15753 * point we have to worry about if such a fold begins at
15754 * the current 's', and also if a three-character fold
15755 * begins at s-1, (containing s and s+1). Splitting in
15756 * either case would break apart a fold */
15758 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15761 /* If is a multi-char fold, can't split here. Backup
15762 * one char and try again */
15763 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15769 /* If the two characters beginning at 's' are part of a
15770 * three character fold starting at the character
15771 * before s, we can't split either before or after s.
15772 * Backup two chars and try again */
15773 if ( LIKELY(s > s_start)
15774 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15777 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15782 /* Here there's no multi-char fold between s and the
15783 * next character following it. We can split */
15787 } while (s > s_start); /* End of loops backing up through the node */
15789 /* Here we either couldn't find a place to split the node,
15790 * or else we broke out of the loop setting 'splittable' to
15791 * true. In the latter case, the place to split is between
15792 * the first and second characters in the sequence starting
15798 else { /* Pattern not UTF-8 */
15799 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15800 || ASCII_FOLD_RESTRICTED)
15802 assert( toLOWER_L1(ender) < 256 );
15803 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15811 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15818 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15819 || ASCII_FOLD_RESTRICTED)
15821 assert( toLOWER_L1(ender) < 256 );
15822 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15832 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15838 if ( LIKELY(s > s_start)
15839 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15849 } while (s > s_start);
15856 /* Here, we are done backing up. If we didn't backup at all
15857 * (the likely case), just proceed */
15860 /* If we did find a place to split, reparse the entire node
15861 * stopping where we have calculated. */
15864 /* If we created a temporary folded string under /l, we
15865 * have to map that back to the original */
15866 if (need_to_fold_loc) {
15867 upper_fill = loc_correspondence[s - s_start];
15868 if (upper_fill == 0) {
15869 FAIL2("panic: loc_correspondence[%d] is 0",
15870 (int) (s - s_start));
15872 Safefree(locfold_buf);
15873 Safefree(loc_correspondence);
15876 upper_fill = s - s0;
15881 /* Here the node consists entirely of non-final multi-char
15882 * folds. (Likely it is all 'f's or all 's's.) There's no
15883 * decent place to split it, so give up and just take the
15888 if (need_to_fold_loc) {
15889 Safefree(locfold_buf);
15890 Safefree(loc_correspondence);
15892 } /* End of verifying node ends with an appropriate char */
15894 /* We need to start the next node at the character that didn't fit
15898 loopdone: /* Jumped to when encounters something that shouldn't be
15901 /* Free up any over-allocated space; cast is to silence bogus
15902 * warning in MS VC */
15903 change_engine_size(pRExC_state,
15904 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15906 /* I (khw) don't know if you can get here with zero length, but the
15907 * old code handled this situation by creating a zero-length EXACT
15908 * node. Might as well be NOTHING instead */
15910 OP(REGNODE_p(ret)) = NOTHING;
15914 /* If the node type is EXACT here, check to see if it
15915 * should be EXACTL, or EXACT_REQ8. */
15916 if (node_type == EXACT) {
15918 node_type = EXACTL;
15920 else if (requires_utf8_target) {
15921 node_type = EXACT_REQ8;
15924 else if (node_type == LEXACT) {
15925 if (requires_utf8_target) {
15926 node_type = LEXACT_REQ8;
15930 if ( UNLIKELY(has_micro_sign || has_ss)
15931 && (node_type == EXACTFU || ( node_type == EXACTF
15932 && maybe_exactfu)))
15933 { /* These two conditions are problematic in non-UTF-8
15936 node_type = EXACTFUP;
15938 else if (node_type == EXACTFL) {
15940 /* 'maybe_exactfu' is deliberately set above to
15941 * indicate this node type, where all code points in it
15943 if (maybe_exactfu) {
15944 node_type = EXACTFLU8;
15947 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15949 /* A character that folds to more than one will
15950 * match multiple characters, so can't be SIMPLE.
15951 * We don't have to worry about this with EXACTFLU8
15952 * nodes just above, as they have already been
15953 * folded (since the fold doesn't vary at run
15954 * time). Here, if the final character in the node
15955 * folds to multiple, it can't be simple. (This
15956 * only has an effect if the node has only a single
15957 * character, hence the final one, as elsewhere we
15958 * turn off simple for nodes whose length > 1 */
15962 else if (node_type == EXACTF) { /* Means is /di */
15964 /* This intermediate variable is needed solely because
15965 * the asserts in the macro where used exceed Win32's
15966 * literal string capacity */
15967 char first_char = * STRING(REGNODE_p(ret));
15969 /* If 'maybe_exactfu' is clear, then we need to stay
15970 * /di. If it is set, it means there are no code
15971 * points that match differently depending on UTF8ness
15972 * of the target string, so it can become an EXACTFU
15974 if (! maybe_exactfu) {
15975 RExC_seen_d_op = TRUE;
15977 else if ( isALPHA_FOLD_EQ(first_char, 's')
15978 || isALPHA_FOLD_EQ(ender, 's'))
15980 /* But, if the node begins or ends in an 's' we
15981 * have to defer changing it into an EXACTFU, as
15982 * the node could later get joined with another one
15983 * that ends or begins with 's' creating an 'ss'
15984 * sequence which would then wrongly match the
15985 * sharp s without the target being UTF-8. We
15986 * create a special node that we resolve later when
15987 * we join nodes together */
15989 node_type = EXACTFU_S_EDGE;
15992 node_type = EXACTFU;
15996 if (requires_utf8_target && node_type == EXACTFU) {
15997 node_type = EXACTFU_REQ8;
16001 OP(REGNODE_p(ret)) = node_type;
16002 setSTR_LEN(REGNODE_p(ret), len);
16003 RExC_emit += STR_SZ(len);
16005 /* If the node isn't a single character, it can't be SIMPLE */
16006 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
16010 *flagp |= HASWIDTH | maybe_SIMPLE;
16016 /* len is STRLEN which is unsigned, need to copy to signed */
16019 vFAIL("Internal disaster");
16022 } /* End of label 'defchar:' */
16024 } /* End of giant switch on input character */
16026 /* Position parse to next real character */
16027 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16028 FALSE /* Don't force to /x */ );
16029 if ( *RExC_parse == '{'
16030 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
16033 RExC_parse_inc_by(1);
16034 vFAIL("Unescaped left brace in regex is illegal here");
16036 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
16037 " passed through");
16045 S_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
16047 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
16048 * sets up the bitmap and any flags, removing those code points from the
16049 * inversion list, setting it to NULL should it become completely empty */
16052 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
16054 /* There is no bitmap for this node type */
16055 if (REGNODE_TYPE(OP(node)) != ANYOF) {
16059 ANYOF_BITMAP_ZERO(node);
16060 if (*invlist_ptr) {
16062 /* This gets set if we actually need to modify things */
16063 bool change_invlist = FALSE;
16067 /* Start looking through *invlist_ptr */
16068 invlist_iterinit(*invlist_ptr);
16069 while (invlist_iternext(*invlist_ptr, &start, &end)) {
16073 /* Quit if are above what we should change */
16074 if (start >= NUM_ANYOF_CODE_POINTS) {
16078 change_invlist = TRUE;
16080 /* Set all the bits in the range, up to the max that we are doing */
16081 high = (end < NUM_ANYOF_CODE_POINTS - 1)
16083 : NUM_ANYOF_CODE_POINTS - 1;
16084 for (i = start; i <= (int) high; i++) {
16085 ANYOF_BITMAP_SET(node, i);
16088 invlist_iterfinish(*invlist_ptr);
16090 /* Done with loop; remove any code points that are in the bitmap from
16092 if (change_invlist) {
16093 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
16096 /* If have completely emptied it, remove it completely */
16097 if (_invlist_len(*invlist_ptr) == 0) {
16098 SvREFCNT_dec_NN(*invlist_ptr);
16099 *invlist_ptr = NULL;
16104 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
16105 Character classes ([:foo:]) can also be negated ([:^foo:]).
16106 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
16107 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
16108 but trigger failures because they are currently unimplemented. */
16110 #define POSIXCC_DONE(c) ((c) == ':')
16111 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
16112 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
16113 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
16115 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
16116 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
16117 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
16119 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
16121 /* 'posix_warnings' and 'warn_text' are names of variables in the following
16123 #define ADD_POSIX_WARNING(p, text) STMT_START { \
16124 if (posix_warnings) { \
16125 if (! RExC_warn_text ) RExC_warn_text = \
16126 (AV *) sv_2mortal((SV *) newAV()); \
16127 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
16131 REPORT_LOCATION_ARGS(p))); \
16134 #define CLEAR_POSIX_WARNINGS() \
16136 if (posix_warnings && RExC_warn_text) \
16137 av_clear(RExC_warn_text); \
16140 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
16142 CLEAR_POSIX_WARNINGS(); \
16147 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
16149 const char * const s, /* Where the putative posix class begins.
16150 Normally, this is one past the '['. This
16151 parameter exists so it can be somewhere
16152 besides RExC_parse. */
16153 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
16155 AV ** posix_warnings, /* Where to place any generated warnings, or
16157 const bool check_only /* Don't die if error */
16160 /* This parses what the caller thinks may be one of the three POSIX
16162 * 1) a character class, like [:blank:]
16163 * 2) a collating symbol, like [. .]
16164 * 3) an equivalence class, like [= =]
16165 * In the latter two cases, it croaks if it finds a syntactically legal
16166 * one, as these are not handled by Perl.
16168 * The main purpose is to look for a POSIX character class. It returns:
16169 * a) the class number
16170 * if it is a completely syntactically and semantically legal class.
16171 * 'updated_parse_ptr', if not NULL, is set to point to just after the
16172 * closing ']' of the class
16173 * b) OOB_NAMEDCLASS
16174 * if it appears that one of the three POSIX constructs was meant, but
16175 * its specification was somehow defective. 'updated_parse_ptr', if
16176 * not NULL, is set to point to the character just after the end
16177 * character of the class. See below for handling of warnings.
16178 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
16179 * if it doesn't appear that a POSIX construct was intended.
16180 * 'updated_parse_ptr' is not changed. No warnings nor errors are
16183 * In b) there may be errors or warnings generated. If 'check_only' is
16184 * TRUE, then any errors are discarded. Warnings are returned to the
16185 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
16186 * instead it is NULL, warnings are suppressed.
16188 * The reason for this function, and its complexity is that a bracketed
16189 * character class can contain just about anything. But it's easy to
16190 * mistype the very specific posix class syntax but yielding a valid
16191 * regular bracketed class, so it silently gets compiled into something
16192 * quite unintended.
16194 * The solution adopted here maintains backward compatibility except that
16195 * it adds a warning if it looks like a posix class was intended but
16196 * improperly specified. The warning is not raised unless what is input
16197 * very closely resembles one of the 14 legal posix classes. To do this,
16198 * it uses fuzzy parsing. It calculates how many single-character edits it
16199 * would take to transform what was input into a legal posix class. Only
16200 * if that number is quite small does it think that the intention was a
16201 * posix class. Obviously these are heuristics, and there will be cases
16202 * where it errs on one side or another, and they can be tweaked as
16203 * experience informs.
16205 * The syntax for a legal posix class is:
16207 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
16209 * What this routine considers syntactically to be an intended posix class
16210 * is this (the comments indicate some restrictions that the pattern
16213 * qr/(?x: \[? # The left bracket, possibly
16215 * \h* # possibly followed by blanks
16216 * (?: \^ \h* )? # possibly a misplaced caret
16217 * [:;]? # The opening class character,
16218 * # possibly omitted. A typo
16219 * # semi-colon can also be used.
16221 * \^? # possibly a correctly placed
16222 * # caret, but not if there was also
16223 * # a misplaced one
16225 * .{3,15} # The class name. If there are
16226 * # deviations from the legal syntax,
16227 * # its edit distance must be close
16228 * # to a real class name in order
16229 * # for it to be considered to be
16230 * # an intended posix class.
16232 * [[:punct:]]? # The closing class character,
16233 * # possibly omitted. If not a colon
16234 * # nor semi colon, the class name
16235 * # must be even closer to a valid
16238 * \]? # The right bracket, possibly
16242 * In the above, \h must be ASCII-only.
16244 * These are heuristics, and can be tweaked as field experience dictates.
16245 * There will be cases when someone didn't intend to specify a posix class
16246 * that this warns as being so. The goal is to minimize these, while
16247 * maximizing the catching of things intended to be a posix class that
16248 * aren't parsed as such.
16252 const char * const e = RExC_end;
16253 unsigned complement = 0; /* If to complement the class */
16254 bool found_problem = FALSE; /* Assume OK until proven otherwise */
16255 bool has_opening_bracket = FALSE;
16256 bool has_opening_colon = FALSE;
16257 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
16259 const char * possible_end = NULL; /* used for a 2nd parse pass */
16260 const char* name_start; /* ptr to class name first char */
16262 /* If the number of single-character typos the input name is away from a
16263 * legal name is no more than this number, it is considered to have meant
16264 * the legal name */
16265 int max_distance = 2;
16267 /* to store the name. The size determines the maximum length before we
16268 * decide that no posix class was intended. Should be at least
16269 * sizeof("alphanumeric") */
16271 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
16273 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
16275 CLEAR_POSIX_WARNINGS();
16278 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
16281 if (*(p - 1) != '[') {
16282 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
16283 found_problem = TRUE;
16286 has_opening_bracket = TRUE;
16289 /* They could be confused and think you can put spaces between the
16292 found_problem = TRUE;
16296 } while (p < e && isBLANK(*p));
16298 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16301 /* For [. .] and [= =]. These are quite different internally from [: :],
16302 * so they are handled separately. */
16303 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
16304 and 1 for at least one char in it
16307 const char open_char = *p;
16308 const char * temp_ptr = p + 1;
16310 /* These two constructs are not handled by perl, and if we find a
16311 * syntactically valid one, we croak. khw, who wrote this code, finds
16312 * this explanation of them very unclear:
16313 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
16314 * And searching the rest of the internet wasn't very helpful either.
16315 * It looks like just about any byte can be in these constructs,
16316 * depending on the locale. But unless the pattern is being compiled
16317 * under /l, which is very rare, Perl runs under the C or POSIX locale.
16318 * In that case, it looks like [= =] isn't allowed at all, and that
16319 * [. .] could be any single code point, but for longer strings the
16320 * constituent characters would have to be the ASCII alphabetics plus
16321 * the minus-hyphen. Any sensible locale definition would limit itself
16322 * to these. And any portable one definitely should. Trying to parse
16323 * the general case is a nightmare (see [perl #127604]). So, this code
16324 * looks only for interiors of these constructs that match:
16326 * Using \w relaxes the apparent rules a little, without adding much
16327 * danger of mistaking something else for one of these constructs.
16329 * [. .] in some implementations described on the internet is usable to
16330 * escape a character that otherwise is special in bracketed character
16331 * classes. For example [.].] means a literal right bracket instead of
16332 * the ending of the class
16334 * [= =] can legitimately contain a [. .] construct, but we don't
16335 * handle this case, as that [. .] construct will later get parsed
16336 * itself and croak then. And [= =] is checked for even when not under
16337 * /l, as Perl has long done so.
16339 * The code below relies on there being a trailing NUL, so it doesn't
16340 * have to keep checking if the parse ptr < e.
16342 if (temp_ptr[1] == open_char) {
16345 else while ( temp_ptr < e
16346 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
16351 if (*temp_ptr == open_char) {
16353 if (*temp_ptr == ']') {
16355 if (! found_problem && ! check_only) {
16356 RExC_parse_set((char *) temp_ptr);
16357 vFAIL3("POSIX syntax [%c %c] is reserved for future "
16358 "extensions", open_char, open_char);
16361 /* Here, the syntax wasn't completely valid, or else the call
16362 * is to check-only */
16363 if (updated_parse_ptr) {
16364 *updated_parse_ptr = (char *) temp_ptr;
16367 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
16371 /* If we find something that started out to look like one of these
16372 * constructs, but isn't, we continue below so that it can be checked
16373 * for being a class name with a typo of '.' or '=' instead of a colon.
16377 /* Here, we think there is a possibility that a [: :] class was meant, and
16378 * we have the first real character. It could be they think the '^' comes
16381 found_problem = TRUE;
16382 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
16387 found_problem = TRUE;
16391 } while (p < e && isBLANK(*p));
16393 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16397 /* But the first character should be a colon, which they could have easily
16398 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
16399 * distinguish from a colon, so treat that as a colon). */
16402 has_opening_colon = TRUE;
16404 else if (*p == ';') {
16405 found_problem = TRUE;
16407 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16408 has_opening_colon = TRUE;
16411 found_problem = TRUE;
16412 ADD_POSIX_WARNING(p, "there must be a starting ':'");
16414 /* Consider an initial punctuation (not one of the recognized ones) to
16415 * be a left terminator */
16416 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
16421 /* They may think that you can put spaces between the components */
16423 found_problem = TRUE;
16427 } while (p < e && isBLANK(*p));
16429 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16434 /* We consider something like [^:^alnum:]] to not have been intended to
16435 * be a posix class, but XXX maybe we should */
16437 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16444 /* Again, they may think that you can put spaces between the components */
16446 found_problem = TRUE;
16450 } while (p < e && isBLANK(*p));
16452 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16457 /* XXX This ']' may be a typo, and something else was meant. But
16458 * treating it as such creates enough complications, that that
16459 * possibility isn't currently considered here. So we assume that the
16460 * ']' is what is intended, and if we've already found an initial '[',
16461 * this leaves this construct looking like [:] or [:^], which almost
16462 * certainly weren't intended to be posix classes */
16463 if (has_opening_bracket) {
16464 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16467 /* But this function can be called when we parse the colon for
16468 * something like qr/[alpha:]]/, so we back up to look for the
16473 found_problem = TRUE;
16474 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16476 else if (*p != ':') {
16478 /* XXX We are currently very restrictive here, so this code doesn't
16479 * consider the possibility that, say, /[alpha.]]/ was intended to
16480 * be a posix class. */
16481 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16484 /* Here we have something like 'foo:]'. There was no initial colon,
16485 * and we back up over 'foo. XXX Unlike the going forward case, we
16486 * don't handle typos of non-word chars in the middle */
16487 has_opening_colon = FALSE;
16490 while (p > RExC_start && isWORDCHAR(*p)) {
16495 /* Here, we have positioned ourselves to where we think the first
16496 * character in the potential class is */
16499 /* Now the interior really starts. There are certain key characters that
16500 * can end the interior, or these could just be typos. To catch both
16501 * cases, we may have to do two passes. In the first pass, we keep on
16502 * going unless we come to a sequence that matches
16503 * qr/ [[:punct:]] [[:blank:]]* \] /xa
16504 * This means it takes a sequence to end the pass, so two typos in a row if
16505 * that wasn't what was intended. If the class is perfectly formed, just
16506 * this one pass is needed. We also stop if there are too many characters
16507 * being accumulated, but this number is deliberately set higher than any
16508 * real class. It is set high enough so that someone who thinks that
16509 * 'alphanumeric' is a correct name would get warned that it wasn't.
16510 * While doing the pass, we keep track of where the key characters were in
16511 * it. If we don't find an end to the class, and one of the key characters
16512 * was found, we redo the pass, but stop when we get to that character.
16513 * Thus the key character was considered a typo in the first pass, but a
16514 * terminator in the second. If two key characters are found, we stop at
16515 * the second one in the first pass. Again this can miss two typos, but
16516 * catches a single one
16518 * In the first pass, 'possible_end' starts as NULL, and then gets set to
16519 * point to the first key character. For the second pass, it starts as -1.
16525 bool has_blank = FALSE;
16526 bool has_upper = FALSE;
16527 bool has_terminating_colon = FALSE;
16528 bool has_terminating_bracket = FALSE;
16529 bool has_semi_colon = FALSE;
16530 unsigned int name_len = 0;
16531 int punct_count = 0;
16535 /* Squeeze out blanks when looking up the class name below */
16536 if (isBLANK(*p) ) {
16538 found_problem = TRUE;
16543 /* The name will end with a punctuation */
16545 const char * peek = p + 1;
16547 /* Treat any non-']' punctuation followed by a ']' (possibly
16548 * with intervening blanks) as trying to terminate the class.
16549 * ']]' is very likely to mean a class was intended (but
16550 * missing the colon), but the warning message that gets
16551 * generated shows the error position better if we exit the
16552 * loop at the bottom (eventually), so skip it here. */
16554 if (peek < e && isBLANK(*peek)) {
16556 found_problem = TRUE;
16559 } while (peek < e && isBLANK(*peek));
16562 if (peek < e && *peek == ']') {
16563 has_terminating_bracket = TRUE;
16565 has_terminating_colon = TRUE;
16567 else if (*p == ';') {
16568 has_semi_colon = TRUE;
16569 has_terminating_colon = TRUE;
16572 found_problem = TRUE;
16579 /* Here we have punctuation we thought didn't end the class.
16580 * Keep track of the position of the key characters that are
16581 * more likely to have been class-enders */
16582 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
16584 /* Allow just one such possible class-ender not actually
16585 * ending the class. */
16586 if (possible_end) {
16592 /* If we have too many punctuation characters, no use in
16594 if (++punct_count > max_distance) {
16598 /* Treat the punctuation as a typo. */
16599 input_text[name_len++] = *p;
16602 else if (isUPPER(*p)) { /* Use lowercase for lookup */
16603 input_text[name_len++] = toLOWER(*p);
16605 found_problem = TRUE;
16607 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
16608 input_text[name_len++] = *p;
16612 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
16616 /* The declaration of 'input_text' is how long we allow a potential
16617 * class name to be, before saying they didn't mean a class name at
16619 if (name_len >= C_ARRAY_LENGTH(input_text)) {
16624 /* We get to here when the possible class name hasn't been properly
16625 * terminated before:
16626 * 1) we ran off the end of the pattern; or
16627 * 2) found two characters, each of which might have been intended to
16628 * be the name's terminator
16629 * 3) found so many punctuation characters in the purported name,
16630 * that the edit distance to a valid one is exceeded
16631 * 4) we decided it was more characters than anyone could have
16632 * intended to be one. */
16634 found_problem = TRUE;
16636 /* In the final two cases, we know that looking up what we've
16637 * accumulated won't lead to a match, even a fuzzy one. */
16638 if ( name_len >= C_ARRAY_LENGTH(input_text)
16639 || punct_count > max_distance)
16641 /* If there was an intermediate key character that could have been
16642 * an intended end, redo the parse, but stop there */
16643 if (possible_end && possible_end != (char *) -1) {
16644 possible_end = (char *) -1; /* Special signal value to say
16645 we've done a first pass */
16650 /* Otherwise, it can't have meant to have been a class */
16651 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16654 /* If we ran off the end, and the final character was a punctuation
16655 * one, back up one, to look at that final one just below. Later, we
16656 * will restore the parse pointer if appropriate */
16657 if (name_len && p == e && isPUNCT(*(p-1))) {
16662 if (p < e && isPUNCT(*p)) {
16664 has_terminating_bracket = TRUE;
16666 /* If this is a 2nd ']', and the first one is just below this
16667 * one, consider that to be the real terminator. This gives a
16668 * uniform and better positioning for the warning message */
16670 && possible_end != (char *) -1
16671 && *possible_end == ']'
16672 && name_len && input_text[name_len - 1] == ']')
16677 /* And this is actually equivalent to having done the 2nd
16678 * pass now, so set it to not try again */
16679 possible_end = (char *) -1;
16684 has_terminating_colon = TRUE;
16686 else if (*p == ';') {
16687 has_semi_colon = TRUE;
16688 has_terminating_colon = TRUE;
16696 /* Here, we have a class name to look up. We can short circuit the
16697 * stuff below for short names that can't possibly be meant to be a
16698 * class name. (We can do this on the first pass, as any second pass
16699 * will yield an even shorter name) */
16700 if (name_len < 3) {
16701 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16704 /* Find which class it is. Initially switch on the length of the name.
16706 switch (name_len) {
16708 if (memEQs(name_start, 4, "word")) {
16709 /* this is not POSIX, this is the Perl \w */
16710 class_number = ANYOF_WORDCHAR;
16714 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16715 * graph lower print punct space upper
16716 * Offset 4 gives the best switch position. */
16717 switch (name_start[4]) {
16719 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16720 class_number = ANYOF_ALPHA;
16723 if (memBEGINs(name_start, 5, "spac")) /* space */
16724 class_number = ANYOF_SPACE;
16727 if (memBEGINs(name_start, 5, "grap")) /* graph */
16728 class_number = ANYOF_GRAPH;
16731 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16732 class_number = ANYOF_ASCII;
16735 if (memBEGINs(name_start, 5, "blan")) /* blank */
16736 class_number = ANYOF_BLANK;
16739 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16740 class_number = ANYOF_CNTRL;
16743 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16744 class_number = ANYOF_ALPHANUMERIC;
16747 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16748 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16749 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16750 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16753 if (memBEGINs(name_start, 5, "digi")) /* digit */
16754 class_number = ANYOF_DIGIT;
16755 else if (memBEGINs(name_start, 5, "prin")) /* print */
16756 class_number = ANYOF_PRINT;
16757 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16758 class_number = ANYOF_PUNCT;
16763 if (memEQs(name_start, 6, "xdigit"))
16764 class_number = ANYOF_XDIGIT;
16768 /* If the name exactly matches a posix class name the class number will
16769 * here be set to it, and the input almost certainly was meant to be a
16770 * posix class, so we can skip further checking. If instead the syntax
16771 * is exactly correct, but the name isn't one of the legal ones, we
16772 * will return that as an error below. But if neither of these apply,
16773 * it could be that no posix class was intended at all, or that one
16774 * was, but there was a typo. We tease these apart by doing fuzzy
16775 * matching on the name */
16776 if (class_number == OOB_NAMEDCLASS && found_problem) {
16777 const UV posix_names[][6] = {
16778 { 'a', 'l', 'n', 'u', 'm' },
16779 { 'a', 'l', 'p', 'h', 'a' },
16780 { 'a', 's', 'c', 'i', 'i' },
16781 { 'b', 'l', 'a', 'n', 'k' },
16782 { 'c', 'n', 't', 'r', 'l' },
16783 { 'd', 'i', 'g', 'i', 't' },
16784 { 'g', 'r', 'a', 'p', 'h' },
16785 { 'l', 'o', 'w', 'e', 'r' },
16786 { 'p', 'r', 'i', 'n', 't' },
16787 { 'p', 'u', 'n', 'c', 't' },
16788 { 's', 'p', 'a', 'c', 'e' },
16789 { 'u', 'p', 'p', 'e', 'r' },
16790 { 'w', 'o', 'r', 'd' },
16791 { 'x', 'd', 'i', 'g', 'i', 't' }
16793 /* The names of the above all have added NULs to make them the same
16794 * size, so we need to also have the real lengths */
16795 const UV posix_name_lengths[] = {
16796 sizeof("alnum") - 1,
16797 sizeof("alpha") - 1,
16798 sizeof("ascii") - 1,
16799 sizeof("blank") - 1,
16800 sizeof("cntrl") - 1,
16801 sizeof("digit") - 1,
16802 sizeof("graph") - 1,
16803 sizeof("lower") - 1,
16804 sizeof("print") - 1,
16805 sizeof("punct") - 1,
16806 sizeof("space") - 1,
16807 sizeof("upper") - 1,
16808 sizeof("word") - 1,
16809 sizeof("xdigit")- 1
16812 int temp_max = max_distance; /* Use a temporary, so if we
16813 reparse, we haven't changed the
16816 /* Use a smaller max edit distance if we are missing one of the
16818 if ( has_opening_bracket + has_opening_colon < 2
16819 || has_terminating_bracket + has_terminating_colon < 2)
16824 /* See if the input name is close to a legal one */
16825 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16827 /* Short circuit call if the lengths are too far apart to be
16829 if (abs( (int) (name_len - posix_name_lengths[i]))
16835 if (edit_distance(input_text,
16838 posix_name_lengths[i],
16842 { /* If it is close, it probably was intended to be a class */
16843 goto probably_meant_to_be;
16847 /* Here the input name is not close enough to a valid class name
16848 * for us to consider it to be intended to be a posix class. If
16849 * we haven't already done so, and the parse found a character that
16850 * could have been terminators for the name, but which we absorbed
16851 * as typos during the first pass, repeat the parse, signalling it
16852 * to stop at that character */
16853 if (possible_end && possible_end != (char *) -1) {
16854 possible_end = (char *) -1;
16859 /* Here neither pass found a close-enough class name */
16860 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16863 probably_meant_to_be:
16865 /* Here we think that a posix specification was intended. Update any
16867 if (updated_parse_ptr) {
16868 *updated_parse_ptr = (char *) p;
16871 /* If a posix class name was intended but incorrectly specified, we
16872 * output or return the warnings */
16873 if (found_problem) {
16875 /* We set flags for these issues in the parse loop above instead of
16876 * adding them to the list of warnings, because we can parse it
16877 * twice, and we only want one warning instance */
16879 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16882 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16884 if (has_semi_colon) {
16885 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16887 else if (! has_terminating_colon) {
16888 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16890 if (! has_terminating_bracket) {
16891 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16894 if ( posix_warnings
16896 && av_count(RExC_warn_text) > 0)
16898 *posix_warnings = RExC_warn_text;
16901 else if (class_number != OOB_NAMEDCLASS) {
16902 /* If it is a known class, return the class. The class number
16903 * #defines are structured so each complement is +1 to the normal
16905 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16907 else if (! check_only) {
16909 /* Here, it is an unrecognized class. This is an error (unless the
16910 * call is to check only, which we've already handled above) */
16911 const char * const complement_string = (complement)
16914 RExC_parse_set((char *) p);
16915 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16917 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16921 return OOB_NAMEDCLASS;
16923 #undef ADD_POSIX_WARNING
16925 STATIC unsigned int
16926 S_regex_set_precedence(const U8 my_operator) {
16928 /* Returns the precedence in the (?[...]) construct of the input operator,
16929 * specified by its character representation. The precedence follows
16930 * general Perl rules, but it extends this so that ')' and ']' have (low)
16931 * precedence even though they aren't really operators */
16933 switch (my_operator) {
16949 NOT_REACHED; /* NOTREACHED */
16950 return 0; /* Silence compiler warning */
16953 STATIC regnode_offset
16954 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16955 I32 *flagp, U32 depth)
16957 /* Handle the (?[...]) construct to do set operations */
16959 U8 curchar; /* Current character being parsed */
16960 UV start, end; /* End points of code point ranges */
16961 SV* final = NULL; /* The end result inversion list */
16962 SV* result_string; /* 'final' stringified */
16963 AV* stack; /* stack of operators and operands not yet
16965 AV* fence_stack = NULL; /* A stack containing the positions in
16966 'stack' of where the undealt-with left
16967 parens would be if they were actually
16969 /* The 'volatile' is a workaround for an optimiser bug
16970 * in Solaris Studio 12.3. See RT #127455 */
16971 volatile IV fence = 0; /* Position of where most recent undealt-
16972 with left paren in stack is; -1 if none.
16974 STRLEN len; /* Temporary */
16975 regnode_offset node; /* Temporary, and final regnode returned by
16977 const bool save_fold = FOLD; /* Temporary */
16978 char *save_end, *save_parse; /* Temporaries */
16979 const bool in_locale = LOC; /* we turn off /l during processing */
16981 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16983 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16985 DEBUG_PARSE("xcls");
16988 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16991 /* The use of this operator implies /u. This is required so that the
16992 * compile time values are valid in all runtime cases */
16993 REQUIRE_UNI_RULES(flagp, 0);
16995 /* Everything in this construct is a metacharacter. Operands begin with
16996 * either a '\' (for an escape sequence), or a '[' for a bracketed
16997 * character class. Any other character should be an operator, or
16998 * parenthesis for grouping. Both types of operands are handled by calling
16999 * regclass() to parse them. It is called with a parameter to indicate to
17000 * return the computed inversion list. The parsing here is implemented via
17001 * a stack. Each entry on the stack is a single character representing one
17002 * of the operators; or else a pointer to an operand inversion list. */
17004 #define IS_OPERATOR(a) SvIOK(a)
17005 #define IS_OPERAND(a) (! IS_OPERATOR(a))
17007 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
17008 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
17009 * with pronouncing it called it Reverse Polish instead, but now that YOU
17010 * know how to pronounce it you can use the correct term, thus giving due
17011 * credit to the person who invented it, and impressing your geek friends.
17012 * Wikipedia says that the pronounciation of "Ł" has been changing so that
17013 * it is now more like an English initial W (as in wonk) than an L.)
17015 * This means that, for example, 'a | b & c' is stored on the stack as
17023 * where the numbers in brackets give the stack [array] element number.
17024 * In this implementation, parentheses are not stored on the stack.
17025 * Instead a '(' creates a "fence" so that the part of the stack below the
17026 * fence is invisible except to the corresponding ')' (this allows us to
17027 * replace testing for parens, by using instead subtraction of the fence
17028 * position). As new operands are processed they are pushed onto the stack
17029 * (except as noted in the next paragraph). New operators of higher
17030 * precedence than the current final one are inserted on the stack before
17031 * the lhs operand (so that when the rhs is pushed next, everything will be
17032 * in the correct positions shown above. When an operator of equal or
17033 * lower precedence is encountered in parsing, all the stacked operations
17034 * of equal or higher precedence are evaluated, leaving the result as the
17035 * top entry on the stack. This makes higher precedence operations
17036 * evaluate before lower precedence ones, and causes operations of equal
17037 * precedence to left associate.
17039 * The only unary operator '!' is immediately pushed onto the stack when
17040 * encountered. When an operand is encountered, if the top of the stack is
17041 * a '!", the complement is immediately performed, and the '!' popped. The
17042 * resulting value is treated as a new operand, and the logic in the
17043 * previous paragraph is executed. Thus in the expression
17045 * the stack looks like
17051 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
17058 * A ')' is treated as an operator with lower precedence than all the
17059 * aforementioned ones, which causes all operations on the stack above the
17060 * corresponding '(' to be evaluated down to a single resultant operand.
17061 * Then the fence for the '(' is removed, and the operand goes through the
17062 * algorithm above, without the fence.
17064 * A separate stack is kept of the fence positions, so that the position of
17065 * the latest so-far unbalanced '(' is at the top of it.
17067 * The ']' ending the construct is treated as the lowest operator of all,
17068 * so that everything gets evaluated down to a single operand, which is the
17071 stack = (AV*)newSV_type_mortal(SVt_PVAV);
17072 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
17074 while (RExC_parse < RExC_end) {
17075 I32 top_index; /* Index of top-most element in 'stack' */
17076 SV** top_ptr; /* Pointer to top 'stack' element */
17077 SV* current = NULL; /* To contain the current inversion list
17079 SV* only_to_avoid_leaks;
17081 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
17082 TRUE /* Force /x */ );
17083 if (RExC_parse >= RExC_end) { /* Fail */
17087 curchar = UCHARAT(RExC_parse);
17091 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17092 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
17093 DEBUG_U(dump_regex_sets_structures(pRExC_state,
17094 stack, fence, fence_stack));
17097 top_index = av_tindex_skip_len_mg(stack);
17100 SV** stacked_ptr; /* Ptr to something already on 'stack' */
17101 char stacked_operator; /* The topmost operator on the 'stack'. */
17102 SV* lhs; /* Operand to the left of the operator */
17103 SV* rhs; /* Operand to the right of the operator */
17104 SV* fence_ptr; /* Pointer to top element of the fence
17108 if ( RExC_parse < RExC_end - 2
17109 && UCHARAT(RExC_parse + 1) == '?'
17110 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
17112 const regnode_offset orig_emit = RExC_emit;
17113 SV * resultant_invlist;
17115 /* Here it could be an embedded '(?flags:(?[...])'.
17116 * This happens when we have some thing like
17118 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
17120 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
17122 * Here we would be handling the interpolated
17123 * '$thai_or_lao'. We handle this by a recursive call to
17124 * reg which returns the inversion list the
17125 * interpolated expression evaluates to. Actually, the
17126 * return is a special regnode containing a pointer to that
17127 * inversion list. If the return isn't that regnode alone,
17128 * we know that this wasn't such an interpolation, which is
17129 * an error: we need to get a single inversion list back
17130 * from the recursion */
17132 RExC_parse_inc_by(1);
17135 node = reg(pRExC_state, 2, flagp, depth+1);
17136 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17138 if ( OP(REGNODE_p(node)) != REGEX_SET
17139 /* If more than a single node returned, the nested
17140 * parens evaluated to more than just a (?[...]),
17141 * which isn't legal */
17142 || RExC_emit != orig_emit
17143 + NODE_STEP_REGNODE
17144 + REGNODE_ARG_LEN(REGEX_SET))
17146 vFAIL("Expecting interpolated extended charclass");
17148 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
17149 current = invlist_clone(resultant_invlist, NULL);
17150 SvREFCNT_dec(resultant_invlist);
17153 RExC_emit = orig_emit;
17154 goto handle_operand;
17157 /* A regular '('. Look behind for illegal syntax */
17158 if (top_index - fence >= 0) {
17159 /* If the top entry on the stack is an operator, it had
17160 * better be a '!', otherwise the entry below the top
17161 * operand should be an operator */
17162 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
17163 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
17164 || ( IS_OPERAND(*top_ptr)
17165 && ( top_index - fence < 1
17166 || ! (stacked_ptr = av_fetch(stack,
17169 || ! IS_OPERATOR(*stacked_ptr))))
17171 RExC_parse_inc_by(1);
17172 vFAIL("Unexpected '(' with no preceding operator");
17176 /* Stack the position of this undealt-with left paren */
17177 av_push(fence_stack, newSViv(fence));
17178 fence = top_index + 1;
17182 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
17183 * multi-char folds are allowed. */
17184 if (!regclass(pRExC_state, flagp, depth+1,
17185 TRUE, /* means parse just the next thing */
17186 FALSE, /* don't allow multi-char folds */
17187 FALSE, /* don't silence non-portable warnings. */
17189 FALSE, /* Require return to be an ANYOF */
17192 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17193 goto regclass_failed;
17198 /* regclass() will return with parsing just the \ sequence,
17199 * leaving the parse pointer at the next thing to parse */
17201 goto handle_operand;
17203 case '[': /* Is a bracketed character class */
17205 /* See if this is a [:posix:] class. */
17206 bool is_posix_class = (OOB_NAMEDCLASS
17207 < handle_possible_posix(pRExC_state,
17211 TRUE /* checking only */));
17212 /* If it is a posix class, leave the parse pointer at the '['
17213 * to fool regclass() into thinking it is part of a
17214 * '[[:posix:]]'. */
17215 if (! is_posix_class) {
17216 RExC_parse_inc_by(1);
17219 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
17220 * multi-char folds are allowed. */
17221 if (!regclass(pRExC_state, flagp, depth+1,
17222 is_posix_class, /* parse the whole char
17223 class only if not a
17225 FALSE, /* don't allow multi-char folds */
17226 TRUE, /* silence non-portable warnings. */
17228 FALSE, /* Require return to be an ANYOF */
17231 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17232 goto regclass_failed;
17237 /* function call leaves parse pointing to the ']', except if we
17239 if (is_posix_class) {
17243 goto handle_operand;
17247 if (top_index >= 1) {
17248 goto join_operators;
17251 /* Only a single operand on the stack: are done */
17255 if (av_tindex_skip_len_mg(fence_stack) < 0) {
17256 if (UCHARAT(RExC_parse - 1) == ']') {
17259 RExC_parse_inc_by(1);
17260 vFAIL("Unexpected ')'");
17263 /* If nothing after the fence, is missing an operand */
17264 if (top_index - fence < 0) {
17265 RExC_parse_inc_by(1);
17268 /* If at least two things on the stack, treat this as an
17270 if (top_index - fence >= 1) {
17271 goto join_operators;
17274 /* Here only a single thing on the fenced stack, and there is a
17275 * fence. Get rid of it */
17276 fence_ptr = av_pop(fence_stack);
17278 fence = SvIV(fence_ptr);
17279 SvREFCNT_dec_NN(fence_ptr);
17286 /* Having gotten rid of the fence, we pop the operand at the
17287 * stack top and process it as a newly encountered operand */
17288 current = av_pop(stack);
17289 if (IS_OPERAND(current)) {
17290 goto handle_operand;
17293 RExC_parse_inc_by(1);
17302 /* These binary operators should have a left operand already
17304 if ( top_index - fence < 0
17305 || top_index - fence == 1
17306 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
17307 || ! IS_OPERAND(*top_ptr))
17309 goto unexpected_binary;
17312 /* If only the one operand is on the part of the stack visible
17313 * to us, we just place this operator in the proper position */
17314 if (top_index - fence < 2) {
17316 /* Place the operator before the operand */
17318 SV* lhs = av_pop(stack);
17319 av_push(stack, newSVuv(curchar));
17320 av_push(stack, lhs);
17324 /* But if there is something else on the stack, we need to
17325 * process it before this new operator if and only if the
17326 * stacked operation has equal or higher precedence than the
17331 /* The operator on the stack is supposed to be below both its
17333 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
17334 || IS_OPERAND(*stacked_ptr))
17336 /* But if not, it's legal and indicates we are completely
17337 * done if and only if we're currently processing a ']',
17338 * which should be the final thing in the expression */
17339 if (curchar == ']') {
17344 RExC_parse_inc_by(1);
17345 vFAIL2("Unexpected binary operator '%c' with no "
17346 "preceding operand", curchar);
17348 stacked_operator = (char) SvUV(*stacked_ptr);
17350 if (regex_set_precedence(curchar)
17351 > regex_set_precedence(stacked_operator))
17353 /* Here, the new operator has higher precedence than the
17354 * stacked one. This means we need to add the new one to
17355 * the stack to await its rhs operand (and maybe more
17356 * stuff). We put it before the lhs operand, leaving
17357 * untouched the stacked operator and everything below it
17359 lhs = av_pop(stack);
17360 assert(IS_OPERAND(lhs));
17362 av_push(stack, newSVuv(curchar));
17363 av_push(stack, lhs);
17367 /* Here, the new operator has equal or lower precedence than
17368 * what's already there. This means the operation already
17369 * there should be performed now, before the new one. */
17371 rhs = av_pop(stack);
17372 if (! IS_OPERAND(rhs)) {
17374 /* This can happen when a ! is not followed by an operand,
17375 * like in /(?[\t &!])/ */
17379 lhs = av_pop(stack);
17381 if (! IS_OPERAND(lhs)) {
17383 /* This can happen when there is an empty (), like in
17384 * /(?[[0]+()+])/ */
17388 switch (stacked_operator) {
17390 _invlist_intersection(lhs, rhs, &rhs);
17395 _invlist_union(lhs, rhs, &rhs);
17399 _invlist_subtract(lhs, rhs, &rhs);
17402 case '^': /* The union minus the intersection */
17407 _invlist_union(lhs, rhs, &u);
17408 _invlist_intersection(lhs, rhs, &i);
17409 _invlist_subtract(u, i, &rhs);
17410 SvREFCNT_dec_NN(i);
17411 SvREFCNT_dec_NN(u);
17417 /* Here, the higher precedence operation has been done, and the
17418 * result is in 'rhs'. We overwrite the stacked operator with
17419 * the result. Then we redo this code to either push the new
17420 * operator onto the stack or perform any higher precedence
17421 * stacked operation */
17422 only_to_avoid_leaks = av_pop(stack);
17423 SvREFCNT_dec(only_to_avoid_leaks);
17424 av_push(stack, rhs);
17427 case '!': /* Highest priority, right associative */
17429 /* If what's already at the top of the stack is another '!",
17430 * they just cancel each other out */
17431 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
17432 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
17434 only_to_avoid_leaks = av_pop(stack);
17435 SvREFCNT_dec(only_to_avoid_leaks);
17437 else { /* Otherwise, since it's right associative, just push
17439 av_push(stack, newSVuv(curchar));
17445 if (RExC_parse >= RExC_end) {
17448 vFAIL("Unexpected character");
17452 /* Here 'current' is the operand. If something is already on the
17453 * stack, we have to check if it is a !. But first, the code above
17454 * may have altered the stack in the time since we earlier set
17457 top_index = av_tindex_skip_len_mg(stack);
17458 if (top_index - fence >= 0) {
17459 /* If the top entry on the stack is an operator, it had better
17460 * be a '!', otherwise the entry below the top operand should
17461 * be an operator */
17462 top_ptr = av_fetch(stack, top_index, FALSE);
17464 if (IS_OPERATOR(*top_ptr)) {
17466 /* The only permissible operator at the top of the stack is
17467 * '!', which is applied immediately to this operand. */
17468 curchar = (char) SvUV(*top_ptr);
17469 if (curchar != '!') {
17470 SvREFCNT_dec(current);
17471 vFAIL2("Unexpected binary operator '%c' with no "
17472 "preceding operand", curchar);
17475 _invlist_invert(current);
17477 only_to_avoid_leaks = av_pop(stack);
17478 SvREFCNT_dec(only_to_avoid_leaks);
17480 /* And we redo with the inverted operand. This allows
17481 * handling multiple ! in a row */
17482 goto handle_operand;
17484 /* Single operand is ok only for the non-binary ')'
17486 else if ((top_index - fence == 0 && curchar != ')')
17487 || (top_index - fence > 0
17488 && (! (stacked_ptr = av_fetch(stack,
17491 || IS_OPERAND(*stacked_ptr))))
17493 SvREFCNT_dec(current);
17494 vFAIL("Operand with no preceding operator");
17498 /* Here there was nothing on the stack or the top element was
17499 * another operand. Just add this new one */
17500 av_push(stack, current);
17502 } /* End of switch on next parse token */
17505 } /* End of loop parsing through the construct */
17507 vFAIL("Syntax error in (?[...])");
17511 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
17512 if (RExC_parse < RExC_end) {
17513 RExC_parse_inc_by(1);
17516 vFAIL("Unexpected ']' with no following ')' in (?[...");
17519 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
17520 vFAIL("Unmatched (");
17523 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
17524 || ((final = av_pop(stack)) == NULL)
17525 || ! IS_OPERAND(final)
17526 || ! is_invlist(final)
17527 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
17530 SvREFCNT_dec(final);
17531 vFAIL("Incomplete expression within '(?[ ])'");
17534 /* Here, 'final' is the resultant inversion list from evaluating the
17535 * expression. Return it if so requested */
17536 if (return_invlist) {
17537 *return_invlist = final;
17541 if (RExC_sets_depth) { /* If within a recursive call, return in a special
17543 RExC_parse_inc_by(1);
17544 node = regpnode(pRExC_state, REGEX_SET, final);
17548 /* Otherwise generate a resultant node, based on 'final'. regclass()
17549 * is expecting a string of ranges and individual code points */
17550 invlist_iterinit(final);
17551 result_string = newSVpvs("");
17552 while (invlist_iternext(final, &start, &end)) {
17553 if (start == end) {
17554 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
17557 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
17558 UVXf "}", start, end);
17562 /* About to generate an ANYOF (or similar) node from the inversion list
17563 * we have calculated */
17564 save_parse = RExC_parse;
17565 RExC_parse_set(SvPV(result_string, len));
17566 save_end = RExC_end;
17567 RExC_end = RExC_parse + len;
17568 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
17570 /* We turn off folding around the call, as the class we have
17571 * constructed already has all folding taken into consideration, and we
17572 * don't want regclass() to add to that */
17573 RExC_flags &= ~RXf_PMf_FOLD;
17574 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
17575 * folds are allowed. */
17576 node = regclass(pRExC_state, flagp, depth+1,
17577 FALSE, /* means parse the whole char class */
17578 FALSE, /* don't allow multi-char folds */
17579 TRUE, /* silence non-portable warnings. The above may
17580 very well have generated non-portable code
17581 points, but they're valid on this machine */
17582 FALSE, /* similarly, no need for strict */
17584 /* We can optimize into something besides an ANYOF,
17585 * except under /l, which needs to be ANYOF because of
17586 * runtime checks for locale sanity, etc */
17592 RExC_parse_set(save_parse + 1);
17593 RExC_end = save_end;
17594 SvREFCNT_dec_NN(final);
17595 SvREFCNT_dec_NN(result_string);
17598 RExC_flags |= RXf_PMf_FOLD;
17602 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17603 goto regclass_failed;
17606 /* Fix up the node type if we are in locale. (We have pretended we are
17607 * under /u for the purposes of regclass(), as this construct will only
17608 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
17609 * (so as to cause any warnings about bad locales to be output in
17610 * regexec.c), and add the flag that indicates to check if not in a
17611 * UTF-8 locale. The reason we above forbid optimization into
17612 * something other than an ANYOF node is simply to minimize the number
17613 * of code changes in regexec.c. Otherwise we would have to create new
17614 * EXACTish node types and deal with them. This decision could be
17615 * revisited should this construct become popular.
17617 * (One might think we could look at the resulting ANYOF node and
17618 * suppress the flag if everything is above 255, as those would be
17619 * UTF-8 only, but this isn't true, as the components that led to that
17620 * result could have been locale-affected, and just happen to cancel
17621 * each other out under UTF-8 locales.) */
17623 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
17625 assert(OP(REGNODE_p(node)) == ANYOF);
17627 OP(REGNODE_p(node)) = ANYOFL;
17628 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
17632 nextchar(pRExC_state);
17636 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17640 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17643 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17644 AV * stack, const IV fence, AV * fence_stack)
17645 { /* Dumps the stacks in handle_regex_sets() */
17647 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17648 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17651 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17653 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17655 if (stack_top < 0) {
17656 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17659 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17660 for (i = stack_top; i >= 0; i--) {
17661 SV ** element_ptr = av_fetch(stack, i, FALSE);
17662 if (! element_ptr) {
17665 if (IS_OPERATOR(*element_ptr)) {
17666 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17667 (int) i, (int) SvIV(*element_ptr));
17670 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17671 sv_dump(*element_ptr);
17676 if (fence_stack_top < 0) {
17677 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17680 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17681 for (i = fence_stack_top; i >= 0; i--) {
17682 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17683 if (! element_ptr) {
17686 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17687 (int) i, (int) SvIV(*element_ptr));
17698 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17700 /* This adds the Latin1/above-Latin1 folding rules.
17702 * This should be called only for a Latin1-range code points, cp, which is
17703 * known to be involved in a simple fold with other code points above
17704 * Latin1. It would give false results if /aa has been specified.
17705 * Multi-char folds are outside the scope of this, and must be handled
17708 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17710 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17712 /* The rules that are valid for all Unicode versions are hard-coded in */
17717 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17721 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17724 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17725 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17727 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17728 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17729 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17731 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17732 *invlist = add_cp_to_invlist(*invlist,
17733 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17736 default: /* Other code points are checked against the data for the
17737 current Unicode version */
17739 Size_t folds_count;
17741 const U32 * remaining_folds;
17745 folded_cp = toFOLD(cp);
17748 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17750 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17753 if (folded_cp > 255) {
17754 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17757 folds_count = _inverse_folds(folded_cp, &first_fold,
17759 if (folds_count == 0) {
17761 /* Use deprecated warning to increase the chances of this being
17763 ckWARN2reg_d(RExC_parse,
17764 "Perl folding rules are not up-to-date for 0x%02X;"
17765 " please use the perlbug utility to report;", cp);
17770 if (first_fold > 255) {
17771 *invlist = add_cp_to_invlist(*invlist, first_fold);
17773 for (i = 0; i < folds_count - 1; i++) {
17774 if (remaining_folds[i] > 255) {
17775 *invlist = add_cp_to_invlist(*invlist,
17776 remaining_folds[i]);
17786 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17788 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17792 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17794 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17796 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17797 CLEAR_POSIX_WARNINGS();
17801 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17802 if (first_is_fatal) { /* Avoid leaking this */
17803 av_undef(posix_warnings); /* This isn't necessary if the
17804 array is mortal, but is a
17806 (void) sv_2mortal(msg);
17809 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17810 SvREFCNT_dec_NN(msg);
17813 UPDATE_WARNINGS_LOC(RExC_parse);
17816 PERL_STATIC_INLINE Size_t
17817 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17819 const U8 * const start = s1;
17820 const U8 * const send = start + max;
17822 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17824 while (s1 < send && *s1 == *s2) {
17832 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17834 /* This adds the string scalar <multi_string> to the array
17835 * <multi_char_matches>. <multi_string> is known to have exactly
17836 * <cp_count> code points in it. This is used when constructing a
17837 * bracketed character class and we find something that needs to match more
17838 * than a single character.
17840 * <multi_char_matches> is actually an array of arrays. Each top-level
17841 * element is an array that contains all the strings known so far that are
17842 * the same length. And that length (in number of code points) is the same
17843 * as the index of the top-level array. Hence, the [2] element is an
17844 * array, each element thereof is a string containing TWO code points;
17845 * while element [3] is for strings of THREE characters, and so on. Since
17846 * this is for multi-char strings there can never be a [0] nor [1] element.
17848 * When we rewrite the character class below, we will do so such that the
17849 * longest strings are written first, so that it prefers the longest
17850 * matching strings first. This is done even if it turns out that any
17851 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17852 * Christiansen has agreed that this is ok. This makes the test for the
17853 * ligature 'ffi' come before the test for 'ff', for example */
17856 AV** this_array_ptr;
17858 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17860 if (! multi_char_matches) {
17861 multi_char_matches = newAV();
17864 if (av_exists(multi_char_matches, cp_count)) {
17865 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17866 this_array = *this_array_ptr;
17869 this_array = newAV();
17870 av_store(multi_char_matches, cp_count,
17873 av_push(this_array, multi_string);
17875 return multi_char_matches;
17878 /* The names of properties whose definitions are not known at compile time are
17879 * stored in this SV, after a constant heading. So if the length has been
17880 * changed since initialization, then there is a run-time definition. */
17881 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17882 (SvCUR(listsv) != initial_listsv_len)
17884 /* There is a restricted set of white space characters that are legal when
17885 * ignoring white space in a bracketed character class. This generates the
17886 * code to skip them.
17888 * There is a line below that uses the same white space criteria but is outside
17889 * this macro. Both here and there must use the same definition */
17890 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
17893 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
17900 STATIC regnode_offset
17901 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17902 const bool stop_at_1, /* Just parse the next thing, don't
17903 look for a full character class */
17904 bool allow_mutiple_chars,
17905 const bool silence_non_portable, /* Don't output warnings
17909 bool optimizable, /* ? Allow a non-ANYOF return
17911 SV** ret_invlist /* Return an inversion list, not a node */
17914 /* parse a bracketed class specification. Most of these will produce an
17915 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17916 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17917 * under /i with multi-character folds: it will be rewritten following the
17918 * paradigm of this example, where the <multi-fold>s are characters which
17919 * fold to multiple character sequences:
17920 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17921 * gets effectively rewritten as:
17922 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17923 * reg() gets called (recursively) on the rewritten version, and this
17924 * function will return what it constructs. (Actually the <multi-fold>s
17925 * aren't physically removed from the [abcdefghi], it's just that they are
17926 * ignored in the recursion by means of a flag:
17927 * <RExC_in_multi_char_class>.)
17929 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17930 * characters, with the corresponding bit set if that character is in the
17931 * list. For characters above this, an inversion list is used. There
17932 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17933 * determinable at compile time
17935 * On success, returns the offset at which any next node should be placed
17936 * into the regex engine program being compiled.
17938 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17939 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17943 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17945 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17946 regnode_offset ret = -1; /* Initialized to an illegal value */
17948 int namedclass = OOB_NAMEDCLASS;
17949 char *rangebegin = NULL;
17950 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17951 aren't available at the time this was called */
17952 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17953 than just initialized. */
17954 SV* properties = NULL; /* Code points that match \p{} \P{} */
17955 SV* posixes = NULL; /* Code points that match classes like [:word:],
17956 extended beyond the Latin1 range. These have to
17957 be kept separate from other code points for much
17958 of this function because their handling is
17959 different under /i, and for most classes under
17961 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17962 separate for a while from the non-complemented
17963 versions because of complications with /d
17965 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17966 treated more simply than the general case,
17967 leading to less compilation and execution
17969 UV element_count = 0; /* Number of distinct elements in the class.
17970 Optimizations may be possible if this is tiny */
17971 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17972 character; used under /i */
17974 char * stop_ptr = RExC_end; /* where to stop parsing */
17976 /* ignore unescaped whitespace? */
17977 const bool skip_white = cBOOL( ret_invlist
17978 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17980 /* inversion list of code points this node matches only when the target
17981 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17983 SV* upper_latin1_only_utf8_matches = NULL;
17985 /* Inversion list of code points this node matches regardless of things
17986 * like locale, folding, utf8ness of the target string */
17987 SV* cp_list = NULL;
17989 /* Like cp_list, but code points on this list need to be checked for things
17990 * that fold to/from them under /i */
17991 SV* cp_foldable_list = NULL;
17993 /* Like cp_list, but code points on this list are valid only when the
17994 * runtime locale is UTF-8 */
17995 SV* only_utf8_locale_list = NULL;
17997 /* In a range, if one of the endpoints is non-character-set portable,
17998 * meaning that it hard-codes a code point that may mean a different
17999 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
18000 * mnemonic '\t' which each mean the same character no matter which
18001 * character set the platform is on. */
18002 unsigned int non_portable_endpoint = 0;
18004 /* Is the range unicode? which means on a platform that isn't 1-1 native
18005 * to Unicode (i.e. non-ASCII), each code point in it should be considered
18006 * to be a Unicode value. */
18007 bool unicode_range = FALSE;
18008 bool invert = FALSE; /* Is this class to be complemented */
18010 bool warn_super = ALWAYS_WARN_SUPER;
18012 const char * orig_parse = RExC_parse;
18014 /* This variable is used to mark where the end in the input is of something
18015 * that looks like a POSIX construct but isn't. During the parse, when
18016 * something looks like it could be such a construct is encountered, it is
18017 * checked for being one, but not if we've already checked this area of the
18018 * input. Only after this position is reached do we check again */
18019 char *not_posix_region_end = RExC_parse - 1;
18021 AV* posix_warnings = NULL;
18022 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
18023 U8 op = ANYOF; /* The returned node-type, initialized to the expected
18025 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
18026 U32 posixl = 0; /* bit field of posix classes matched under /l */
18029 /* Flags as to what things aren't knowable until runtime. (Note that these are
18030 * mutually exclusive.) */
18031 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
18032 haven't been defined as of yet */
18033 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
18035 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
18036 what gets folded */
18037 U32 has_runtime_dependency = 0; /* OR of the above flags */
18039 DECLARE_AND_GET_RE_DEBUG_FLAGS;
18041 PERL_ARGS_ASSERT_REGCLASS;
18043 PERL_UNUSED_ARG(depth);
18046 assert(! (ret_invlist && allow_mutiple_chars));
18048 /* If wants an inversion list returned, we can't optimize to something
18051 optimizable = FALSE;
18054 DEBUG_PARSE("clas");
18056 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
18057 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
18058 && UNICODE_DOT_DOT_VERSION == 0)
18059 allow_mutiple_chars = FALSE;
18062 /* We include the /i status at the beginning of this so that we can
18063 * know it at runtime */
18064 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
18065 initial_listsv_len = SvCUR(listsv);
18066 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
18068 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18070 assert(RExC_parse <= RExC_end);
18072 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
18073 RExC_parse_inc_by(1);
18075 allow_mutiple_chars = FALSE;
18077 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18080 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
18081 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
18082 int maybe_class = handle_possible_posix(pRExC_state,
18084 ¬_posix_region_end,
18086 TRUE /* checking only */);
18087 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
18088 ckWARN4reg(not_posix_region_end,
18089 "POSIX syntax [%c %c] belongs inside character classes%s",
18090 *RExC_parse, *RExC_parse,
18091 (maybe_class == OOB_NAMEDCLASS)
18092 ? ((POSIXCC_NOTYET(*RExC_parse))
18093 ? " (but this one isn't implemented)"
18094 : " (but this one isn't fully valid)")
18100 /* If the caller wants us to just parse a single element, accomplish this
18101 * by faking the loop ending condition */
18102 if (stop_at_1 && RExC_end > RExC_parse) {
18103 stop_ptr = RExC_parse + 1;
18106 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
18107 if (UCHARAT(RExC_parse) == ']')
18108 goto charclassloop;
18112 if ( posix_warnings
18113 && av_tindex_skip_len_mg(posix_warnings) >= 0
18114 && RExC_parse > not_posix_region_end)
18116 /* Warnings about posix class issues are considered tentative until
18117 * we are far enough along in the parse that we can no longer
18118 * change our mind, at which point we output them. This is done
18119 * each time through the loop so that a later class won't zap them
18120 * before they have been dealt with. */
18121 output_posix_warnings(pRExC_state, posix_warnings);
18124 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18126 if (RExC_parse >= stop_ptr) {
18130 if (UCHARAT(RExC_parse) == ']') {
18136 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
18137 save_value = value;
18138 save_prevvalue = prevvalue;
18141 rangebegin = RExC_parse;
18143 non_portable_endpoint = 0;
18145 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
18146 value = utf8n_to_uvchr((U8*)RExC_parse,
18147 RExC_end - RExC_parse,
18148 &numlen, UTF8_ALLOW_DEFAULT);
18149 RExC_parse_inc_by(numlen);
18152 value = UCHARAT(RExC_parse);
18153 RExC_parse_inc_by(1);
18156 if (value == '[') {
18157 char * posix_class_end;
18158 namedclass = handle_possible_posix(pRExC_state,
18161 do_posix_warnings ? &posix_warnings : NULL,
18162 FALSE /* die if error */);
18163 if (namedclass > OOB_NAMEDCLASS) {
18165 /* If there was an earlier attempt to parse this particular
18166 * posix class, and it failed, it was a false alarm, as this
18167 * successful one proves */
18168 if ( posix_warnings
18169 && av_tindex_skip_len_mg(posix_warnings) >= 0
18170 && not_posix_region_end >= RExC_parse
18171 && not_posix_region_end <= posix_class_end)
18173 av_undef(posix_warnings);
18176 RExC_parse_set(posix_class_end);
18178 else if (namedclass == OOB_NAMEDCLASS) {
18179 not_posix_region_end = posix_class_end;
18182 namedclass = OOB_NAMEDCLASS;
18185 else if ( RExC_parse - 1 > not_posix_region_end
18186 && MAYBE_POSIXCC(value))
18188 (void) handle_possible_posix(
18190 RExC_parse - 1, /* -1 because parse has already been
18192 ¬_posix_region_end,
18193 do_posix_warnings ? &posix_warnings : NULL,
18194 TRUE /* checking only */);
18196 else if ( strict && ! skip_white
18197 && ( generic_isCC_(value, CC_VERTSPACE_)
18198 || is_VERTWS_cp_high(value)))
18200 vFAIL("Literal vertical space in [] is illegal except under /x");
18202 else if (value == '\\') {
18203 /* Is a backslash; get the code point of the char after it */
18205 if (RExC_parse >= RExC_end) {
18206 vFAIL("Unmatched [");
18209 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
18210 value = utf8n_to_uvchr((U8*)RExC_parse,
18211 RExC_end - RExC_parse,
18212 &numlen, UTF8_ALLOW_DEFAULT);
18213 RExC_parse_inc_by(numlen);
18216 value = UCHARAT(RExC_parse);
18217 RExC_parse_inc_by(1);
18220 /* Some compilers cannot handle switching on 64-bit integer
18221 * values, therefore value cannot be an UV. Yes, this will
18222 * be a problem later if we want switch on Unicode.
18223 * A similar issue a little bit later when switching on
18224 * namedclass. --jhi */
18226 /* If the \ is escaping white space when white space is being
18227 * skipped, it means that that white space is wanted literally, and
18228 * is already in 'value'. Otherwise, need to translate the escape
18229 * into what it signifies. */
18230 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
18231 const char * message;
18235 case 'w': namedclass = ANYOF_WORDCHAR; break;
18236 case 'W': namedclass = ANYOF_NWORDCHAR; break;
18237 case 's': namedclass = ANYOF_SPACE; break;
18238 case 'S': namedclass = ANYOF_NSPACE; break;
18239 case 'd': namedclass = ANYOF_DIGIT; break;
18240 case 'D': namedclass = ANYOF_NDIGIT; break;
18241 case 'v': namedclass = ANYOF_VERTWS; break;
18242 case 'V': namedclass = ANYOF_NVERTWS; break;
18243 case 'h': namedclass = ANYOF_HORIZWS; break;
18244 case 'H': namedclass = ANYOF_NHORIZWS; break;
18245 case 'N': /* Handle \N{NAME} in class */
18247 const char * const backslash_N_beg = RExC_parse - 2;
18250 if (! grok_bslash_N(pRExC_state,
18251 NULL, /* No regnode */
18252 &value, /* Yes single value */
18253 &cp_count, /* Multiple code pt count */
18259 if (*flagp & NEED_UTF8)
18260 FAIL("panic: grok_bslash_N set NEED_UTF8");
18262 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
18264 if (cp_count < 0) {
18265 vFAIL("\\N in a character class must be a named character: \\N{...}");
18267 else if (cp_count == 0) {
18268 ckWARNreg(RExC_parse,
18269 "Ignoring zero length \\N{} in character class");
18271 else { /* cp_count > 1 */
18272 assert(cp_count > 1);
18273 if (! RExC_in_multi_char_class) {
18274 if ( ! allow_mutiple_chars
18277 || *RExC_parse == '-')
18281 vFAIL("\\N{} here is restricted to one character");
18283 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
18284 break; /* <value> contains the first code
18285 point. Drop out of the switch to
18289 SV * multi_char_N = newSVpvn(backslash_N_beg,
18290 RExC_parse - backslash_N_beg);
18292 = add_multi_match(multi_char_matches,
18297 } /* End of cp_count != 1 */
18299 /* This element should not be processed further in this
18302 value = save_value;
18303 prevvalue = save_prevvalue;
18304 continue; /* Back to top of loop to get next char */
18307 /* Here, is a single code point, and <value> contains it */
18308 unicode_range = TRUE; /* \N{} are Unicode */
18316 if (RExC_pm_flags & PMf_WILDCARD) {
18317 RExC_parse_inc_by(1);
18318 /* diag_listed_as: Use of %s is not allowed in Unicode
18319 property wildcard subpatterns in regex; marked by <--
18321 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
18322 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
18325 /* \p means they want Unicode semantics */
18326 REQUIRE_UNI_RULES(flagp, 0);
18328 if (RExC_parse >= RExC_end)
18329 vFAIL2("Empty \\%c", (U8)value);
18330 if (*RExC_parse == '{') {
18331 const U8 c = (U8)value;
18332 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
18334 RExC_parse_inc_by(1);
18335 vFAIL2("Missing right brace on \\%c{}", c);
18338 RExC_parse_inc_by(1);
18340 /* White space is allowed adjacent to the braces and after
18341 * any '^', even when not under /x */
18342 while (isSPACE(*RExC_parse)) {
18343 RExC_parse_inc_by(1);
18346 if (UCHARAT(RExC_parse) == '^') {
18348 /* toggle. (The rhs xor gets the single bit that
18349 * differs between P and p; the other xor inverts just
18351 value ^= 'P' ^ 'p';
18353 RExC_parse_inc_by(1);
18354 while (isSPACE(*RExC_parse)) {
18355 RExC_parse_inc_by(1);
18359 if (e == RExC_parse)
18360 vFAIL2("Empty \\%c{}", c);
18362 n = e - RExC_parse;
18363 while (isSPACE(*(RExC_parse + n - 1)))
18366 } /* The \p isn't immediately followed by a '{' */
18367 else if (! isALPHA(*RExC_parse)) {
18368 RExC_parse_inc_safe();
18369 vFAIL2("Character following \\%c must be '{' or a "
18370 "single-character Unicode property name",
18378 char* name = RExC_parse;
18380 /* Any message returned about expanding the definition */
18381 SV* msg = newSVpvs_flags("", SVs_TEMP);
18383 /* If set TRUE, the property is user-defined as opposed to
18384 * official Unicode */
18385 bool user_defined = FALSE;
18386 AV * strings = NULL;
18388 SV * prop_definition = parse_uniprop_string(
18389 name, n, UTF, FOLD,
18390 FALSE, /* This is compile-time */
18392 /* We can't defer this defn when
18393 * the full result is required in
18395 ! cBOOL(ret_invlist),
18402 if (SvCUR(msg)) { /* Assumes any error causes a msg */
18403 assert(prop_definition == NULL);
18404 RExC_parse_set(e + 1);
18405 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
18406 thing so, or else the display is
18410 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
18411 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
18412 SvCUR(msg), SvPVX(msg)));
18415 assert(prop_definition || strings);
18419 if (! prop_definition) {
18420 RExC_parse_set(e + 1);
18421 vFAIL("Unicode string properties are not implemented in (?[...])");
18425 "Using just the single character results"
18426 " returned by \\p{} in (?[...])");
18429 else if (! RExC_in_multi_char_class) {
18430 if (invert ^ (value == 'P')) {
18431 RExC_parse_set(e + 1);
18432 vFAIL("Inverting a character class which contains"
18433 " a multi-character sequence is illegal");
18436 /* For each multi-character string ... */
18437 while (av_count(strings) > 0) {
18438 /* ... Each entry is itself an array of code
18440 AV * this_string = (AV *) av_shift( strings);
18441 STRLEN cp_count = av_count(this_string);
18442 SV * final = newSV(cp_count * 4);
18445 /* Create another string of sequences of \x{...} */
18446 while (av_count(this_string) > 0) {
18447 SV * character = av_shift(this_string);
18448 UV cp = SvUV(character);
18451 REQUIRE_UTF8(flagp);
18453 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
18455 SvREFCNT_dec_NN(character);
18457 SvREFCNT_dec_NN(this_string);
18459 /* And add that to the list of such things */
18461 = add_multi_match(multi_char_matches,
18466 SvREFCNT_dec_NN(strings);
18469 if (! prop_definition) { /* If we got only a string,
18470 this iteration didn't really
18471 find a character */
18474 else if (! is_invlist(prop_definition)) {
18476 /* Here, the definition isn't known, so we have gotten
18477 * returned a string that will be evaluated if and when
18478 * encountered at runtime. We add it to the list of
18479 * such properties, along with whether it should be
18480 * complemented or not */
18481 if (value == 'P') {
18482 sv_catpvs(listsv, "!");
18485 sv_catpvs(listsv, "+");
18487 sv_catsv(listsv, prop_definition);
18489 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
18491 /* We don't know yet what this matches, so have to flag
18493 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
18496 assert (prop_definition && is_invlist(prop_definition));
18498 /* Here we do have the complete property definition
18500 * Temporary workaround for [perl #133136]. For this
18501 * precise input that is in the .t that is failing,
18502 * load utf8.pm, which is what the test wants, so that
18503 * that .t passes */
18504 if ( memEQs(RExC_start, e + 1 - RExC_start,
18506 && ! hv_common(GvHVn(PL_incgv),
18508 "utf8.pm", sizeof("utf8.pm") - 1,
18509 0, HV_FETCH_ISEXISTS, NULL, 0))
18511 require_pv("utf8.pm");
18514 if (! user_defined &&
18515 /* We warn on matching an above-Unicode code point
18516 * if the match would return true, except don't
18517 * warn for \p{All}, which has exactly one element
18519 (_invlist_contains_cp(prop_definition, 0x110000)
18520 && (! (_invlist_len(prop_definition) == 1
18521 && *invlist_array(prop_definition) == 0))))
18526 /* Invert if asking for the complement */
18527 if (value == 'P') {
18528 _invlist_union_complement_2nd(properties,
18533 _invlist_union(properties, prop_definition, &properties);
18538 RExC_parse_set(e + 1);
18539 namedclass = ANYOF_UNIPROP; /* no official name, but it's
18543 case 'n': value = '\n'; break;
18544 case 'r': value = '\r'; break;
18545 case 't': value = '\t'; break;
18546 case 'f': value = '\f'; break;
18547 case 'b': value = '\b'; break;
18548 case 'e': value = ESC_NATIVE; break;
18549 case 'a': value = '\a'; break;
18551 RExC_parse--; /* function expects to be pointed at the 'o' */
18552 if (! grok_bslash_o(&RExC_parse,
18558 cBOOL(range), /* MAX_UV allowed for range
18564 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18565 warn_non_literal_string(RExC_parse, packed_warn, message);
18569 non_portable_endpoint++;
18573 RExC_parse--; /* function expects to be pointed at the 'x' */
18574 if (! grok_bslash_x(&RExC_parse,
18580 cBOOL(range), /* MAX_UV allowed for range
18586 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18587 warn_non_literal_string(RExC_parse, packed_warn, message);
18591 non_portable_endpoint++;
18595 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
18598 /* going to die anyway; point to exact spot of
18600 RExC_parse_inc_safe();
18604 value = grok_c_char;
18605 RExC_parse_inc_by(1);
18606 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18607 warn_non_literal_string(RExC_parse, packed_warn, message);
18610 non_portable_endpoint++;
18612 case '0': case '1': case '2': case '3': case '4':
18613 case '5': case '6': case '7':
18615 /* Take 1-3 octal digits */
18616 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
18617 | PERL_SCAN_NOTIFY_ILLDIGIT;
18618 numlen = (strict) ? 4 : 3;
18619 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
18620 RExC_parse_inc_by(numlen);
18623 RExC_parse_inc_safe();
18624 vFAIL("Need exactly 3 octal digits");
18626 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
18627 && RExC_parse < RExC_end
18628 && isDIGIT(*RExC_parse)
18629 && ckWARN(WARN_REGEXP))
18631 reg_warn_non_literal_string(
18633 form_alien_digit_msg(8, numlen, RExC_parse,
18634 RExC_end, UTF, FALSE));
18638 non_portable_endpoint++;
18643 /* Allow \_ to not give an error */
18644 if (isWORDCHAR(value) && value != '_') {
18646 vFAIL2("Unrecognized escape \\%c in character class",
18650 ckWARN2reg(RExC_parse,
18651 "Unrecognized escape \\%c in character class passed through",
18656 } /* End of switch on char following backslash */
18657 } /* end of handling backslash escape sequences */
18659 /* Here, we have the current token in 'value' */
18661 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18664 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
18665 * literal, as is the character that began the false range, i.e.
18666 * the 'a' in the examples */
18668 const int w = (RExC_parse >= rangebegin)
18669 ? RExC_parse - rangebegin
18673 "False [] range \"%" UTF8f "\"",
18674 UTF8fARG(UTF, w, rangebegin));
18677 ckWARN2reg(RExC_parse,
18678 "False [] range \"%" UTF8f "\"",
18679 UTF8fARG(UTF, w, rangebegin));
18680 cp_list = add_cp_to_invlist(cp_list, '-');
18681 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18685 range = 0; /* this was not a true range */
18686 element_count += 2; /* So counts for three values */
18689 classnum = namedclass_to_classnum(namedclass);
18691 if (LOC && namedclass < ANYOF_POSIXL_MAX
18692 #ifndef HAS_ISASCII
18693 && classnum != CC_ASCII_
18696 SV* scratch_list = NULL;
18698 /* What the Posix classes (like \w, [:space:]) match isn't
18699 * generally knowable under locale until actual match time. A
18700 * special node is used for these which has extra space for a
18701 * bitmap, with a bit reserved for each named class that is to
18702 * be matched against. (This isn't needed for \p{} and
18703 * pseudo-classes, as they are not affected by locale, and
18704 * hence are dealt with separately.) However, if a named class
18705 * and its complement are both present, then it matches
18706 * everything, and there is no runtime dependency. Odd numbers
18707 * are the complements of the next lower number, so xor works.
18708 * (Note that something like [\w\D] should match everything,
18709 * because \d should be a proper subset of \w. But rather than
18710 * trust that the locale is well behaved, we leave this to
18711 * runtime to sort out) */
18712 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18713 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18714 POSIXL_ZERO(posixl);
18715 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18716 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18717 continue; /* We could ignore the rest of the class, but
18718 best to parse it for any errors */
18720 else { /* Here, isn't the complement of any already parsed
18722 POSIXL_SET(posixl, namedclass);
18723 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18724 anyof_flags |= ANYOF_MATCHES_POSIXL;
18726 /* The above-Latin1 characters are not subject to locale
18727 * rules. Just add them to the unconditionally-matched
18730 /* Get the list of the above-Latin1 code points this
18732 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18733 PL_XPosix_ptrs[classnum],
18735 /* Odd numbers are complements,
18736 * like NDIGIT, NASCII, ... */
18737 namedclass % 2 != 0,
18739 /* Checking if 'cp_list' is NULL first saves an extra
18740 * clone. Its reference count will be decremented at the
18741 * next union, etc, or if this is the only instance, at the
18742 * end of the routine */
18744 cp_list = scratch_list;
18747 _invlist_union(cp_list, scratch_list, &cp_list);
18748 SvREFCNT_dec_NN(scratch_list);
18750 continue; /* Go get next character */
18755 /* Here, is not /l, or is a POSIX class for which /l doesn't
18756 * matter (or is a Unicode property, which is skipped here). */
18757 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18758 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18760 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18761 * nor /l make a difference in what these match,
18762 * therefore we just add what they match to cp_list. */
18763 if (classnum != CC_VERTSPACE_) {
18764 assert( namedclass == ANYOF_HORIZWS
18765 || namedclass == ANYOF_NHORIZWS);
18767 /* It turns out that \h is just a synonym for
18769 classnum = CC_BLANK_;
18772 _invlist_union_maybe_complement_2nd(
18774 PL_XPosix_ptrs[classnum],
18775 namedclass % 2 != 0, /* Complement if odd
18776 (NHORIZWS, NVERTWS)
18781 else if ( AT_LEAST_UNI_SEMANTICS
18782 || classnum == CC_ASCII_
18783 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
18784 || classnum == CC_XDIGIT_)))
18786 /* We usually have to worry about /d affecting what POSIX
18787 * classes match, with special code needed because we won't
18788 * know until runtime what all matches. But there is no
18789 * extra work needed under /u and /a; and [:ascii:] is
18790 * unaffected by /d; and :digit: and :xdigit: don't have
18791 * runtime differences under /d. So we can special case
18792 * these, and avoid some extra work below, and at runtime.
18794 _invlist_union_maybe_complement_2nd(
18796 ((AT_LEAST_ASCII_RESTRICTED)
18797 ? PL_Posix_ptrs[classnum]
18798 : PL_XPosix_ptrs[classnum]),
18799 namedclass % 2 != 0,
18802 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18803 complement and use nposixes */
18804 SV** posixes_ptr = namedclass % 2 == 0
18807 _invlist_union_maybe_complement_2nd(
18809 PL_XPosix_ptrs[classnum],
18810 namedclass % 2 != 0,
18814 } /* end of namedclass \blah */
18816 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18818 /* If 'range' is set, 'value' is the ending of a range--check its
18819 * validity. (If value isn't a single code point in the case of a
18820 * range, we should have figured that out above in the code that
18821 * catches false ranges). Later, we will handle each individual code
18822 * point in the range. If 'range' isn't set, this could be the
18823 * beginning of a range, so check for that by looking ahead to see if
18824 * the next real character to be processed is the range indicator--the
18829 /* For unicode ranges, we have to test that the Unicode as opposed
18830 * to the native values are not decreasing. (Above 255, there is
18831 * no difference between native and Unicode) */
18832 if (unicode_range && prevvalue < 255 && value < 255) {
18833 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18834 goto backwards_range;
18839 if (prevvalue > value) /* b-a */ {
18844 w = RExC_parse - rangebegin;
18846 "Invalid [] range \"%" UTF8f "\"",
18847 UTF8fARG(UTF, w, rangebegin));
18848 NOT_REACHED; /* NOTREACHED */
18852 prevvalue = value; /* save the beginning of the potential range */
18853 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18854 && *RExC_parse == '-')
18856 char* next_char_ptr = RExC_parse + 1;
18858 /* Get the next real char after the '-' */
18859 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18861 /* If the '-' is at the end of the class (just before the ']',
18862 * it is a literal minus; otherwise it is a range */
18863 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18864 RExC_parse_set(next_char_ptr);
18866 /* a bad range like \w-, [:word:]- ? */
18867 if (namedclass > OOB_NAMEDCLASS) {
18868 if (strict || ckWARN(WARN_REGEXP)) {
18869 const int w = RExC_parse >= rangebegin
18870 ? RExC_parse - rangebegin
18873 vFAIL4("False [] range \"%*.*s\"",
18878 "False [] range \"%*.*s\"",
18882 cp_list = add_cp_to_invlist(cp_list, '-');
18885 range = 1; /* yeah, it's a range! */
18886 continue; /* but do it the next time */
18891 if (namedclass > OOB_NAMEDCLASS) {
18895 /* Here, we have a single value this time through the loop, and
18896 * <prevvalue> is the beginning of the range, if any; or <value> if
18899 /* non-Latin1 code point implies unicode semantics. */
18901 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18902 || prevvalue > MAX_LEGAL_CP))
18904 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18906 REQUIRE_UNI_RULES(flagp, 0);
18907 if ( ! silence_non_portable
18908 && UNICODE_IS_PERL_EXTENDED(value)
18909 && TO_OUTPUT_WARNINGS(RExC_parse))
18911 ckWARN2_non_literal_string(RExC_parse,
18912 packWARN(WARN_PORTABLE),
18913 PL_extended_cp_format,
18918 /* Ready to process either the single value, or the completed range.
18919 * For single-valued non-inverted ranges, we consider the possibility
18920 * of multi-char folds. (We made a conscious decision to not do this
18921 * for the other cases because it can often lead to non-intuitive
18922 * results. For example, you have the peculiar case that:
18923 * "s s" =~ /^[^\xDF]+$/i => Y
18924 * "ss" =~ /^[^\xDF]+$/i => N
18926 * See [perl #89750] */
18927 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18928 if ( value == LATIN_SMALL_LETTER_SHARP_S
18929 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18932 /* Here <value> is indeed a multi-char fold. Get what it is */
18934 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18937 UV folded = _to_uni_fold_flags(
18941 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18942 ? FOLD_FLAGS_NOMIX_ASCII
18946 /* Here, <folded> should be the first character of the
18947 * multi-char fold of <value>, with <foldbuf> containing the
18948 * whole thing. But, if this fold is not allowed (because of
18949 * the flags), <fold> will be the same as <value>, and should
18950 * be processed like any other character, so skip the special
18952 if (folded != value) {
18954 /* Skip if we are recursed, currently parsing the class
18955 * again. Otherwise add this character to the list of
18956 * multi-char folds. */
18957 if (! RExC_in_multi_char_class) {
18958 STRLEN cp_count = utf8_length(foldbuf,
18959 foldbuf + foldlen);
18960 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
18962 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18965 = add_multi_match(multi_char_matches,
18971 /* This element should not be processed further in this
18974 value = save_value;
18975 prevvalue = save_prevvalue;
18981 if (strict && ckWARN(WARN_REGEXP)) {
18984 /* If the range starts above 255, everything is portable and
18985 * likely to be so for any forseeable character set, so don't
18987 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18988 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18990 else if (prevvalue != value) {
18992 /* Under strict, ranges that stop and/or end in an ASCII
18993 * printable should have each end point be a portable value
18994 * for it (preferably like 'A', but we don't warn if it is
18995 * a (portable) Unicode name or code point), and the range
18996 * must be all digits or all letters of the same case.
18997 * Otherwise, the range is non-portable and unclear as to
18998 * what it contains */
18999 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
19000 && ( non_portable_endpoint
19001 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
19002 || (isLOWER_A(prevvalue) && isLOWER_A(value))
19003 || (isUPPER_A(prevvalue) && isUPPER_A(value))
19005 vWARN(RExC_parse, "Ranges of ASCII printables should"
19006 " be some subset of \"0-9\","
19007 " \"A-Z\", or \"a-z\"");
19009 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
19010 SSize_t index_start;
19011 SSize_t index_final;
19013 /* But the nature of Unicode and languages mean we
19014 * can't do the same checks for above-ASCII ranges,
19015 * except in the case of digit ones. These should
19016 * contain only digits from the same group of 10. The
19017 * ASCII case is handled just above. Hence here, the
19018 * range could be a range of digits. First some
19019 * unlikely special cases. Grandfather in that a range
19020 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
19021 * if its starting value is one of the 10 digits prior
19022 * to it. This is because it is an alternate way of
19023 * writing 19D1, and some people may expect it to be in
19024 * that group. But it is bad, because it won't give
19025 * the expected results. In Unicode 5.2 it was
19026 * considered to be in that group (of 11, hence), but
19027 * this was fixed in the next version */
19029 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
19030 goto warn_bad_digit_range;
19032 else if (UNLIKELY( prevvalue >= 0x1D7CE
19033 && value <= 0x1D7FF))
19035 /* This is the only other case currently in Unicode
19036 * where the algorithm below fails. The code
19037 * points just above are the end points of a single
19038 * range containing only decimal digits. It is 5
19039 * different series of 0-9. All other ranges of
19040 * digits currently in Unicode are just a single
19041 * series. (And mktables will notify us if a later
19042 * Unicode version breaks this.)
19044 * If the range being checked is at most 9 long,
19045 * and the digit values represented are in
19046 * numerical order, they are from the same series.
19048 if ( value - prevvalue > 9
19049 || ((( value - 0x1D7CE) % 10)
19050 <= (prevvalue - 0x1D7CE) % 10))
19052 goto warn_bad_digit_range;
19057 /* For all other ranges of digits in Unicode, the
19058 * algorithm is just to check if both end points
19059 * are in the same series, which is the same range.
19061 index_start = _invlist_search(
19062 PL_XPosix_ptrs[CC_DIGIT_],
19065 /* Warn if the range starts and ends with a digit,
19066 * and they are not in the same group of 10. */
19067 if ( index_start >= 0
19068 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
19070 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
19071 value)) != index_start
19072 && index_final >= 0
19073 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
19075 warn_bad_digit_range:
19076 vWARN(RExC_parse, "Ranges of digits should be"
19077 " from the same group of"
19084 if ((! range || prevvalue == value) && non_portable_endpoint) {
19085 if (isPRINT_A(value)) {
19088 if (isBACKSLASHED_PUNCT(value)) {
19089 literal[d++] = '\\';
19091 literal[d++] = (char) value;
19092 literal[d++] = '\0';
19095 "\"%.*s\" is more clearly written simply as \"%s\"",
19096 (int) (RExC_parse - rangebegin),
19101 else if (isMNEMONIC_CNTRL(value)) {
19103 "\"%.*s\" is more clearly written simply as \"%s\"",
19104 (int) (RExC_parse - rangebegin),
19106 cntrl_to_mnemonic((U8) value)
19112 /* Deal with this element of the class */
19115 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19118 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
19119 * that don't require special handling, we can just add the range like
19120 * we do for ASCII platforms */
19121 if ((UNLIKELY(prevvalue == 0) && value >= 255)
19122 || ! (prevvalue < 256
19124 || (! non_portable_endpoint
19125 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
19126 || (isUPPER_A(prevvalue)
19127 && isUPPER_A(value)))))))
19129 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19133 /* Here, requires special handling. This can be because it is a
19134 * range whose code points are considered to be Unicode, and so
19135 * must be individually translated into native, or because its a
19136 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
19137 * EBCDIC, but we have defined them to include only the "expected"
19138 * upper or lower case ASCII alphabetics. Subranges above 255 are
19139 * the same in native and Unicode, so can be added as a range */
19140 U8 start = NATIVE_TO_LATIN1(prevvalue);
19142 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
19143 for (j = start; j <= end; j++) {
19144 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
19147 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19153 range = 0; /* this range (if it was one) is done now */
19154 } /* End of loop through all the text within the brackets */
19156 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
19157 output_posix_warnings(pRExC_state, posix_warnings);
19160 /* If anything in the class expands to more than one character, we have to
19161 * deal with them by building up a substitute parse string, and recursively
19162 * calling reg() on it, instead of proceeding */
19163 if (multi_char_matches) {
19164 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
19167 char *save_end = RExC_end;
19168 char *save_parse = RExC_parse;
19169 char *save_start = RExC_start;
19170 Size_t constructed_prefix_len = 0; /* This gives the length of the
19171 constructed portion of the
19172 substitute parse. */
19173 bool first_time = TRUE; /* First multi-char occurrence doesn't get
19178 /* Only one level of recursion allowed */
19179 assert(RExC_copy_start_in_constructed == RExC_precomp);
19181 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
19182 because too confusing */
19184 sv_catpvs(substitute_parse, "(?:");
19188 /* Look at the longest strings first */
19189 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
19194 if (av_exists(multi_char_matches, cp_count)) {
19195 AV** this_array_ptr;
19198 this_array_ptr = (AV**) av_fetch(multi_char_matches,
19200 while ((this_sequence = av_pop(*this_array_ptr)) !=
19203 if (! first_time) {
19204 sv_catpvs(substitute_parse, "|");
19206 first_time = FALSE;
19208 sv_catpv(substitute_parse, SvPVX(this_sequence));
19213 /* If the character class contains anything else besides these
19214 * multi-character strings, have to include it in recursive parsing */
19215 if (element_count) {
19216 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
19218 sv_catpvs(substitute_parse, "|");
19219 if (has_l_bracket) { /* Add an [ if the original had one */
19220 sv_catpvs(substitute_parse, "[");
19222 constructed_prefix_len = SvCUR(substitute_parse);
19223 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
19225 /* Put in a closing ']' to match any opening one, but not if going
19226 * off the end, as otherwise we are adding something that really
19228 if (has_l_bracket && RExC_parse < RExC_end) {
19229 sv_catpvs(substitute_parse, "]");
19233 sv_catpvs(substitute_parse, ")");
19236 /* This is a way to get the parse to skip forward a whole named
19237 * sequence instead of matching the 2nd character when it fails the
19239 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
19243 /* Set up the data structure so that any errors will be properly
19244 * reported. See the comments at the definition of
19245 * REPORT_LOCATION_ARGS for details */
19246 RExC_copy_start_in_input = (char *) orig_parse;
19247 RExC_start = SvPV(substitute_parse, len);
19248 RExC_parse_set( RExC_start );
19249 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
19250 RExC_end = RExC_parse + len;
19251 RExC_in_multi_char_class = 1;
19253 ret = reg(pRExC_state, 1, ®_flags, depth+1);
19255 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
19257 /* And restore so can parse the rest of the pattern */
19258 RExC_parse_set(save_parse);
19259 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
19260 RExC_end = save_end;
19261 RExC_in_multi_char_class = 0;
19262 SvREFCNT_dec_NN(multi_char_matches);
19263 SvREFCNT_dec(properties);
19264 SvREFCNT_dec(cp_list);
19265 SvREFCNT_dec(simple_posixes);
19266 SvREFCNT_dec(posixes);
19267 SvREFCNT_dec(nposixes);
19268 SvREFCNT_dec(cp_foldable_list);
19272 /* If folding, we calculate all characters that could fold to or from the
19273 * ones already on the list */
19274 if (cp_foldable_list) {
19276 UV start, end; /* End points of code point ranges */
19278 SV* fold_intersection = NULL;
19281 /* Our calculated list will be for Unicode rules. For locale
19282 * matching, we have to keep a separate list that is consulted at
19283 * runtime only when the locale indicates Unicode rules (and we
19284 * don't include potential matches in the ASCII/Latin1 range, as
19285 * any code point could fold to any other, based on the run-time
19286 * locale). For non-locale, we just use the general list */
19288 use_list = &only_utf8_locale_list;
19291 use_list = &cp_list;
19294 /* Only the characters in this class that participate in folds need
19295 * be checked. Get the intersection of this class and all the
19296 * possible characters that are foldable. This can quickly narrow
19297 * down a large class */
19298 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
19299 &fold_intersection);
19301 /* Now look at the foldable characters in this class individually */
19302 invlist_iterinit(fold_intersection);
19303 while (invlist_iternext(fold_intersection, &start, &end)) {
19307 /* Look at every character in the range */
19308 for (j = start; j <= end; j++) {
19309 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
19312 Size_t folds_count;
19314 const U32 * remaining_folds;
19318 /* Under /l, we don't know what code points below 256
19319 * fold to, except we do know the MICRO SIGN folds to
19320 * an above-255 character if the locale is UTF-8, so we
19321 * add it to the special list (in *use_list) Otherwise
19322 * we know now what things can match, though some folds
19323 * are valid under /d only if the target is UTF-8.
19324 * Those go in a separate list */
19325 if ( IS_IN_SOME_FOLD_L1(j)
19326 && ! (LOC && j != MICRO_SIGN))
19329 /* ASCII is always matched; non-ASCII is matched
19330 * only under Unicode rules (which could happen
19331 * under /l if the locale is a UTF-8 one */
19332 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
19333 *use_list = add_cp_to_invlist(*use_list,
19334 PL_fold_latin1[j]);
19336 else if (j != PL_fold_latin1[j]) {
19337 upper_latin1_only_utf8_matches
19338 = add_cp_to_invlist(
19339 upper_latin1_only_utf8_matches,
19340 PL_fold_latin1[j]);
19344 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
19345 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
19347 add_above_Latin1_folds(pRExC_state,
19354 /* Here is an above Latin1 character. We don't have the
19355 * rules hard-coded for it. First, get its fold. This is
19356 * the simple fold, as the multi-character folds have been
19357 * handled earlier and separated out */
19358 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
19359 (ASCII_FOLD_RESTRICTED)
19360 ? FOLD_FLAGS_NOMIX_ASCII
19363 /* Single character fold of above Latin1. Add everything
19364 * in its fold closure to the list that this node should
19366 folds_count = _inverse_folds(folded, &first_fold,
19368 for (k = 0; k <= folds_count; k++) {
19369 UV c = (k == 0) /* First time through use itself */
19371 : (k == 1) /* 2nd time use, the first fold */
19374 /* Then the remaining ones */
19375 : remaining_folds[k-2];
19377 /* /aa doesn't allow folds between ASCII and non- */
19378 if (( ASCII_FOLD_RESTRICTED
19379 && (isASCII(c) != isASCII(j))))
19384 /* Folds under /l which cross the 255/256 boundary are
19385 * added to a separate list. (These are valid only
19386 * when the locale is UTF-8.) */
19387 if (c < 256 && LOC) {
19388 *use_list = add_cp_to_invlist(*use_list, c);
19392 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
19394 cp_list = add_cp_to_invlist(cp_list, c);
19397 /* Similarly folds involving non-ascii Latin1
19398 * characters under /d are added to their list */
19399 upper_latin1_only_utf8_matches
19400 = add_cp_to_invlist(
19401 upper_latin1_only_utf8_matches,
19407 SvREFCNT_dec_NN(fold_intersection);
19410 /* Now that we have finished adding all the folds, there is no reason
19411 * to keep the foldable list separate */
19412 _invlist_union(cp_list, cp_foldable_list, &cp_list);
19413 SvREFCNT_dec_NN(cp_foldable_list);
19416 /* And combine the result (if any) with any inversion lists from posix
19417 * classes. The lists are kept separate up to now because we don't want to
19418 * fold the classes */
19419 if (simple_posixes) { /* These are the classes known to be unaffected by
19422 _invlist_union(cp_list, simple_posixes, &cp_list);
19423 SvREFCNT_dec_NN(simple_posixes);
19426 cp_list = simple_posixes;
19429 if (posixes || nposixes) {
19430 if (! DEPENDS_SEMANTICS) {
19432 /* For everything but /d, we can just add the current 'posixes' and
19433 * 'nposixes' to the main list */
19436 _invlist_union(cp_list, posixes, &cp_list);
19437 SvREFCNT_dec_NN(posixes);
19445 _invlist_union(cp_list, nposixes, &cp_list);
19446 SvREFCNT_dec_NN(nposixes);
19449 cp_list = nposixes;
19454 /* Under /d, things like \w match upper Latin1 characters only if
19455 * the target string is in UTF-8. But things like \W match all the
19456 * upper Latin1 characters if the target string is not in UTF-8.
19458 * Handle the case with something like \W separately */
19460 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
19462 /* A complemented posix class matches all upper Latin1
19463 * characters if not in UTF-8. And it matches just certain
19464 * ones when in UTF-8. That means those certain ones are
19465 * matched regardless, so can just be added to the
19466 * unconditional list */
19468 _invlist_union(cp_list, nposixes, &cp_list);
19469 SvREFCNT_dec_NN(nposixes);
19473 cp_list = nposixes;
19476 /* Likewise for 'posixes' */
19477 _invlist_union(posixes, cp_list, &cp_list);
19478 SvREFCNT_dec(posixes);
19480 /* Likewise for anything else in the range that matched only
19482 if (upper_latin1_only_utf8_matches) {
19483 _invlist_union(cp_list,
19484 upper_latin1_only_utf8_matches,
19486 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19487 upper_latin1_only_utf8_matches = NULL;
19490 /* If we don't match all the upper Latin1 characters regardless
19491 * of UTF-8ness, we have to set a flag to match the rest when
19493 _invlist_subtract(only_non_utf8_list, cp_list,
19494 &only_non_utf8_list);
19495 if (_invlist_len(only_non_utf8_list) != 0) {
19496 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
19498 SvREFCNT_dec_NN(only_non_utf8_list);
19501 /* Here there were no complemented posix classes. That means
19502 * the upper Latin1 characters in 'posixes' match only when the
19503 * target string is in UTF-8. So we have to add them to the
19504 * list of those types of code points, while adding the
19505 * remainder to the unconditional list.
19507 * First calculate what they are */
19508 SV* nonascii_but_latin1_properties = NULL;
19509 _invlist_intersection(posixes, PL_UpperLatin1,
19510 &nonascii_but_latin1_properties);
19512 /* And add them to the final list of such characters. */
19513 _invlist_union(upper_latin1_only_utf8_matches,
19514 nonascii_but_latin1_properties,
19515 &upper_latin1_only_utf8_matches);
19517 /* Remove them from what now becomes the unconditional list */
19518 _invlist_subtract(posixes, nonascii_but_latin1_properties,
19521 /* And add those unconditional ones to the final list */
19523 _invlist_union(cp_list, posixes, &cp_list);
19524 SvREFCNT_dec_NN(posixes);
19531 SvREFCNT_dec(nonascii_but_latin1_properties);
19533 /* Get rid of any characters from the conditional list that we
19534 * now know are matched unconditionally, which may make that
19536 _invlist_subtract(upper_latin1_only_utf8_matches,
19538 &upper_latin1_only_utf8_matches);
19539 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
19540 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19541 upper_latin1_only_utf8_matches = NULL;
19547 /* And combine the result (if any) with any inversion list from properties.
19548 * The lists are kept separate up to now so that we can distinguish the two
19549 * in regards to matching above-Unicode. A run-time warning is generated
19550 * if a Unicode property is matched against a non-Unicode code point. But,
19551 * we allow user-defined properties to match anything, without any warning,
19552 * and we also suppress the warning if there is a portion of the character
19553 * class that isn't a Unicode property, and which matches above Unicode, \W
19554 * or [\x{110000}] for example.
19555 * (Note that in this case, unlike the Posix one above, there is no
19556 * <upper_latin1_only_utf8_matches>, because having a Unicode property
19557 * forces Unicode semantics */
19561 /* If it matters to the final outcome, see if a non-property
19562 * component of the class matches above Unicode. If so, the
19563 * warning gets suppressed. This is true even if just a single
19564 * such code point is specified, as, though not strictly correct if
19565 * another such code point is matched against, the fact that they
19566 * are using above-Unicode code points indicates they should know
19567 * the issues involved */
19569 warn_super = ! (invert
19570 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
19573 _invlist_union(properties, cp_list, &cp_list);
19574 SvREFCNT_dec_NN(properties);
19577 cp_list = properties;
19581 anyof_flags |= ANYOF_WARN_SUPER__shared;
19583 /* Because an ANYOF node is the only one that warns, this node
19584 * can't be optimized into something else */
19585 optimizable = FALSE;
19589 /* Here, we have calculated what code points should be in the character
19592 * Now we can see about various optimizations. Fold calculation (which we
19593 * did above) needs to take place before inversion. Otherwise /[^k]/i
19594 * would invert to include K, which under /i would match k, which it
19595 * shouldn't. Therefore we can't invert folded locale now, as it won't be
19596 * folded until runtime */
19598 /* If we didn't do folding, it's because some information isn't available
19599 * until runtime; set the run-time fold flag for these We know to set the
19600 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
19601 * at least one 0-255 range code point */
19604 /* Some things on the list might be unconditionally included because of
19605 * other components. Remove them, and clean up the list if it goes to
19607 if (only_utf8_locale_list && cp_list) {
19608 _invlist_subtract(only_utf8_locale_list, cp_list,
19609 &only_utf8_locale_list);
19611 if (_invlist_len(only_utf8_locale_list) == 0) {
19612 SvREFCNT_dec_NN(only_utf8_locale_list);
19613 only_utf8_locale_list = NULL;
19616 if ( only_utf8_locale_list
19618 && ( _invlist_contains_cp(cp_list,
19619 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
19620 || _invlist_contains_cp(cp_list,
19621 LATIN_SMALL_LETTER_DOTLESS_I))))
19623 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19624 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19626 else if (cp_list && invlist_lowest(cp_list) < 256) {
19627 /* If nothing is below 256, has no locale dependency; otherwise it
19629 anyof_flags |= ANYOFL_FOLD;
19630 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19632 /* In a Turkish locale these could match, notify the run-time code
19633 * to check for that */
19634 if ( _invlist_contains_cp(cp_list, 'I')
19635 || _invlist_contains_cp(cp_list, 'i'))
19637 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19641 else if ( DEPENDS_SEMANTICS
19642 && ( upper_latin1_only_utf8_matches
19644 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
19646 RExC_seen_d_op = TRUE;
19647 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
19650 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19654 && ! has_runtime_dependency)
19656 _invlist_invert(cp_list);
19658 /* Clear the invert flag since have just done it here */
19662 /* All possible optimizations below still have these characteristics.
19663 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19665 *flagp |= HASWIDTH|SIMPLE;
19668 *ret_invlist = cp_list;
19670 return (cp_list) ? RExC_emit : 0;
19673 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19674 RExC_contains_locale = 1;
19679 /* Some character classes are equivalent to other nodes. Such nodes
19680 * take up less room, and some nodes require fewer operations to
19681 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
19682 * adjacent nodes to improve efficiency. */
19683 op = optimize_regclass(pRExC_state, cp_list,
19684 only_utf8_locale_list,
19685 upper_latin1_only_utf8_matches,
19686 has_runtime_dependency,
19688 &anyof_flags, &invert, &ret, flagp);
19689 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
19691 /* If optimized to something else and emitted, clean up and return */
19693 SvREFCNT_dec(cp_list);;
19694 SvREFCNT_dec(only_utf8_locale_list);
19695 SvREFCNT_dec(upper_latin1_only_utf8_matches);
19699 /* If no optimization was found, an END was returned and we will now
19706 /* Here are going to emit an ANYOF; set the particular type */
19708 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
19719 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
19720 FILL_NODE(ret, op); /* We set the argument later */
19721 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
19722 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19724 /* Here, <cp_list> contains all the code points we can determine at
19725 * compile time that match under all conditions. Go through it, and
19726 * for things that belong in the bitmap, put them there, and delete from
19727 * <cp_list>. While we are at it, see if everything above 255 is in the
19728 * list, and if so, set a flag to speed up execution */
19730 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
19733 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19737 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19740 /* Here, the bitmap has been populated with all the Latin1 code points that
19741 * always match. Can now add to the overall list those that match only
19742 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19744 if (upper_latin1_only_utf8_matches) {
19746 _invlist_union(cp_list,
19747 upper_latin1_only_utf8_matches,
19749 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19752 cp_list = upper_latin1_only_utf8_matches;
19754 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19757 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19758 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19761 only_utf8_locale_list);
19763 SvREFCNT_dec(cp_list);;
19764 SvREFCNT_dec(only_utf8_locale_list);
19769 S_optimize_regclass(pTHX_
19770 RExC_state_t *pRExC_state,
19772 SV* only_utf8_locale_list,
19773 SV* upper_latin1_only_utf8_matches,
19774 const U32 has_runtime_dependency,
19778 regnode_offset * ret,
19782 /* This function exists just to make S_regclass() smaller. It extracts out
19783 * the code that looks for potential optimizations away from a full generic
19784 * ANYOF node. The parameter names are the same as the corresponding
19785 * variables in S_regclass.
19787 * It returns the new op (the impossible END one if no optimization found)
19788 * and sets *ret to any created regnode. If the new op is sufficiently
19789 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
19791 * Certain of the parameters may be updated as a result of the changes
19794 U8 op = END; /* The returned node-type, initialized to an impossible
19797 PERL_UINT_FAST8_T i;
19798 UV partial_cp_count = 0;
19799 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19800 UV end[MAX_FOLD_FROMS+1] = { 0 };
19801 bool single_range = FALSE;
19802 UV lowest_cp = 0, highest_cp = 0;
19804 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
19806 if (cp_list) { /* Count the code points in enough ranges that we would see
19807 all the ones possible in any fold in this version of
19810 invlist_iterinit(cp_list);
19811 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19812 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19815 partial_cp_count += end[i] - start[i] + 1;
19819 single_range = TRUE;
19821 invlist_iterfinish(cp_list);
19823 /* If we know at compile time that this matches every possible code
19824 * point, any run-time dependencies don't matter */
19825 if (start[0] == 0 && end[0] == UV_MAX) {
19827 goto return_OPFAIL;
19834 /* Use a clearer mnemonic for below */
19835 lowest_cp = start[0];
19837 highest_cp = invlist_highest(cp_list);
19840 /* Similarly, for /l posix classes, if both a class and its complement
19841 * match, any run-time dependencies don't matter */
19844 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
19845 if ( POSIXL_TEST(posixl, namedclass) /* class */
19846 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19849 goto return_OPFAIL;
19858 /* For well-behaved locales, some classes are subsets of others, so
19859 * complementing the subset and including the non-complemented superset
19860 * should match everything, like [\D[:alnum:]], and
19861 * [[:^alpha:][:alnum:]], but some implementations of locales are
19862 * buggy, and khw thinks its a bad idea to have optimization change
19863 * behavior, even if it avoids an OS bug in a given case */
19865 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19867 /* If is a single posix /l class, can optimize to just that op. Such a
19868 * node will not match anything in the Latin1 range, as that is not
19869 * determinable until runtime, but will match whatever the class does
19870 * outside that range. (Note that some classes won't match anything
19871 * outside the range, like [:ascii:]) */
19872 if ( isSINGLE_BIT_SET(posixl)
19873 && (partial_cp_count == 0 || lowest_cp > 255))
19876 SV * class_above_latin1 = NULL;
19877 bool already_inverted;
19878 bool are_equivalent;
19881 namedclass = single_1bit_pos32(posixl);
19882 classnum = namedclass_to_classnum(namedclass);
19884 /* The named classes are such that the inverted number is one
19885 * larger than the non-inverted one */
19886 already_inverted = namedclass - classnum_to_namedclass(classnum);
19888 /* Create an inversion list of the official property, inverted if
19889 * the constructed node list is inverted, and restricted to only
19890 * the above latin1 code points, which are the only ones known at
19892 _invlist_intersection_maybe_complement_2nd(
19894 PL_XPosix_ptrs[classnum],
19896 &class_above_latin1);
19897 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
19898 SvREFCNT_dec_NN(class_above_latin1);
19900 if (are_equivalent) {
19902 /* Resolve the run-time inversion flag with this possibly
19903 * inverted class */
19904 *invert = *invert ^ already_inverted;
19906 op = POSIXL + *invert * (NPOSIXL - POSIXL);
19907 *ret = reg_node(pRExC_state, op);
19908 FLAGS(REGNODE_p(*ret)) = classnum;
19914 /* khw can't think of any other possible transformation involving these. */
19915 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19919 if (! has_runtime_dependency) {
19921 /* If the list is empty, nothing matches. This happens, for example,
19922 * when a Unicode property that doesn't match anything is the only
19923 * element in the character class (perluniprops.pod notes such
19925 if (partial_cp_count == 0) {
19930 goto return_OPFAIL;
19934 /* If matches everything but \n */
19935 if ( start[0] == 0 && end[0] == '\n' - 1
19936 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19938 assert (! *invert);
19940 *ret = reg_node(pRExC_state, op);
19946 /* Next see if can optimize classes that contain just a few code points
19947 * into an EXACTish node. The reason to do this is to let the optimizer
19948 * join this node with adjacent EXACTish ones, and ANYOF nodes require
19949 * runtime conversion to code point from UTF-8, which we'd like to avoid.
19951 * An EXACTFish node can be generated even if not under /i, and vice versa.
19952 * But care must be taken. An EXACTFish node has to be such that it only
19953 * matches precisely the code points in the class, but we want to generate
19954 * the least restrictive one that does that, to increase the odds of being
19955 * able to join with an adjacent node. For example, if the class contains
19956 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
19957 * from matching. Whether we are under /i or not is irrelevant in this
19958 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
19959 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
19960 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
19961 * is no simple fold that includes \X{02BC}, there is a multi-char fold
19962 * that does, and so the node generated for it must be an EXACTFish one.
19963 * On the other hand qr/:/i should generate a plain EXACT node since the
19964 * colon participates in no fold whatsoever, and having it be EXACT tells
19965 * the optimizer the target string cannot match unless it has a colon in
19970 /* Only try if there are no more code points in the class than in
19971 * the max possible fold */
19972 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19974 /* We can always make a single code point class into an EXACTish node.
19976 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
19979 /* Here is /l: Use EXACTL, except if there is a fold not known
19980 * until runtime so shows as only a single code point here.
19981 * For code points above 255, we know which can cause problems
19982 * by having a potential fold to the Latin1 range. */
19984 || ( lowest_cp > 255
19985 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
19993 else if (! FOLD) { /* Not /l and not /i */
19994 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
19996 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
19999 /* Under /i, it gets a little tricky. A code point that
20000 * doesn't participate in a fold should be an EXACT node. We
20001 * know this one isn't the result of a simple fold, or there'd
20002 * be more than one code point in the list, but it could be
20003 * part of a multi-character fold. In that case we better not
20004 * create an EXACT node, as we would wrongly be telling the
20005 * optimizer that this code point must be in the target string,
20006 * and that is wrong. This is because if the sequence around
20007 * this code point forms a multi-char fold, what needs to be in
20008 * the string could be the code point that folds to the
20011 * This handles the case of below-255 code points, as we have
20012 * an easy look up for those. The next clause handles the
20014 op = IS_IN_SOME_FOLD_L1(lowest_cp)
20018 else { /* /i, larger code point. Since we are under /i, and have
20019 just this code point, we know that it can't fold to
20020 something else, so PL_InMultiCharFold applies to it */
20021 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
20028 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
20029 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
20031 /* Here, the only runtime dependency, if any, is from /d, and the
20032 * class matches more than one code point, and the lowest code
20033 * point participates in some fold. It might be that the other
20034 * code points are /i equivalent to this one, and hence they would
20035 * be representable by an EXACTFish node. Above, we eliminated
20036 * classes that contain too many code points to be EXACTFish, with
20037 * the test for MAX_FOLD_FROMS
20039 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
20040 * do this because we have EXACTFAA at our disposal for the ASCII
20042 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
20044 /* The only ASCII characters that participate in folds are
20046 assert(isALPHA(lowest_cp));
20047 if ( end[0] == start[0] /* First range is a single
20048 character, so 2nd exists */
20049 && isALPHA_FOLD_EQ(start[0], start[1]))
20051 /* Here, is part of an ASCII fold pair */
20053 if ( ASCII_FOLD_RESTRICTED
20054 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
20056 /* If the second clause just above was true, it means
20057 * we can't be under /i, or else the list would have
20058 * included more than this fold pair. Therefore we
20059 * have to exclude the possibility of whatever else it
20060 * is that folds to these, by using EXACTFAA */
20063 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
20065 /* Here, there's no simple fold that lowest_cp is part
20066 * of, but there is a multi-character one. If we are
20067 * not under /i, we want to exclude that possibility;
20068 * if under /i, we want to include it */
20069 op = (FOLD) ? EXACTFU : EXACTFAA;
20073 /* Here, the only possible fold lowest_cp particpates in
20074 * is with start[1]. /i or not isn't relevant */
20078 value = toFOLD(lowest_cp);
20081 else if ( ! upper_latin1_only_utf8_matches
20082 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
20084 invlist_highest(upper_latin1_only_utf8_matches)]
20087 /* Here, the smallest character is non-ascii or there are more
20088 * than 2 code points matched by this node. Also, we either
20089 * don't have /d UTF-8 dependent matches, or if we do, they
20090 * look like they could be a single character that is the fold
20091 * of the lowest one is in the always-match list. This test
20092 * quickly excludes most of the false positives when there are
20093 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
20094 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
20095 * iff the target string is UTF-8. (We don't have to worry
20096 * above about exceeding the array bounds of PL_fold_latin1[]
20097 * because any code point in 'upper_latin1_only_utf8_matches'
20100 * EXACTFAA would apply only to pairs (hence exactly 2 code
20101 * points) in the ASCII range, so we can't use it here to
20102 * artificially restrict the fold domain, so we check if the
20103 * class does or does not match some EXACTFish node. Further,
20104 * if we aren't under /i, and and the folded-to character is
20105 * part of a multi-character fold, we can't do this
20106 * optimization, as the sequence around it could be that
20107 * multi-character fold, and we don't here know the context, so
20108 * we have to assume it is that multi-char fold, to prevent
20111 * To do the general case, we first find the fold of the lowest
20112 * code point (which may be higher than that lowest unfolded
20113 * one), then find everything that folds to it. (The data
20114 * structure we have only maps from the folded code points, so
20115 * we have to do the earlier step.) */
20118 U8 foldbuf[UTF8_MAXBYTES_CASE];
20119 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
20121 const U32 * remaining_folds;
20122 Size_t folds_to_this_cp_count = _inverse_folds(
20126 Size_t folds_count = folds_to_this_cp_count + 1;
20127 SV * fold_list = _new_invlist(folds_count);
20130 /* If there are UTF-8 dependent matches, create a temporary
20131 * list of what this node matches, including them. */
20132 SV * all_cp_list = NULL;
20133 SV ** use_this_list = &cp_list;
20135 if (upper_latin1_only_utf8_matches) {
20136 all_cp_list = _new_invlist(0);
20137 use_this_list = &all_cp_list;
20138 _invlist_union(cp_list,
20139 upper_latin1_only_utf8_matches,
20143 /* Having gotten everything that participates in the fold
20144 * containing the lowest code point, we turn that into an
20145 * inversion list, making sure everything is included. */
20146 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
20147 fold_list = add_cp_to_invlist(fold_list, folded);
20148 if (folds_to_this_cp_count > 0) {
20149 fold_list = add_cp_to_invlist(fold_list, first_fold);
20150 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
20151 fold_list = add_cp_to_invlist(fold_list,
20152 remaining_folds[i]);
20156 /* If the fold list is identical to what's in this ANYOF node,
20157 * the node can be represented by an EXACTFish one instead */
20158 if (_invlistEQ(*use_this_list, fold_list,
20159 0 /* Don't complement */ )
20162 /* But, we have to be careful, as mentioned above. Just
20163 * the right sequence of characters could match this if it
20164 * is part of a multi-character fold. That IS what we want
20165 * if we are under /i. But it ISN'T what we want if not
20166 * under /i, as it could match when it shouldn't. So, when
20167 * we aren't under /i and this character participates in a
20168 * multi-char fold, we don't optimize into an EXACTFish
20169 * node. So, for each case below we have to check if we
20170 * are folding, and if not, if it is not part of a
20171 * multi-char fold. */
20172 if (lowest_cp > 255) { /* Highish code point */
20173 if (FOLD || ! _invlist_contains_cp(
20174 PL_InMultiCharFold, folded))
20178 : (ASCII_FOLD_RESTRICTED)
20183 } /* Below, the lowest code point < 256 */
20186 && DEPENDS_SEMANTICS)
20187 { /* An EXACTF node containing a single character 's',
20188 can be an EXACTFU if it doesn't get joined with an
20190 op = EXACTFU_S_EDGE;
20194 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
20196 if (upper_latin1_only_utf8_matches) {
20199 /* We can't use the fold, as that only matches
20203 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
20205 { /* EXACTFUP is a special node for this character */
20206 op = (ASCII_FOLD_RESTRICTED)
20209 value = MICRO_SIGN;
20211 else if ( ASCII_FOLD_RESTRICTED
20212 && ! isASCII(lowest_cp))
20213 { /* For ASCII under /iaa, we can use EXACTFU below
20225 SvREFCNT_dec_NN(fold_list);
20226 SvREFCNT_dec(all_cp_list);
20233 /* Here, we have calculated what EXACTish node to use. Have to
20234 * convert to UTF-8 if not already there */
20237 SvREFCNT_dec(cp_list);;
20238 REQUIRE_UTF8(flagp);
20241 /* This is a kludge to the special casing issues with this
20242 * ligature under /aa. FB05 should fold to FB06, but the call
20243 * above to _to_uni_fold_flags() didn't find this, as it didn't
20244 * use the /aa restriction in order to not miss other folds
20245 * that would be affected. This is the only instance likely to
20246 * ever be a problem in all of Unicode. So special case it. */
20247 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
20248 && ASCII_FOLD_RESTRICTED)
20250 value = LATIN_SMALL_LIGATURE_ST;
20254 len = (UTF) ? UVCHR_SKIP(value) : 1;
20256 *ret = REGNODE_GUTS(pRExC_state, op, len);
20257 FILL_NODE(*ret, op);
20258 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
20259 setSTR_LEN(REGNODE_p(*ret), len);
20261 *STRINGs(REGNODE_p(*ret)) = (U8) value;
20264 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
20271 if (! has_runtime_dependency) {
20273 /* See if this can be turned into an ANYOFM node. Think about the bit
20274 * patterns in two different bytes. In some positions, the bits in
20275 * each will be 1; and in other positions both will be 0; and in some
20276 * positions the bit will be 1 in one byte, and 0 in the other. Let
20277 * 'n' be the number of positions where the bits differ. We create a
20278 * mask which has exactly 'n' 0 bits, each in a position where the two
20279 * bytes differ. Now take the set of all bytes that when ANDed with
20280 * the mask yield the same result. That set has 2**n elements, and is
20281 * representable by just two 8 bit numbers: the result and the mask.
20282 * Importantly, matching the set can be vectorized by creating a word
20283 * full of the result bytes, and a word full of the mask bytes,
20284 * yielding a significant speed up. Here, see if this node matches
20285 * such a set. As a concrete example consider [01], and the byte
20286 * representing '0' which is 0x30 on ASCII machines. It has the bits
20287 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
20288 * that mask we get 0x30. Any other bytes ANDed yield something else.
20289 * So [01], which is a common usage, is optimizable into ANYOFM, and
20290 * can benefit from the speed up. We can only do this on UTF-8
20291 * invariant bytes, because they have the same bit patterns under UTF-8
20293 PERL_UINT_FAST8_T inverted = 0;
20295 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
20297 const PERL_UINT_FAST8_T max_permissible
20298 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
20300 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
20301 * that works we will instead later generate an NANYOFM, and invert
20302 * back when through */
20303 if (highest_cp > max_permissible) {
20304 _invlist_invert(cp_list);
20308 if (invlist_highest(cp_list) <= max_permissible) {
20309 UV this_start, this_end;
20310 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
20311 U8 bits_differing = 0;
20312 Size_t full_cp_count = 0;
20313 bool first_time = TRUE;
20315 /* Go through the bytes and find the bit positions that differ */
20316 invlist_iterinit(cp_list);
20317 while (invlist_iternext(cp_list, &this_start, &this_end)) {
20318 unsigned int i = this_start;
20321 if (! UVCHR_IS_INVARIANT(i)) {
20325 first_time = FALSE;
20326 lowest_cp = this_start;
20328 /* We have set up the code point to compare with. Don't
20329 * compare it with itself */
20333 /* Find the bit positions that differ from the lowest code
20334 * point in the node. Keep track of all such positions by
20336 for (; i <= this_end; i++) {
20337 if (! UVCHR_IS_INVARIANT(i)) {
20341 bits_differing |= i ^ lowest_cp;
20344 full_cp_count += this_end - this_start + 1;
20347 /* At the end of the loop, we count how many bits differ from the
20348 * bits in lowest code point, call the count 'd'. If the set we
20349 * found contains 2**d elements, it is the closure of all code
20350 * points that differ only in those bit positions. To convince
20351 * yourself of that, first note that the number in the closure must
20352 * be a power of 2, which we test for. The only way we could have
20353 * that count and it be some differing set, is if we got some code
20354 * points that don't differ from the lowest code point in any
20355 * position, but do differ from each other in some other position.
20356 * That means one code point has a 1 in that position, and another
20357 * has a 0. But that would mean that one of them differs from the
20358 * lowest code point in that position, which possibility we've
20359 * already excluded. */
20360 if ( (inverted || full_cp_count > 1)
20361 && full_cp_count == 1U << PL_bitcount[bits_differing])
20365 op = ANYOFM + inverted;;
20367 /* We need to make the bits that differ be 0's */
20368 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
20370 /* The argument is the lowest code point */
20371 *ret = reganode(pRExC_state, op, lowest_cp);
20372 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
20376 invlist_iterfinish(cp_list);
20380 _invlist_invert(cp_list);
20387 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
20388 * were invariants, it wasn't inverted, and there is a single range.
20389 * This would be faster than some of the posix nodes we create below
20390 * like /\d/a, but would be twice the size. Without having actually
20391 * measured the gain, khw doesn't think the tradeoff is really worth it
20395 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
20396 PERL_UINT_FAST8_T type;
20397 SV * intersection = NULL;
20398 SV* d_invlist = NULL;
20400 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
20401 * ones are about the same speed as ANYOF ops, but take less room; the
20402 * ones that have above-Latin1 code point matches are somewhat faster
20405 for (type = POSIXA; type >= POSIXD; type--) {
20408 if (type == POSIXL) { /* But not /l posix classes */
20412 for (posix_class = 0;
20413 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
20416 SV** our_code_points = &cp_list;
20417 SV** official_code_points;
20420 if (type == POSIXA) {
20421 official_code_points = &PL_Posix_ptrs[posix_class];
20424 official_code_points = &PL_XPosix_ptrs[posix_class];
20427 /* Skip non-existent classes of this type. e.g. \v only has an
20428 * entry in PL_XPosix_ptrs */
20429 if (! *official_code_points) {
20433 /* Try both the regular class, and its inversion */
20434 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
20435 bool this_inverted = *invert ^ try_inverted;
20437 if (type != POSIXD) {
20439 /* This class that isn't /d can't match if we have /d
20441 if (has_runtime_dependency
20442 & HAS_D_RUNTIME_DEPENDENCY)
20447 else /* is /d */ if (! this_inverted) {
20449 /* /d classes don't match anything non-ASCII below 256
20450 * unconditionally (which cp_list contains) */
20451 _invlist_intersection(cp_list, PL_UpperLatin1,
20453 if (_invlist_len(intersection) != 0) {
20457 SvREFCNT_dec(d_invlist);
20458 d_invlist = invlist_clone(cp_list, NULL);
20460 /* But under UTF-8 it turns into using /u rules. Add
20461 * the things it matches under these conditions so that
20462 * we check below that these are identical to what the
20463 * tested class should match */
20464 if (upper_latin1_only_utf8_matches) {
20467 upper_latin1_only_utf8_matches,
20470 our_code_points = &d_invlist;
20472 else { /* POSIXD, inverted. If this doesn't have this
20473 flag set, it isn't /d. */
20474 if (! ( *anyof_flags
20475 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
20480 our_code_points = &cp_list;
20483 /* Here, have weeded out some things. We want to see if
20484 * the list of characters this node contains
20485 * ('*our_code_points') precisely matches those of the
20486 * class we are currently checking against
20487 * ('*official_code_points'). */
20488 if (_invlistEQ(*our_code_points,
20489 *official_code_points,
20492 /* Here, they precisely match. Optimize this ANYOF
20493 * node into its equivalent POSIX one of the correct
20494 * type, possibly inverted.
20496 * Some of these nodes match a single range of
20497 * characters (or [:alpha:] matches two parallel ranges
20498 * on ASCII platforms). The array lookup at execution
20499 * time could be replaced by a range check for such
20500 * nodes. But regnodes are a finite resource, and the
20501 * possible performance boost isn't large, so this
20502 * hasn't been done. An attempt to use just one node
20503 * (and its inverse) to encompass all such cases was
20504 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
20505 * But the shifting/masking it used ended up being
20506 * slower than the array look up, so it was reverted */
20507 op = (try_inverted)
20508 ? type + NPOSIXA - POSIXA
20510 *ret = reg_node(pRExC_state, op);
20511 FLAGS(REGNODE_p(*ret)) = posix_class;
20512 SvREFCNT_dec(d_invlist);
20513 SvREFCNT_dec(intersection);
20519 SvREFCNT_dec(d_invlist);
20520 SvREFCNT_dec(intersection);
20523 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
20524 * in size and speed. Currently, a 20 bit range base (smallest code point
20525 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
20526 * This allows for using it on all of the Unicode code points except for
20527 * the highest plane, which is only for private use code points. khw
20528 * doubts that a bigger delta is likely in real world applications */
20530 && ! has_runtime_dependency
20531 && *anyof_flags == 0
20532 && start[0] < (1 << ANYOFR_BASE_BITS)
20533 && end[0] - start[0]
20534 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
20535 * CHARBITS - ANYOFR_BASE_BITS))))
20538 U8 low_utf8[UTF8_MAXBYTES+1];
20539 U8 high_utf8[UTF8_MAXBYTES+1];
20542 *ret = reganode(pRExC_state, op,
20543 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
20545 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
20546 * efficient ruling out at run time of many possible inputs. */
20547 (void) uvchr_to_utf8(low_utf8, start[0]);
20548 (void) uvchr_to_utf8(high_utf8, end[0]);
20550 /* If all code points share the same first byte, this can be an
20551 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
20552 * quickly rule out many inputs at run-time without having to compute
20553 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
20554 * transformation would not rule out nearly so many things */
20555 if (low_utf8[0] == high_utf8[0]) {
20557 OP(REGNODE_p(*ret)) = op;
20558 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
20561 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
20567 /* If didn't find an optimization and there is no need for a bitmap,
20568 * of the lowest code points, optimize to indicate that */
20569 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
20571 && ! upper_latin1_only_utf8_matches
20572 && *anyof_flags == 0)
20574 U8 low_utf8[UTF8_MAXBYTES+1];
20575 UV highest_cp = invlist_highest(cp_list);
20577 /* Currently the maximum allowed code point by the system is IV_MAX.
20578 * Higher ones are reserved for future internal use. This particular
20579 * regnode can be used for higher ones, but we can't calculate the code
20580 * point of those. IV_MAX suffices though, as it will be a large first
20582 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
20585 /* We store the lowest possible first byte of the UTF-8 representation,
20586 * using the flags field. This allows for quick ruling out of some
20587 * inputs without having to convert from UTF-8 to code point. For
20588 * EBCDIC, we use I8, as not doing that transformation would not rule
20589 * out nearly so many things */
20590 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
20594 /* If the first UTF-8 start byte for the highest code point in the
20595 * range is suitably small, we may be able to get an upper bound as
20597 if (highest_cp <= IV_MAX) {
20598 U8 high_utf8[UTF8_MAXBYTES+1];
20599 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
20601 /* If the lowest and highest are the same, we can get an exact
20602 * first byte instead of a just minimum or even a sequence of exact
20603 * leading bytes. We signal these with different regnodes */
20604 if (low_utf8[0] == high_utf8[0]) {
20605 Size_t len = find_first_differing_byte_pos(low_utf8,
20607 MIN(low_len, high_len));
20610 /* No need to convert to I8 for EBCDIC as this is an exact
20612 *anyof_flags = low_utf8[0];
20614 if (high_len == 2) {
20615 /* If the elements matched all have a 2-byte UTF-8
20616 * representation, with the first byte being the same,
20617 * we can use a compact, fast regnode. capable of
20618 * matching any combination of continuation byte
20621 * (A similar regnode could be created for the Latin1
20622 * range; the complication being that it could match
20623 * non-UTF8 targets. The internal bitmap would serve
20624 * both cases; with some extra code in regexec.c) */
20626 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
20627 FILL_NODE(*ret, op);
20628 ((struct regnode_bbm *) REGNODE_p(*ret))->first_byte = low_utf8[0],
20630 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
20631 * directly based on the continuation byte, without
20632 * needing to convert to code point */
20633 populate_bitmap_from_invlist(
20636 /* The base code point is from the start byte */
20637 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
20638 UTF_CONTINUATION_MARK | 0),
20640 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
20641 REGNODE_BBM_BITMAP_LEN);
20642 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
20651 *ret = REGNODE_GUTS(pRExC_state, op,
20652 REGNODE_ARG_LEN(op) + STR_SZ(len));
20653 FILL_NODE(*ret, op);
20654 ((struct regnode_anyofhs *) REGNODE_p(*ret))->str_len
20656 Copy(low_utf8, /* Add the common bytes */
20657 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
20659 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
20660 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
20661 NULL, only_utf8_locale_list);
20665 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
20667 /* Here, the high byte is not the same as the low, but is small
20668 * enough that its reasonable to have a loose upper bound,
20669 * which is packed in with the strict lower bound. See
20670 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
20671 * platforms, I8 is used. On ASCII platforms I8 is the same
20672 * thing as UTF-8 */
20675 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
20676 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
20679 if (range_diff <= max_range_diff / 8) {
20682 else if (range_diff <= max_range_diff / 4) {
20685 else if (range_diff <= max_range_diff / 2) {
20688 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
20698 *ret = reganode(pRExC_state, op, 0);
20703 *ret = reg_node(pRExC_state, op);
20708 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
20711 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
20712 regnode* const node,
20714 SV* const runtime_defns,
20715 SV* const only_utf8_locale_list)
20717 /* Sets the arg field of an ANYOF-type node 'node', using information about
20718 * the node passed-in. If only the bitmap is needed to determine what
20719 * matches, the arg is set appropriately to either
20720 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
20721 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
20723 * Otherwise, it sets the argument to the count returned by add_data(),
20724 * having allocated and stored an array, av, as follows:
20725 * av[0] stores the inversion list defining this class as far as known at
20726 * this time, or PL_sv_undef if nothing definite is now known.
20727 * av[1] stores the inversion list of code points that match only if the
20728 * current locale is UTF-8, or if none, PL_sv_undef if there is an
20729 * av[2], or no entry otherwise.
20730 * av[2] stores the list of user-defined properties whose subroutine
20731 * definitions aren't known at this time, or no entry if none. */
20735 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20737 /* If this is set, the final disposition won't be known until runtime, so
20738 * we can't do any of the compile time optimizations */
20739 if (! runtime_defns) {
20741 /* On plain ANYOF nodes without the possibility of a runtime locale
20742 * making a difference, maybe there's no information to be gleaned
20743 * except for what's in the bitmap */
20744 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
20746 /* There are two such cases:
20747 * 1) there is no list of code points matched outside the bitmap
20750 ARG_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
20754 /* 2) the list indicates everything outside the bitmap matches */
20755 if ( invlist_highest(cp_list) == UV_MAX
20756 && invlist_highest_range_start(cp_list)
20757 <= NUM_ANYOF_CODE_POINTS)
20759 ARG_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
20763 /* In all other cases there are things outside the bitmap that we
20764 * may need to check at runtime. */
20767 /* Here, we have resolved all the possible run-time matches, and they
20768 * are stored in one or both of two possible lists. (While some match
20769 * only under certain runtime circumstances, we know all the possible
20770 * ones for each such circumstance.)
20772 * It may very well be that the pattern being compiled contains an
20773 * identical class, already encountered. Reusing that class here saves
20774 * space. Look through all classes so far encountered. */
20775 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
20776 for (unsigned int i = 0; i < existing_items; i++) {
20778 /* Only look at auxiliary data of this type */
20779 if (RExC_rxi->data->what[i] != 's') {
20783 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
20784 AV * const av = MUTABLE_AV(SvRV(rv));
20786 /* If the already encountered class has data that won't be known
20787 * until runtime (stored in the final element of the array), we
20789 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
20793 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
20794 false /* no lvalue */);
20796 /* The new and the existing one both have to have or both not
20797 * have this element, for this one to duplicate that one */
20798 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
20802 /* If the inversion lists aren't equivalent, can't share */
20803 if (cp_list && ! _invlistEQ(cp_list,
20804 *stored_cp_list_ptr,
20805 FALSE /* don't complement */))
20810 /* Similarly for the other list */
20811 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
20813 ONLY_LOCALE_MATCHES_INDEX,
20814 false /* no lvalue */);
20815 if ( cBOOL(only_utf8_locale_list)
20816 != cBOOL(stored_only_utf8_locale_list_ptr))
20821 if (only_utf8_locale_list && ! _invlistEQ(
20822 only_utf8_locale_list,
20823 *stored_only_utf8_locale_list_ptr,
20824 FALSE /* don't complement */))
20829 /* Here, the existence and contents of both compile-time lists
20830 * are identical between the new and existing data. Re-use the
20834 } /* end of loop through existing classes */
20837 /* Here, we need to create a new auxiliary data element; either because
20838 * this doesn't duplicate an existing one, or we can't tell at this time if
20839 * it eventually will */
20841 AV * const av = newAV();
20845 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20848 /* (Note that if any of this changes, the size calculations in
20849 * S_optimize_regclass() might need to be updated.) */
20851 if (only_utf8_locale_list) {
20852 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20853 SvREFCNT_inc_NN(only_utf8_locale_list));
20856 if (runtime_defns) {
20857 av_store(av, DEFERRED_USER_DEFINED_INDEX,
20858 SvREFCNT_inc_NN(runtime_defns));
20861 rv = newRV_noinc(MUTABLE_SV(av));
20862 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20863 RExC_rxi->data->data[n] = (void*)rv;
20869 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20870 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20872 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20876 /* For internal core use only.
20877 * Returns the inversion list for the input 'node' in the regex 'prog'.
20878 * If <doinit> is 'true', will attempt to create the inversion list if not
20879 * already done. If it is created, it will add to the normal inversion
20880 * list any that comes from user-defined properties. It croaks if this
20881 * is called before such a list is ready to be generated, that is when a
20882 * user-defined property has been declared, buyt still not yet defined.
20883 * If <listsvp> is non-null, will return the printable contents of the
20884 * property definition. This can be used to get debugging information
20885 * even before the inversion list exists, by calling this function with
20886 * 'doinit' set to false, in which case the components that will be used
20887 * to eventually create the inversion list are returned (in a printable
20889 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20890 * store an inversion list of code points that should match only if the
20891 * execution-time locale is a UTF-8 one.
20892 * If <output_invlist> is not NULL, it is where this routine is to store an
20893 * inversion list of the code points that would be instead returned in
20894 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20895 * when this parameter is used, is just the non-code point data that
20896 * will go into creating the inversion list. This currently should be just
20897 * user-defined properties whose definitions were not known at compile
20898 * time. Using this parameter allows for easier manipulation of the
20899 * inversion list's data by the caller. It is illegal to call this
20900 * function with this parameter set, but not <listsvp>
20902 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20903 * that, in spite of this function's name, the inversion list it returns
20904 * may include the bitmap data as well */
20906 SV *si = NULL; /* Input initialization string */
20907 SV* invlist = NULL;
20909 RXi_GET_DECL_NULL(prog, progi);
20910 const struct reg_data * const data = prog ? progi->data : NULL;
20912 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20913 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
20915 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
20917 assert(! output_invlist || listsvp);
20919 if (data && data->count) {
20920 const U32 n = ARG(node);
20922 if (data->what[n] == 's') {
20923 SV * const rv = MUTABLE_SV(data->data[n]);
20924 AV * const av = MUTABLE_AV(SvRV(rv));
20925 SV **const ary = AvARRAY(av);
20927 invlist = ary[INVLIST_INDEX];
20929 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20930 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20933 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20934 si = ary[DEFERRED_USER_DEFINED_INDEX];
20937 if (doinit && (si || invlist)) {
20940 SV * msg = newSVpvs_flags("", SVs_TEMP);
20942 SV * prop_definition = handle_user_defined_property(
20943 "", 0, FALSE, /* There is no \p{}, \P{} */
20944 SvPVX_const(si)[1] - '0', /* /i or not has been
20945 stored here for just
20947 TRUE, /* run time */
20948 FALSE, /* This call must find the defn */
20949 si, /* The property definition */
20952 0 /* base level call */
20956 assert(prop_definition == NULL);
20958 Perl_croak(aTHX_ "%" UTF8f,
20959 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20963 _invlist_union(invlist, prop_definition, &invlist);
20964 SvREFCNT_dec_NN(prop_definition);
20967 invlist = prop_definition;
20970 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20971 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20973 ary[INVLIST_INDEX] = invlist;
20974 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20975 ? ONLY_LOCALE_MATCHES_INDEX
20983 /* If requested, return a printable version of what this ANYOF node matches
20986 SV* matches_string = NULL;
20988 /* This function can be called at compile-time, before everything gets
20989 * resolved, in which case we return the currently best available
20990 * information, which is the string that will eventually be used to do
20991 * that resolving, 'si' */
20993 /* Here, we only have 'si' (and possibly some passed-in data in
20994 * 'invlist', which is handled below) If the caller only wants
20995 * 'si', use that. */
20996 if (! output_invlist) {
20997 matches_string = newSVsv(si);
21000 /* But if the caller wants an inversion list of the node, we
21001 * need to parse 'si' and place as much as possible in the
21002 * desired output inversion list, making 'matches_string' only
21003 * contain the currently unresolvable things */
21004 const char *si_string = SvPVX(si);
21005 STRLEN remaining = SvCUR(si);
21009 /* Ignore everything before and including the first new-line */
21010 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
21011 assert (si_string != NULL);
21013 remaining = SvPVX(si) + SvCUR(si) - si_string;
21015 while (remaining > 0) {
21017 /* The data consists of just strings defining user-defined
21018 * property names, but in prior incarnations, and perhaps
21019 * somehow from pluggable regex engines, it could still
21020 * hold hex code point definitions, all of which should be
21021 * legal (or it wouldn't have gotten this far). Each
21022 * component of a range would be separated by a tab, and
21023 * each range by a new-line. If these are found, instead
21024 * add them to the inversion list */
21025 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
21026 |PERL_SCAN_SILENT_NON_PORTABLE;
21027 STRLEN len = remaining;
21028 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
21030 /* If the hex decode routine found something, it should go
21031 * up to the next \n */
21032 if ( *(si_string + len) == '\n') {
21033 if (count) { /* 2nd code point on line */
21034 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
21037 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
21040 goto prepare_for_next_iteration;
21043 /* If the hex decode was instead for the lower range limit,
21044 * save it, and go parse the upper range limit */
21045 if (*(si_string + len) == '\t') {
21046 assert(count == 0);
21050 prepare_for_next_iteration:
21051 si_string += len + 1;
21052 remaining -= len + 1;
21056 /* Here, didn't find a legal hex number. Just add the text
21057 * from here up to the next \n, omitting any trailing
21061 len = strcspn(si_string,
21062 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
21064 if (matches_string) {
21065 sv_catpvn(matches_string, si_string, len);
21068 matches_string = newSVpvn(si_string, len);
21070 sv_catpvs(matches_string, " ");
21074 && UCHARAT(si_string)
21075 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
21080 if (remaining && UCHARAT(si_string) == '\n') {
21084 } /* end of loop through the text */
21086 assert(matches_string);
21087 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
21088 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
21090 } /* end of has an 'si' */
21093 /* Add the stuff that's already known */
21096 /* Again, if the caller doesn't want the output inversion list, put
21097 * everything in 'matches-string' */
21098 if (! output_invlist) {
21099 if ( ! matches_string) {
21100 matches_string = newSVpvs("\n");
21102 sv_catsv(matches_string, invlist_contents(invlist,
21103 TRUE /* traditional style */
21106 else if (! *output_invlist) {
21107 *output_invlist = invlist_clone(invlist, NULL);
21110 _invlist_union(*output_invlist, invlist, output_invlist);
21114 *listsvp = matches_string;
21120 /* reg_skipcomment()
21122 Absorbs an /x style # comment from the input stream,
21123 returning a pointer to the first character beyond the comment, or if the
21124 comment terminates the pattern without anything following it, this returns
21125 one past the final character of the pattern (in other words, RExC_end) and
21126 sets the REG_RUN_ON_COMMENT_SEEN flag.
21128 Note it's the callers responsibility to ensure that we are
21129 actually in /x mode
21133 PERL_STATIC_INLINE char*
21134 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
21136 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
21140 while (p < RExC_end) {
21141 if (*(++p) == '\n') {
21146 /* we ran off the end of the pattern without ending the comment, so we have
21147 * to add an \n when wrapping */
21148 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
21153 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
21155 const bool force_to_xmod
21158 /* If the text at the current parse position '*p' is a '(?#...)' comment,
21159 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
21160 * is /x whitespace, advance '*p' so that on exit it points to the first
21161 * byte past all such white space and comments */
21163 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
21165 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
21167 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
21170 if (RExC_end - (*p) >= 3
21172 && *(*p + 1) == '?'
21173 && *(*p + 2) == '#')
21175 while (*(*p) != ')') {
21176 if ((*p) == RExC_end)
21177 FAIL("Sequence (?#... not terminated");
21185 const char * save_p = *p;
21186 while ((*p) < RExC_end) {
21188 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
21191 else if (*(*p) == '#') {
21192 (*p) = reg_skipcomment(pRExC_state, (*p));
21198 if (*p != save_p) {
21211 Advances the parse position by one byte, unless that byte is the beginning
21212 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
21213 those two cases, the parse position is advanced beyond all such comments and
21216 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
21220 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
21222 PERL_ARGS_ASSERT_NEXTCHAR;
21224 if (RExC_parse < RExC_end) {
21226 || UTF8_IS_INVARIANT(*RExC_parse)
21227 || UTF8_IS_START(*RExC_parse));
21229 RExC_parse_inc_safe();
21231 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
21232 FALSE /* Don't force /x */ );
21237 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
21239 /* 'size' is the delta number of smallest regnode equivalents to add or
21240 * subtract from the current memory allocated to the regex engine being
21243 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
21248 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
21249 /* +1 for REG_MAGIC */
21252 if ( RExC_rxi == NULL )
21253 FAIL("Regexp out of space");
21254 RXi_SET(RExC_rx, RExC_rxi);
21256 RExC_emit_start = RExC_rxi->program;
21258 Zero(REGNODE_p(RExC_emit), size, regnode);
21262 STATIC regnode_offset
21263 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
21265 /* Allocate a regnode that is (1 + extra_size) times as big as the
21266 * smallest regnode worth of space, and also aligns and increments
21267 * RExC_size appropriately.
21269 * It returns the regnode's offset into the regex engine program */
21271 const regnode_offset ret = RExC_emit;
21273 PERL_ARGS_ASSERT_REGNODE_GUTS;
21275 SIZE_ALIGN(RExC_size);
21276 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
21277 NODE_ALIGN_FILL(REGNODE_p(ret));
21283 STATIC regnode_offset
21284 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
21285 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
21286 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
21287 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
21295 - reg_node - emit a node
21297 STATIC regnode_offset /* Location. */
21298 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
21300 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21301 regnode_offset ptr = ret;
21303 PERL_ARGS_ASSERT_REG_NODE;
21305 assert(REGNODE_ARG_LEN(op) == 0);
21307 FILL_ADVANCE_NODE(ptr, op);
21313 - reganode - emit a node with an argument
21315 STATIC regnode_offset /* Location. */
21316 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
21318 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21319 regnode_offset ptr = ret;
21321 PERL_ARGS_ASSERT_REGANODE;
21323 /* ANYOF are special cased to allow non-length 1 args */
21324 assert(REGNODE_ARG_LEN(op) == 1);
21326 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
21332 - regpnode - emit a temporary node with a SV* argument
21334 STATIC regnode_offset /* Location. */
21335 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
21337 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21338 regnode_offset ptr = ret;
21340 PERL_ARGS_ASSERT_REGPNODE;
21342 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
21347 STATIC regnode_offset
21348 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
21350 /* emit a node with U32 and I32 arguments */
21352 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21353 regnode_offset ptr = ret;
21355 PERL_ARGS_ASSERT_REG2LANODE;
21357 assert(REGNODE_ARG_LEN(op) == 2);
21359 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
21365 - reginsert - insert an operator in front of already-emitted operand
21367 * That means that on exit 'operand' is the offset of the newly inserted
21368 * operator, and the original operand has been relocated.
21370 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
21371 * set up NEXT_OFF() of the inserted node if needed. Something like this:
21373 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
21374 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
21376 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
21379 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
21380 const regnode_offset operand, const U32 depth)
21385 const int offset = REGNODE_ARG_LEN((U8)op);
21386 const int size = NODE_STEP_REGNODE + offset;
21387 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21389 PERL_ARGS_ASSERT_REGINSERT;
21390 PERL_UNUSED_CONTEXT;
21391 PERL_UNUSED_ARG(depth);
21392 /* (REGNODE_TYPE((U8)op) == CURLY ? EXTRA_STEP_2ARGS : 0); */
21393 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
21394 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
21395 studying. If this is wrong then we need to adjust RExC_recurse
21396 below like we do with RExC_open_parens/RExC_close_parens. */
21397 change_engine_size(pRExC_state, (Ptrdiff_t) size);
21398 src = REGNODE_p(RExC_emit);
21400 dst = REGNODE_p(RExC_emit);
21402 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
21403 * and [perl #133871] shows this can lead to problems, so skip this
21404 * realignment of parens until a later pass when they are reliable */
21405 if (! IN_PARENS_PASS && RExC_open_parens) {
21407 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
21408 /* remember that RExC_npar is rex->nparens + 1,
21409 * iow it is 1 more than the number of parens seen in
21410 * the pattern so far. */
21411 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
21412 /* note, RExC_open_parens[0] is the start of the
21413 * regex, it can't move. RExC_close_parens[0] is the end
21414 * of the regex, it *can* move. */
21415 if ( paren && RExC_open_parens[paren] >= operand ) {
21416 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
21417 RExC_open_parens[paren] += size;
21419 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
21421 if ( RExC_close_parens[paren] >= operand ) {
21422 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
21423 RExC_close_parens[paren] += size;
21425 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
21430 RExC_end_op += size;
21432 while (src > REGNODE_p(operand)) {
21433 StructCopy(--src, --dst, regnode);
21436 place = REGNODE_p(operand); /* Op node, where operand used to be. */
21437 src = place + 1; /* NOT REGNODE_AFTER! */
21439 FILL_NODE(operand, op);
21441 /* Zero out any arguments in the new node */
21442 Zero(src, offset, regnode);
21446 - regtail - set the next-pointer at the end of a node chain of p to val. If
21447 that value won't fit in the space available, instead returns FALSE.
21448 (Except asserts if we can't fit in the largest space the regex
21449 engine is designed for.)
21450 - SEE ALSO: regtail_study
21453 S_regtail(pTHX_ RExC_state_t * pRExC_state,
21454 const regnode_offset p,
21455 const regnode_offset val,
21458 regnode_offset scan;
21459 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21461 PERL_ARGS_ASSERT_REGTAIL;
21463 PERL_UNUSED_ARG(depth);
21466 /* The final node in the chain is the first one with a nonzero next pointer
21468 scan = (regnode_offset) p;
21470 regnode * const temp = regnext(REGNODE_p(scan));
21472 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
21473 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
21474 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
21475 SvPV_nolen_const(RExC_mysv), scan,
21476 (temp == NULL ? "->" : ""),
21477 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
21482 scan = REGNODE_OFFSET(temp);
21485 /* Populate this node's next pointer */
21486 assert(val >= scan);
21487 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
21488 assert((UV) (val - scan) <= U32_MAX);
21489 ARG_SET(REGNODE_p(scan), val - scan);
21492 if (val - scan > U16_MAX) {
21493 /* Populate this with something that won't loop and will likely
21494 * lead to a crash if the caller ignores the failure return, and
21495 * execution continues */
21496 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
21499 NEXT_OFF(REGNODE_p(scan)) = val - scan;
21507 - regtail_study - set the next-pointer at the end of a node chain of p to val.
21508 - Look for optimizable sequences at the same time.
21509 - currently only looks for EXACT chains.
21511 This is experimental code. The idea is to use this routine to perform
21512 in place optimizations on branches and groups as they are constructed,
21513 with the long term intention of removing optimization from study_chunk so
21514 that it is purely analytical.
21516 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
21517 to control which is which.
21519 This used to return a value that was ignored. It was a problem that it is
21520 #ifdef'd to be another function that didn't return a value. khw has changed it
21521 so both currently return a pass/fail return.
21524 /* TODO: All four parms should be const */
21527 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
21528 const regnode_offset val, U32 depth)
21530 regnode_offset scan;
21532 #ifdef EXPERIMENTAL_INPLACESCAN
21535 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21537 PERL_ARGS_ASSERT_REGTAIL_STUDY;
21540 /* Find last node. */
21544 regnode * const temp = regnext(REGNODE_p(scan));
21545 #ifdef EXPERIMENTAL_INPLACESCAN
21546 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
21547 bool unfolded_multi_char; /* Unexamined in this routine */
21548 if (join_exact(pRExC_state, scan, &min,
21549 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
21550 return TRUE; /* Was return EXACT */
21554 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
21555 if (exact == PSEUDO )
21556 exact= OP(REGNODE_p(scan));
21557 else if (exact != OP(REGNODE_p(scan)) )
21560 else if (OP(REGNODE_p(scan)) != NOTHING) {
21565 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
21566 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
21567 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
21568 SvPV_nolen_const(RExC_mysv),
21570 REGNODE_NAME(exact));
21574 scan = REGNODE_OFFSET(temp);
21577 DEBUG_PARSE_MSG("");
21578 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
21579 Perl_re_printf( aTHX_
21580 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
21581 SvPV_nolen_const(RExC_mysv),
21586 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
21587 assert((UV) (val - scan) <= U32_MAX);
21588 ARG_SET(REGNODE_p(scan), val - scan);
21591 if (val - scan > U16_MAX) {
21592 /* Populate this with something that won't loop and will likely
21593 * lead to a crash if the caller ignores the failure return, and
21594 * execution continues */
21595 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
21598 NEXT_OFF(REGNODE_p(scan)) = val - scan;
21601 return TRUE; /* Was 'return exact' */
21606 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
21608 /* Returns an inversion list of all the code points matched by the
21609 * ANYOFM/NANYOFM node 'n' */
21611 SV * cp_list = _new_invlist(-1);
21612 const U8 lowest = (U8) ARG(n);
21615 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
21617 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
21619 /* Starting with the lowest code point, any code point that ANDed with the
21620 * mask yields the lowest code point is in the set */
21621 for (i = lowest; i <= 0xFF; i++) {
21622 if ((i & FLAGS(n)) == ARG(n)) {
21623 cp_list = add_cp_to_invlist(cp_list, i);
21626 /* We know how many code points (a power of two) that are in the
21627 * set. No use looking once we've got that number */
21628 if (count >= needed) break;
21632 if (OP(n) == NANYOFM) {
21633 _invlist_invert(cp_list);
21639 S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
21640 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
21642 SV * cp_list = NULL;
21643 populate_invlist_from_bitmap(
21644 ((struct regnode_bbm *) n)->bitmap,
21645 REGNODE_BBM_BITMAP_LEN * CHARBITS,
21648 /* The base cp is from the start byte plus a zero continuation */
21649 TWO_BYTE_UTF8_TO_NATIVE(((struct regnode_bbm *) n)->first_byte,
21650 UTF_CONTINUATION_MARK | 0));
21655 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
21660 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
21665 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21667 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
21668 if (flags & (1<<bit)) {
21669 if (!set++ && lead)
21670 Perl_re_printf( aTHX_ "%s", lead);
21671 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
21676 Perl_re_printf( aTHX_ "\n");
21678 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
21683 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
21689 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21691 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
21692 if (flags & (1U<<bit)) {
21693 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
21696 if (!set++ && lead)
21697 Perl_re_printf( aTHX_ "%s", lead);
21698 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
21701 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
21702 if (!set++ && lead) {
21703 Perl_re_printf( aTHX_ "%s", lead);
21706 case REGEX_UNICODE_CHARSET:
21707 Perl_re_printf( aTHX_ "UNICODE");
21709 case REGEX_LOCALE_CHARSET:
21710 Perl_re_printf( aTHX_ "LOCALE");
21712 case REGEX_ASCII_RESTRICTED_CHARSET:
21713 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
21715 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
21716 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
21719 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
21725 Perl_re_printf( aTHX_ "\n");
21727 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
21733 Perl_regdump(pTHX_ const regexp *r)
21737 SV * const sv = sv_newmortal();
21738 SV *dsv= sv_newmortal();
21739 RXi_GET_DECL(r, ri);
21740 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21742 PERL_ARGS_ASSERT_REGDUMP;
21744 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
21746 /* Header fields of interest. */
21747 for (i = 0; i < 2; i++) {
21748 if (r->substrs->data[i].substr) {
21749 RE_PV_QUOTED_DECL(s, 0, dsv,
21750 SvPVX_const(r->substrs->data[i].substr),
21751 RE_SV_DUMPLEN(r->substrs->data[i].substr),
21752 PL_dump_re_max_len);
21753 Perl_re_printf( aTHX_
21754 "%s %s%s at %" IVdf "..%" UVuf " ",
21755 i ? "floating" : "anchored",
21757 RE_SV_TAIL(r->substrs->data[i].substr),
21758 (IV)r->substrs->data[i].min_offset,
21759 (UV)r->substrs->data[i].max_offset);
21761 else if (r->substrs->data[i].utf8_substr) {
21762 RE_PV_QUOTED_DECL(s, 1, dsv,
21763 SvPVX_const(r->substrs->data[i].utf8_substr),
21764 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
21766 Perl_re_printf( aTHX_
21767 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
21768 i ? "floating" : "anchored",
21770 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
21771 (IV)r->substrs->data[i].min_offset,
21772 (UV)r->substrs->data[i].max_offset);
21776 if (r->check_substr || r->check_utf8)
21777 Perl_re_printf( aTHX_
21779 ( r->check_substr == r->substrs->data[1].substr
21780 && r->check_utf8 == r->substrs->data[1].utf8_substr
21781 ? "(checking floating" : "(checking anchored"));
21782 if (r->intflags & PREGf_NOSCAN)
21783 Perl_re_printf( aTHX_ " noscan");
21784 if (r->extflags & RXf_CHECK_ALL)
21785 Perl_re_printf( aTHX_ " isall");
21786 if (r->check_substr || r->check_utf8)
21787 Perl_re_printf( aTHX_ ") ");
21789 if (ri->regstclass) {
21790 regprop(r, sv, ri->regstclass, NULL, NULL);
21791 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
21793 if (r->intflags & PREGf_ANCH) {
21794 Perl_re_printf( aTHX_ "anchored");
21795 if (r->intflags & PREGf_ANCH_MBOL)
21796 Perl_re_printf( aTHX_ "(MBOL)");
21797 if (r->intflags & PREGf_ANCH_SBOL)
21798 Perl_re_printf( aTHX_ "(SBOL)");
21799 if (r->intflags & PREGf_ANCH_GPOS)
21800 Perl_re_printf( aTHX_ "(GPOS)");
21801 Perl_re_printf( aTHX_ " ");
21803 if (r->intflags & PREGf_GPOS_SEEN)
21804 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
21805 if (r->intflags & PREGf_SKIP)
21806 Perl_re_printf( aTHX_ "plus ");
21807 if (r->intflags & PREGf_IMPLICIT)
21808 Perl_re_printf( aTHX_ "implicit ");
21809 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
21810 if (r->extflags & RXf_EVAL_SEEN)
21811 Perl_re_printf( aTHX_ "with eval ");
21812 Perl_re_printf( aTHX_ "\n");
21814 regdump_extflags("r->extflags: ", r->extflags);
21815 regdump_intflags("r->intflags: ", r->intflags);
21818 PERL_ARGS_ASSERT_REGDUMP;
21819 PERL_UNUSED_CONTEXT;
21820 PERL_UNUSED_ARG(r);
21821 #endif /* DEBUGGING */
21824 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21827 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
21828 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
21829 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
21830 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
21831 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
21832 || CC_VERTSPACE_ != 15
21833 # error Need to adjust order of anyofs[]
21835 static const char * const anyofs[] = {
21872 - regprop - printable representation of opcode, with run time support
21876 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21880 const U8 op = OP(o);
21881 RXi_GET_DECL(prog, progi);
21882 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21884 PERL_ARGS_ASSERT_REGPROP;
21888 if (op > REGNODE_MAX) { /* regnode.type is unsigned */
21889 if (pRExC_state) { /* This gives more info, if we have it */
21890 FAIL3("panic: corrupted regexp opcode %d > %d",
21891 (int)op, (int)REGNODE_MAX);
21894 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21895 (int)op, (int)REGNODE_MAX);
21898 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
21900 k = REGNODE_TYPE(op);
21903 sv_catpvs(sv, " ");
21904 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21905 * is a crude hack but it may be the best for now since
21906 * we have no flag "this EXACTish node was UTF-8"
21908 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21909 PL_colors[0], PL_colors[1],
21910 PERL_PV_ESCAPE_UNI_DETECT |
21911 PERL_PV_ESCAPE_NONASCII |
21912 PERL_PV_PRETTY_ELLIPSES |
21913 PERL_PV_PRETTY_LTGT |
21914 PERL_PV_PRETTY_NOCLEAR
21916 } else if (k == TRIE) {
21917 /* print the details of the trie in dumpuntil instead, as
21918 * progi->data isn't available here */
21919 const U32 n = ARG(o);
21920 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21921 (reg_ac_data *)progi->data->data[n] :
21923 const reg_trie_data * const trie
21924 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21926 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags));
21927 DEBUG_TRIE_COMPILE_r({
21929 sv_catpvs(sv, "(JUMP)");
21930 Perl_sv_catpvf(aTHX_ sv,
21931 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21932 (UV)trie->startstate,
21933 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21934 (UV)trie->wordcount,
21937 (UV)TRIE_CHARCOUNT(trie),
21938 (UV)trie->uniquecharcount
21941 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21942 sv_catpvs(sv, "[");
21943 (void) put_charclass_bitmap_innards(sv,
21944 ((IS_ANYOF_TRIE(op))
21946 : TRIE_BITMAP(trie)),
21953 sv_catpvs(sv, "]");
21955 } else if (k == CURLY) {
21956 U32 lo = ARG1(o), hi = ARG2(o);
21957 if (op == CURLYM || op == CURLYN || op == CURLYX)
21958 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21959 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21960 if (hi == REG_INFTY)
21961 sv_catpvs(sv, "INFTY");
21963 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21964 sv_catpvs(sv, "}");
21966 else if (k == WHILEM && o->flags) /* Ordinal/of */
21967 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21968 else if (k == REF || k == OPEN || k == CLOSE
21969 || k == GROUPP || op == ACCEPT)
21971 AV *name_list= NULL;
21972 U32 parno= op == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21973 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21974 if ( RXp_PAREN_NAMES(prog) ) {
21975 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21976 } else if ( pRExC_state ) {
21977 name_list= RExC_paren_name_list;
21980 if ( k != REF || (op < REFN)) {
21981 SV **name= av_fetch(name_list, parno, 0 );
21983 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21987 /* parno must always be larger than 0 for this block
21988 * as it represents a slot into the data array, which
21989 * has the 0 slot reserved for a placeholder so any valid
21990 * index into it is always true, eg non-zero
21991 * see the '%' "what" type and the implementation of
21994 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21995 I32 *nums=(I32*)SvPVX(sv_dat);
21996 SV **name= av_fetch(name_list, nums[0], 0 );
21999 for ( n=0; n<SvIVX(sv_dat); n++ ) {
22000 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
22001 (n ? "," : ""), (IV)nums[n]);
22003 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
22007 if ( k == REF && reginfo) {
22008 U32 n = ARG(o); /* which paren pair */
22009 I32 ln = prog->offs[n].start;
22010 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
22011 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
22012 else if (ln == prog->offs[n].end)
22013 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
22015 const char *s = reginfo->strbeg + ln;
22016 Perl_sv_catpvf(aTHX_ sv, ": ");
22017 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
22018 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
22021 } else if (k == GOSUB) {
22022 AV *name_list= NULL;
22023 if ( RXp_PAREN_NAMES(prog) ) {
22024 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
22025 } else if ( pRExC_state ) {
22026 name_list= RExC_paren_name_list;
22029 /* Paren and offset */
22030 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
22031 (int)((o + (int)ARG2L(o)) - progi->program) );
22033 SV **name= av_fetch(name_list, ARG(o), 0 );
22035 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
22038 else if (k == LOGICAL)
22039 /* 2: embedded, otherwise 1 */
22040 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
22041 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
22044 U8 do_sep = 0; /* Do we need to separate various components of the
22046 /* Set if there is still an unresolved user-defined property */
22047 SV *unresolved = NULL;
22049 /* Things that are ignored except when the runtime locale is UTF-8 */
22050 SV *only_utf8_locale_invlist = NULL;
22052 /* Code points that don't fit in the bitmap */
22053 SV *nonbitmap_invlist = NULL;
22055 /* And things that aren't in the bitmap, but are small enough to be */
22056 SV* bitmap_range_not_in_bitmap = NULL;
22065 flags = ANYOF_FLAGS(o);
22066 bitmap = ANYOF_BITMAP(o);
22069 if (op == ANYOFL || op == ANYOFPOSIXL) {
22070 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
22071 sv_catpvs(sv, "{utf8-locale-reqd}");
22073 if (flags & ANYOFL_FOLD) {
22074 sv_catpvs(sv, "{i}");
22078 inverted = flags & ANYOF_INVERT;
22080 /* If there is stuff outside the bitmap, get it */
22083 /* For a single range, split into the parts inside vs outside the
22085 UV start = ANYOFRbase(o);
22086 UV end = ANYOFRbase(o) + ANYOFRdelta(o);
22088 if (start < NUM_ANYOF_CODE_POINTS) {
22089 if (end < NUM_ANYOF_CODE_POINTS) {
22090 bitmap_range_not_in_bitmap
22091 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
22095 bitmap_range_not_in_bitmap
22096 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
22097 start, NUM_ANYOF_CODE_POINTS);
22098 start = NUM_ANYOF_CODE_POINTS;
22102 if (start >= NUM_ANYOF_CODE_POINTS) {
22103 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
22105 ANYOFRbase(o) + ANYOFRdelta(o));
22108 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
22109 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
22110 NUM_ANYOF_CODE_POINTS,
22113 else if (ANYOF_HAS_AUX(o)) {
22114 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
22116 &only_utf8_locale_invlist,
22117 &nonbitmap_invlist);
22119 /* The aux data may contain stuff that could fit in the bitmap.
22120 * This could come from a user-defined property being finally
22121 * resolved when this call was done; or much more likely because
22122 * there are matches that require UTF-8 to be valid, and so aren't
22123 * in the bitmap (or ANYOFR). This is teased apart later */
22124 _invlist_intersection(nonbitmap_invlist,
22126 &bitmap_range_not_in_bitmap);
22127 /* Leave just the things that don't fit into the bitmap */
22128 _invlist_subtract(nonbitmap_invlist,
22130 &nonbitmap_invlist);
22133 /* Ready to start outputting. First, the initial left bracket */
22134 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22137 || bitmap_range_not_in_bitmap
22138 || only_utf8_locale_invlist
22141 /* Then all the things that could fit in the bitmap */
22142 do_sep = put_charclass_bitmap_innards(
22145 bitmap_range_not_in_bitmap,
22146 only_utf8_locale_invlist,
22150 /* Can't try inverting for a
22151 * better display if there
22152 * are things that haven't
22154 (unresolved != NULL || k == ANYOFR));
22155 SvREFCNT_dec(bitmap_range_not_in_bitmap);
22157 /* If there are user-defined properties which haven't been defined
22158 * yet, output them. If the result is not to be inverted, it is
22159 * clearest to output them in a separate [] from the bitmap range
22160 * stuff. If the result is to be complemented, we have to show
22161 * everything in one [], as the inversion applies to the whole
22162 * thing. Use {braces} to separate them from anything in the
22163 * bitmap and anything above the bitmap. */
22166 if (! do_sep) { /* If didn't output anything in the bitmap
22168 sv_catpvs(sv, "^");
22170 sv_catpvs(sv, "{");
22173 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
22176 sv_catsv(sv, unresolved);
22178 sv_catpvs(sv, "}");
22180 do_sep = ! inverted;
22182 else if ( do_sep == 2
22183 && ! nonbitmap_invlist
22184 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
22186 /* Here, the display shows the class as inverted, and
22187 * everything above the lower display should also match, but
22188 * there is no indication of that. Add this range so the code
22189 * below will add it to the display */
22190 _invlist_union_complement_2nd(nonbitmap_invlist,
22192 &nonbitmap_invlist);
22196 /* And, finally, add the above-the-bitmap stuff */
22197 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
22200 /* See if truncation size is overridden */
22201 const STRLEN dump_len = (PL_dump_re_max_len > 256)
22202 ? PL_dump_re_max_len
22205 /* This is output in a separate [] */
22207 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
22210 /* And, for easy of understanding, it is shown in the
22211 * uncomplemented form if possible. The one exception being if
22212 * there are unresolved items, where the inversion has to be
22213 * delayed until runtime */
22214 if (inverted && ! unresolved) {
22215 _invlist_invert(nonbitmap_invlist);
22216 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
22219 contents = invlist_contents(nonbitmap_invlist,
22220 FALSE /* output suitable for catsv */
22223 /* If the output is shorter than the permissible maximum, just do it. */
22224 if (SvCUR(contents) <= dump_len) {
22225 sv_catsv(sv, contents);
22228 const char * contents_string = SvPVX(contents);
22229 STRLEN i = dump_len;
22231 /* Otherwise, start at the permissible max and work back to the
22232 * first break possibility */
22233 while (i > 0 && contents_string[i] != ' ') {
22236 if (i == 0) { /* Fail-safe. Use the max if we couldn't
22237 find a legal break */
22241 sv_catpvn(sv, contents_string, i);
22242 sv_catpvs(sv, "...");
22245 SvREFCNT_dec_NN(contents);
22246 SvREFCNT_dec_NN(nonbitmap_invlist);
22249 /* And finally the matching, closing ']' */
22250 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22252 if (op == ANYOFHs) {
22253 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
22255 else if (REGNODE_TYPE(op) != ANYOF) {
22256 U8 lowest = (op != ANYOFHr)
22258 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
22259 U8 highest = (op == ANYOFHr)
22260 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
22261 : (op == ANYOFH || op == ANYOFR)
22265 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
22268 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
22269 if (lowest != highest) {
22270 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
22272 Perl_sv_catpvf(aTHX_ sv, ")");
22276 SvREFCNT_dec(unresolved);
22278 else if (k == ANYOFM) {
22279 SV * cp_list = get_ANYOFM_contents(o);
22281 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22282 if (op == NANYOFM) {
22283 _invlist_invert(cp_list);
22286 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
22287 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22289 SvREFCNT_dec(cp_list);
22291 else if (k == ANYOFHbbm) {
22292 SV * cp_list = get_ANYOFHbbm_contents(o);
22293 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22295 sv_catsv(sv, invlist_contents(cp_list,
22296 FALSE /* output suitable for catsv */
22298 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22300 SvREFCNT_dec(cp_list);
22302 else if (k == POSIXD || k == NPOSIXD) {
22303 U8 index = FLAGS(o) * 2;
22304 if (index < C_ARRAY_LENGTH(anyofs)) {
22305 if (*anyofs[index] != '[') {
22306 sv_catpvs(sv, "[");
22308 sv_catpv(sv, anyofs[index]);
22309 if (*anyofs[index] != '[') {
22310 sv_catpvs(sv, "]");
22314 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
22317 else if (k == BOUND || k == NBOUND) {
22318 /* Must be synced with order of 'bound_type' in regcomp.h */
22319 const char * const bounds[] = {
22320 "", /* Traditional */
22326 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
22327 sv_catpv(sv, bounds[FLAGS(o)]);
22329 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
22330 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
22332 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
22334 Perl_sv_catpvf(aTHX_ sv, "]");
22336 else if (op == SBOL)
22337 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
22339 /* add on the verb argument if there is one */
22340 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) {
22342 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
22343 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
22345 sv_catpvs(sv, ":NULL");
22348 PERL_UNUSED_CONTEXT;
22349 PERL_UNUSED_ARG(sv);
22350 PERL_UNUSED_ARG(o);
22351 PERL_UNUSED_ARG(prog);
22352 PERL_UNUSED_ARG(reginfo);
22353 PERL_UNUSED_ARG(pRExC_state);
22354 #endif /* DEBUGGING */
22360 Perl_re_intuit_string(pTHX_ REGEXP * const r)
22361 { /* Assume that RE_INTUIT is set */
22362 /* Returns an SV containing a string that must appear in the target for it
22363 * to match, or NULL if nothing is known that must match.
22365 * CAUTION: the SV can be freed during execution of the regex engine */
22367 struct regexp *const prog = ReANY(r);
22368 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22370 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
22371 PERL_UNUSED_CONTEXT;
22375 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
22376 const char * const s = SvPV_nolen_const(RX_UTF8(r)
22377 ? prog->check_utf8 : prog->check_substr);
22379 if (!PL_colorset) reginitcolors();
22380 Perl_re_printf( aTHX_
22381 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
22383 RX_UTF8(r) ? "utf8 " : "",
22384 PL_colors[5], PL_colors[0],
22387 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
22391 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
22392 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
22398 handles refcounting and freeing the perl core regexp structure. When
22399 it is necessary to actually free the structure the first thing it
22400 does is call the 'free' method of the regexp_engine associated to
22401 the regexp, allowing the handling of the void *pprivate; member
22402 first. (This routine is not overridable by extensions, which is why
22403 the extensions free is called first.)
22405 See regdupe and regdupe_internal if you change anything here.
22407 #ifndef PERL_IN_XSUB_RE
22409 Perl_pregfree(pTHX_ REGEXP *r)
22415 Perl_pregfree2(pTHX_ REGEXP *rx)
22417 struct regexp *const r = ReANY(rx);
22418 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22420 PERL_ARGS_ASSERT_PREGFREE2;
22425 if (r->mother_re) {
22426 ReREFCNT_dec(r->mother_re);
22428 CALLREGFREE_PVT(rx); /* free the private data */
22429 SvREFCNT_dec(RXp_PAREN_NAMES(r));
22433 for (i = 0; i < 2; i++) {
22434 SvREFCNT_dec(r->substrs->data[i].substr);
22435 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
22437 Safefree(r->substrs);
22439 RX_MATCH_COPY_FREE(rx);
22440 #ifdef PERL_ANY_COW
22441 SvREFCNT_dec(r->saved_copy);
22444 SvREFCNT_dec(r->qr_anoncv);
22445 if (r->recurse_locinput)
22446 Safefree(r->recurse_locinput);
22452 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
22453 except that dsv will be created if NULL.
22455 This function is used in two main ways. First to implement
22456 $r = qr/....; $s = $$r;
22458 Secondly, it is used as a hacky workaround to the structural issue of
22460 being stored in the regexp structure which is in turn stored in
22461 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
22462 could be PL_curpm in multiple contexts, and could require multiple
22463 result sets being associated with the pattern simultaneously, such
22464 as when doing a recursive match with (??{$qr})
22466 The solution is to make a lightweight copy of the regexp structure
22467 when a qr// is returned from the code executed by (??{$qr}) this
22468 lightweight copy doesn't actually own any of its data except for
22469 the starp/end and the actual regexp structure itself.
22475 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
22477 struct regexp *drx;
22478 struct regexp *const srx = ReANY(ssv);
22479 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
22481 PERL_ARGS_ASSERT_REG_TEMP_COPY;
22484 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
22486 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
22488 /* our only valid caller, sv_setsv_flags(), should have done
22489 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
22490 assert(!SvOOK(dsv));
22491 assert(!SvIsCOW(dsv));
22492 assert(!SvROK(dsv));
22494 if (SvPVX_const(dsv)) {
22496 Safefree(SvPVX(dsv));
22501 SvOK_off((SV *)dsv);
22504 /* For PVLVs, the head (sv_any) points to an XPVLV, while
22505 * the LV's xpvlenu_rx will point to a regexp body, which
22506 * we allocate here */
22507 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
22508 assert(!SvPVX(dsv));
22509 /* We "steal" the body from the newly allocated SV temp, changing
22510 * the pointer in its HEAD to NULL. We then change its type to
22511 * SVt_NULL so that when we immediately release its only reference,
22512 * no memory deallocation happens.
22514 * The body will eventually be freed (from the PVLV) either in
22515 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
22516 * the regexp body needs to be removed)
22517 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
22518 * the PVLV itself is deallocated). */
22519 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
22520 temp->sv_any = NULL;
22521 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
22522 SvREFCNT_dec_NN(temp);
22523 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
22524 ing below will not set it. */
22525 SvCUR_set(dsv, SvCUR(ssv));
22528 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
22529 sv_force_normal(sv) is called. */
22533 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
22534 SvPV_set(dsv, RX_WRAPPED(ssv));
22535 /* We share the same string buffer as the original regexp, on which we
22536 hold a reference count, incremented when mother_re is set below.
22537 The string pointer is copied here, being part of the regexp struct.
22539 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
22540 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
22544 const I32 npar = srx->nparens+1;
22545 Newx(drx->offs, npar, regexp_paren_pair);
22546 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
22548 if (srx->substrs) {
22550 Newx(drx->substrs, 1, struct reg_substr_data);
22551 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
22553 for (i = 0; i < 2; i++) {
22554 SvREFCNT_inc_void(drx->substrs->data[i].substr);
22555 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
22558 /* check_substr and check_utf8, if non-NULL, point to either their
22559 anchored or float namesakes, and don't hold a second reference. */
22561 RX_MATCH_COPIED_off(dsv);
22562 #ifdef PERL_ANY_COW
22563 drx->saved_copy = NULL;
22565 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
22566 SvREFCNT_inc_void(drx->qr_anoncv);
22567 if (srx->recurse_locinput)
22568 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
22575 /* regfree_internal()
22577 Free the private data in a regexp. This is overloadable by
22578 extensions. Perl takes care of the regexp structure in pregfree(),
22579 this covers the *pprivate pointer which technically perl doesn't
22580 know about, however of course we have to handle the
22581 regexp_internal structure when no extension is in use.
22583 Note this is called before freeing anything in the regexp
22588 Perl_regfree_internal(pTHX_ REGEXP * const rx)
22590 struct regexp *const r = ReANY(rx);
22591 RXi_GET_DECL(r, ri);
22592 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22594 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
22604 SV *dsv= sv_newmortal();
22605 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
22606 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
22607 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
22608 PL_colors[4], PL_colors[5], s);
22612 if (ri->code_blocks)
22613 S_free_codeblocks(aTHX_ ri->code_blocks);
22616 int n = ri->data->count;
22619 /* If you add a ->what type here, update the comment in regcomp.h */
22620 switch (ri->data->what[n]) {
22626 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
22629 Safefree(ri->data->data[n]);
22635 { /* Aho Corasick add-on structure for a trie node.
22636 Used in stclass optimization only */
22638 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
22640 refcount = --aho->refcount;
22643 PerlMemShared_free(aho->states);
22644 PerlMemShared_free(aho->fail);
22645 /* do this last!!!! */
22646 PerlMemShared_free(ri->data->data[n]);
22647 /* we should only ever get called once, so
22648 * assert as much, and also guard the free
22649 * which /might/ happen twice. At the least
22650 * it will make code anlyzers happy and it
22651 * doesn't cost much. - Yves */
22652 assert(ri->regstclass);
22653 if (ri->regstclass) {
22654 PerlMemShared_free(ri->regstclass);
22655 ri->regstclass = 0;
22662 /* trie structure. */
22664 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
22666 refcount = --trie->refcount;
22669 PerlMemShared_free(trie->charmap);
22670 PerlMemShared_free(trie->states);
22671 PerlMemShared_free(trie->trans);
22673 PerlMemShared_free(trie->bitmap);
22675 PerlMemShared_free(trie->jump);
22676 PerlMemShared_free(trie->wordinfo);
22677 /* do this last!!!! */
22678 PerlMemShared_free(ri->data->data[n]);
22683 /* NO-OP a '%' data contains a null pointer, so that add_data
22684 * always returns non-zero, this should only ever happen in the
22689 Perl_croak(aTHX_ "panic: regfree data code '%c'",
22690 ri->data->what[n]);
22693 Safefree(ri->data->what);
22694 Safefree(ri->data);
22700 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
22701 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
22702 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
22705 =for apidoc re_dup_guts
22706 Duplicate a regexp.
22708 This routine is expected to clone a given regexp structure. It is only
22709 compiled under USE_ITHREADS.
22711 After all of the core data stored in struct regexp is duplicated
22712 the C<regexp_engine.dupe> method is used to copy any private data
22713 stored in the *pprivate pointer. This allows extensions to handle
22714 any duplication they need to do.
22718 See pregfree() and regfree_internal() if you change anything here.
22720 #if defined(USE_ITHREADS)
22721 #ifndef PERL_IN_XSUB_RE
22723 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
22726 const struct regexp *r = ReANY(sstr);
22727 struct regexp *ret = ReANY(dstr);
22729 PERL_ARGS_ASSERT_RE_DUP_GUTS;
22731 npar = r->nparens+1;
22732 Newx(ret->offs, npar, regexp_paren_pair);
22733 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
22735 if (ret->substrs) {
22736 /* Do it this way to avoid reading from *r after the StructCopy().
22737 That way, if any of the sv_dup_inc()s dislodge *r from the L1
22738 cache, it doesn't matter. */
22740 const bool anchored = r->check_substr
22741 ? r->check_substr == r->substrs->data[0].substr
22742 : r->check_utf8 == r->substrs->data[0].utf8_substr;
22743 Newx(ret->substrs, 1, struct reg_substr_data);
22744 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
22746 for (i = 0; i < 2; i++) {
22747 ret->substrs->data[i].substr =
22748 sv_dup_inc(ret->substrs->data[i].substr, param);
22749 ret->substrs->data[i].utf8_substr =
22750 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
22753 /* check_substr and check_utf8, if non-NULL, point to either their
22754 anchored or float namesakes, and don't hold a second reference. */
22756 if (ret->check_substr) {
22758 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
22760 ret->check_substr = ret->substrs->data[0].substr;
22761 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
22763 assert(r->check_substr == r->substrs->data[1].substr);
22764 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
22766 ret->check_substr = ret->substrs->data[1].substr;
22767 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
22769 } else if (ret->check_utf8) {
22771 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
22773 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
22778 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
22779 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
22780 if (r->recurse_locinput)
22781 Newx(ret->recurse_locinput, r->nparens + 1, char *);
22784 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
22786 if (RX_MATCH_COPIED(dstr))
22787 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
22789 ret->subbeg = NULL;
22790 #ifdef PERL_ANY_COW
22791 ret->saved_copy = NULL;
22794 /* Whether mother_re be set or no, we need to copy the string. We
22795 cannot refrain from copying it when the storage points directly to
22796 our mother regexp, because that's
22797 1: a buffer in a different thread
22798 2: something we no longer hold a reference on
22799 so we need to copy it locally. */
22800 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
22801 /* set malloced length to a non-zero value so it will be freed
22802 * (otherwise in combination with SVf_FAKE it looks like an alien
22803 * buffer). It doesn't have to be the actual malloced size, since it
22804 * should never be grown */
22805 SvLEN_set(dstr, SvCUR(sstr)+1);
22806 ret->mother_re = NULL;
22808 #endif /* PERL_IN_XSUB_RE */
22813 This is the internal complement to regdupe() which is used to copy
22814 the structure pointed to by the *pprivate pointer in the regexp.
22815 This is the core version of the extension overridable cloning hook.
22816 The regexp structure being duplicated will be copied by perl prior
22817 to this and will be provided as the regexp *r argument, however
22818 with the /old/ structures pprivate pointer value. Thus this routine
22819 may override any copying normally done by perl.
22821 It returns a pointer to the new regexp_internal structure.
22825 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
22827 struct regexp *const r = ReANY(rx);
22828 regexp_internal *reti;
22830 RXi_GET_DECL(r, ri);
22832 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
22836 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22837 char, regexp_internal);
22838 Copy(ri->program, reti->program, len+1, regnode);
22841 if (ri->code_blocks) {
22843 Newx(reti->code_blocks, 1, struct reg_code_blocks);
22844 Newx(reti->code_blocks->cb, ri->code_blocks->count,
22845 struct reg_code_block);
22846 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22847 ri->code_blocks->count, struct reg_code_block);
22848 for (n = 0; n < ri->code_blocks->count; n++)
22849 reti->code_blocks->cb[n].src_regex = (REGEXP*)
22850 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22851 reti->code_blocks->count = ri->code_blocks->count;
22852 reti->code_blocks->refcnt = 1;
22855 reti->code_blocks = NULL;
22857 reti->regstclass = NULL;
22860 struct reg_data *d;
22861 const int count = ri->data->count;
22864 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22865 char, struct reg_data);
22866 Newx(d->what, count, U8);
22869 for (i = 0; i < count; i++) {
22870 d->what[i] = ri->data->what[i];
22871 switch (d->what[i]) {
22872 /* see also regcomp.h and regfree_internal() */
22873 case 'a': /* actually an AV, but the dup function is identical.
22874 values seem to be "plain sv's" generally. */
22875 case 'r': /* a compiled regex (but still just another SV) */
22876 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22877 this use case should go away, the code could have used
22878 'a' instead - see S_set_ANYOF_arg() for array contents. */
22879 case 'S': /* actually an SV, but the dup function is identical. */
22880 case 'u': /* actually an HV, but the dup function is identical.
22881 values are "plain sv's" */
22882 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22885 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22886 * patterns which could start with several different things. Pre-TRIE
22887 * this was more important than it is now, however this still helps
22888 * in some places, for instance /x?a+/ might produce a SSC equivalent
22889 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22892 /* This is cheating. */
22893 Newx(d->data[i], 1, regnode_ssc);
22894 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22895 reti->regstclass = (regnode*)d->data[i];
22898 /* AHO-CORASICK fail table */
22899 /* Trie stclasses are readonly and can thus be shared
22900 * without duplication. We free the stclass in pregfree
22901 * when the corresponding reg_ac_data struct is freed.
22903 reti->regstclass= ri->regstclass;
22906 /* TRIE transition table */
22908 ((reg_trie_data*)ri->data->data[i])->refcount++;
22911 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22912 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22913 is not from another regexp */
22914 d->data[i] = ri->data->data[i];
22917 /* this is a placeholder type, it exists purely so that
22918 * add_data always returns a non-zero value, this type of
22919 * entry should ONLY be present in the 0 slot of the array */
22921 d->data[i]= ri->data->data[i];
22924 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22925 ri->data->what[i]);
22934 reti->name_list_idx = ri->name_list_idx;
22936 SetProgLen(reti, len);
22938 return (void*)reti;
22941 #endif /* USE_ITHREADS */
22944 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22947 STRLEN len = strlen(pat);
22950 const char *message;
22952 PERL_ARGS_ASSERT_RE_CROAK;
22956 Copy(pat, buf, len , char);
22958 buf[len + 1] = '\0';
22959 va_start(args, pat);
22960 msv = vmess(buf, &args);
22962 message = SvPV_const(msv, len);
22965 Copy(message, buf, len , char);
22966 /* len-1 to avoid \n */
22967 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22970 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22972 #ifndef PERL_IN_XSUB_RE
22974 Perl_save_re_context(pTHX)
22979 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22982 const REGEXP * const rx = PM_GETRE(PL_curpm);
22984 nparens = RX_NPARENS(rx);
22987 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22988 * that PL_curpm will be null, but that utf8.pm and the modules it
22989 * loads will only use $1..$3.
22990 * The t/porting/re_context.t test file checks this assumption.
22995 for (i = 1; i <= nparens; i++) {
22996 char digits[TYPE_CHARS(long)];
22997 const STRLEN len = my_snprintf(digits, sizeof(digits),
22999 GV *const *const gvp
23000 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
23003 GV * const gv = *gvp;
23004 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
23014 S_put_code_point(pTHX_ SV *sv, UV c)
23016 PERL_ARGS_ASSERT_PUT_CODE_POINT;
23019 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
23021 else if (isPRINT(c)) {
23022 const char string = (char) c;
23024 /* We use {phrase} as metanotation in the class, so also escape literal
23026 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
23027 sv_catpvs(sv, "\\");
23028 sv_catpvn(sv, &string, 1);
23030 else if (isMNEMONIC_CNTRL(c)) {
23031 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
23034 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
23039 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
23041 /* Appends to 'sv' a displayable version of the range of code points from
23042 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
23043 * that have them, when they occur at the beginning or end of the range.
23044 * It uses hex to output the remaining code points, unless 'allow_literals'
23045 * is true, in which case the printable ASCII ones are output as-is (though
23046 * some of these will be escaped by put_code_point()).
23048 * NOTE: This is designed only for printing ranges of code points that fit
23049 * inside an ANYOF bitmap. Higher code points are simply suppressed
23052 const unsigned int min_range_count = 3;
23054 assert(start <= end);
23056 PERL_ARGS_ASSERT_PUT_RANGE;
23058 while (start <= end) {
23060 const char * format;
23062 if ( end - start < min_range_count
23063 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
23065 /* Output a range of 1 or 2 chars individually, or longer ranges
23066 * when printable */
23067 for (; start <= end; start++) {
23068 put_code_point(sv, start);
23073 /* If permitted by the input options, and there is a possibility that
23074 * this range contains a printable literal, look to see if there is
23076 if (allow_literals && start <= MAX_PRINT_A) {
23078 /* If the character at the beginning of the range isn't an ASCII
23079 * printable, effectively split the range into two parts:
23080 * 1) the portion before the first such printable,
23082 * and output them separately. */
23083 if (! isPRINT_A(start)) {
23084 UV temp_end = start + 1;
23086 /* There is no point looking beyond the final possible
23087 * printable, in MAX_PRINT_A */
23088 UV max = MIN(end, MAX_PRINT_A);
23090 while (temp_end <= max && ! isPRINT_A(temp_end)) {
23094 /* Here, temp_end points to one beyond the first printable if
23095 * found, or to one beyond 'max' if not. If none found, make
23096 * sure that we use the entire range */
23097 if (temp_end > MAX_PRINT_A) {
23098 temp_end = end + 1;
23101 /* Output the first part of the split range: the part that
23102 * doesn't have printables, with the parameter set to not look
23103 * for literals (otherwise we would infinitely recurse) */
23104 put_range(sv, start, temp_end - 1, FALSE);
23106 /* The 2nd part of the range (if any) starts here. */
23109 /* We do a continue, instead of dropping down, because even if
23110 * the 2nd part is non-empty, it could be so short that we want
23111 * to output it as individual characters, as tested for at the
23112 * top of this loop. */
23116 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
23117 * output a sub-range of just the digits or letters, then process
23118 * the remaining portion as usual. */
23119 if (isALPHANUMERIC_A(start)) {
23120 UV mask = (isDIGIT_A(start))
23125 UV temp_end = start + 1;
23127 /* Find the end of the sub-range that includes just the
23128 * characters in the same class as the first character in it */
23129 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
23134 /* For short ranges, don't duplicate the code above to output
23135 * them; just call recursively */
23136 if (temp_end - start < min_range_count) {
23137 put_range(sv, start, temp_end, FALSE);
23139 else { /* Output as a range */
23140 put_code_point(sv, start);
23141 sv_catpvs(sv, "-");
23142 put_code_point(sv, temp_end);
23144 start = temp_end + 1;
23148 /* We output any other printables as individual characters */
23149 if (isPUNCT_A(start) || isSPACE_A(start)) {
23150 while (start <= end && (isPUNCT_A(start)
23151 || isSPACE_A(start)))
23153 put_code_point(sv, start);
23158 } /* End of looking for literals */
23160 /* Here is not to output as a literal. Some control characters have
23161 * mnemonic names. Split off any of those at the beginning and end of
23162 * the range to print mnemonically. It isn't possible for many of
23163 * these to be in a row, so this won't overwhelm with output */
23165 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
23167 while (isMNEMONIC_CNTRL(start) && start <= end) {
23168 put_code_point(sv, start);
23172 /* If this didn't take care of the whole range ... */
23173 if (start <= end) {
23175 /* Look backwards from the end to find the final non-mnemonic
23178 while (isMNEMONIC_CNTRL(temp_end)) {
23182 /* And separately output the interior range that doesn't start
23183 * or end with mnemonics */
23184 put_range(sv, start, temp_end, FALSE);
23186 /* Then output the mnemonic trailing controls */
23187 start = temp_end + 1;
23188 while (start <= end) {
23189 put_code_point(sv, start);
23196 /* As a final resort, output the range or subrange as hex. */
23198 if (start >= NUM_ANYOF_CODE_POINTS) {
23201 else { /* Have to split range at the bitmap boundary */
23202 this_end = (end < NUM_ANYOF_CODE_POINTS)
23204 : NUM_ANYOF_CODE_POINTS - 1;
23206 #if NUM_ANYOF_CODE_POINTS > 256
23207 format = (this_end < 256)
23208 ? "\\x%02" UVXf "-\\x%02" UVXf
23209 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
23211 format = "\\x%02" UVXf "-\\x%02" UVXf;
23213 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
23214 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
23215 GCC_DIAG_RESTORE_STMT;
23221 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
23223 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
23227 bool allow_literals = TRUE;
23229 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
23231 /* Generally, it is more readable if printable characters are output as
23232 * literals, but if a range (nearly) spans all of them, it's best to output
23233 * it as a single range. This code will use a single range if all but 2
23234 * ASCII printables are in it */
23235 invlist_iterinit(invlist);
23236 while (invlist_iternext(invlist, &start, &end)) {
23238 /* If the range starts beyond the final printable, it doesn't have any
23240 if (start > MAX_PRINT_A) {
23244 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
23245 * all but two, the range must start and end no later than 2 from
23247 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
23248 if (end > MAX_PRINT_A) {
23254 if (end - start >= MAX_PRINT_A - ' ' - 2) {
23255 allow_literals = FALSE;
23260 invlist_iterfinish(invlist);
23262 /* Here we have figured things out. Output each range */
23263 invlist_iterinit(invlist);
23264 while (invlist_iternext(invlist, &start, &end)) {
23265 if (start >= NUM_ANYOF_CODE_POINTS) {
23268 put_range(sv, start, end, allow_literals);
23270 invlist_iterfinish(invlist);
23276 S_put_charclass_bitmap_innards_common(pTHX_
23277 SV* invlist, /* The bitmap */
23278 SV* posixes, /* Under /l, things like [:word:], \S */
23279 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
23280 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
23281 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
23282 const bool invert /* Is the result to be inverted? */
23285 /* Create and return an SV containing a displayable version of the bitmap
23286 * and associated information determined by the input parameters. If the
23287 * output would have been only the inversion indicator '^', NULL is instead
23292 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
23295 output = newSVpvs("^");
23298 output = newSVpvs("");
23301 /* First, the code points in the bitmap that are unconditionally there */
23302 put_charclass_bitmap_innards_invlist(output, invlist);
23304 /* Traditionally, these have been placed after the main code points */
23306 sv_catsv(output, posixes);
23309 if (only_utf8 && _invlist_len(only_utf8)) {
23310 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
23311 put_charclass_bitmap_innards_invlist(output, only_utf8);
23314 if (not_utf8 && _invlist_len(not_utf8)) {
23315 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
23316 put_charclass_bitmap_innards_invlist(output, not_utf8);
23319 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
23320 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
23321 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
23323 /* This is the only list in this routine that can legally contain code
23324 * points outside the bitmap range. The call just above to
23325 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
23326 * output them here. There's about a half-dozen possible, and none in
23327 * contiguous ranges longer than 2 */
23328 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
23330 SV* above_bitmap = NULL;
23332 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
23334 invlist_iterinit(above_bitmap);
23335 while (invlist_iternext(above_bitmap, &start, &end)) {
23338 for (i = start; i <= end; i++) {
23339 put_code_point(output, i);
23342 invlist_iterfinish(above_bitmap);
23343 SvREFCNT_dec_NN(above_bitmap);
23347 if (invert && SvCUR(output) == 1) {
23355 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
23357 SV *nonbitmap_invlist,
23358 SV *only_utf8_locale_invlist,
23359 const regnode * const node,
23361 const bool force_as_is_display)
23363 /* Appends to 'sv' a displayable version of the innards of the bracketed
23364 * character class defined by the other arguments:
23365 * 'bitmap' points to the bitmap, or NULL if to ignore that.
23366 * 'nonbitmap_invlist' is an inversion list of the code points that are in
23367 * the bitmap range, but for some reason aren't in the bitmap; NULL if
23368 * none. The reasons for this could be that they require some
23369 * condition such as the target string being or not being in UTF-8
23370 * (under /d), or because they came from a user-defined property that
23371 * was not resolved at the time of the regex compilation (under /u)
23372 * 'only_utf8_locale_invlist' is an inversion list of the code points that
23373 * are valid only if the runtime locale is a UTF-8 one; NULL if none
23374 * 'node' is the regex pattern ANYOF node. It is needed only when the
23375 * above two parameters are not null, and is passed so that this
23376 * routine can tease apart the various reasons for them.
23377 * 'flags' is the flags field of 'node'
23378 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
23379 * to invert things to see if that leads to a cleaner display. If
23380 * FALSE, this routine is free to use its judgment about doing this.
23382 * It returns 0 if nothing was actually output. (It may be that
23383 * the bitmap, etc is empty.)
23384 * 1 if the output wasn't inverted (didn't begin with a '^')
23385 * 2 if the output was inverted (did begin with a '^')
23387 * When called for outputting the bitmap of a non-ANYOF node, just pass the
23388 * bitmap, with the succeeding parameters set to NULL, and the final one to
23392 /* In general, it tries to display the 'cleanest' representation of the
23393 * innards, choosing whether to display them inverted or not, regardless of
23394 * whether the class itself is to be inverted. However, there are some
23395 * cases where it can't try inverting, as what actually matches isn't known
23396 * until runtime, and hence the inversion isn't either. */
23398 bool inverting_allowed = ! force_as_is_display;
23401 STRLEN orig_sv_cur = SvCUR(sv);
23403 SV* invlist; /* Inversion list we accumulate of code points that
23404 are unconditionally matched */
23405 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
23407 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
23409 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
23410 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
23413 SV* as_is_display; /* The output string when we take the inputs
23415 SV* inverted_display; /* The output string when we invert the inputs */
23417 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
23419 /* We are biased in favor of displaying things without them being inverted,
23420 * as that is generally easier to understand */
23421 const int bias = 5;
23423 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
23425 /* Start off with whatever code points are passed in. (We clone, so we
23426 * don't change the caller's list) */
23427 if (nonbitmap_invlist) {
23428 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
23429 invlist = invlist_clone(nonbitmap_invlist, NULL);
23431 else { /* Worst case size is every other code point is matched */
23432 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
23436 if (OP(node) == ANYOFD) {
23438 /* This flag indicates that the code points below 0x100 in the
23439 * nonbitmap list are precisely the ones that match only when the
23440 * target is UTF-8 (they should all be non-ASCII). */
23441 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
23442 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
23443 _invlist_subtract(invlist, only_utf8, &invlist);
23446 /* And this flag for matching all non-ASCII 0xFF and below */
23447 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
23448 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
23451 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
23453 /* If either of these flags are set, what matches isn't
23454 * determinable except during execution, so don't know enough here
23456 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
23457 inverting_allowed = FALSE;
23460 /* What the posix classes match also varies at runtime, so these
23461 * will be output symbolically. */
23462 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
23465 posixes = newSVpvs("");
23466 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
23467 if (ANYOF_POSIXL_TEST(node, i)) {
23468 sv_catpv(posixes, anyofs[i]);
23475 /* Accumulate the bit map into the unconditional match list */
23477 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
23478 if (BITMAP_TEST(bitmap, i)) {
23481 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
23484 invlist = _add_range_to_invlist(invlist, start, i-1);
23489 /* Make sure that the conditional match lists don't have anything in them
23490 * that match unconditionally; otherwise the output is quite confusing.
23491 * This could happen if the code that populates these misses some
23494 _invlist_subtract(only_utf8, invlist, &only_utf8);
23497 _invlist_subtract(not_utf8, invlist, ¬_utf8);
23500 if (only_utf8_locale_invlist) {
23502 /* Since this list is passed in, we have to make a copy before
23504 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
23506 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
23508 /* And, it can get really weird for us to try outputting an inverted
23509 * form of this list when it has things above the bitmap, so don't even
23511 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
23512 inverting_allowed = FALSE;
23516 /* Calculate what the output would be if we take the input as-is */
23517 as_is_display = put_charclass_bitmap_innards_common(invlist,
23524 /* If have to take the output as-is, just do that */
23525 if (! inverting_allowed) {
23526 if (as_is_display) {
23527 sv_catsv(sv, as_is_display);
23528 SvREFCNT_dec_NN(as_is_display);
23531 else { /* But otherwise, create the output again on the inverted input, and
23532 use whichever version is shorter */
23534 int inverted_bias, as_is_bias;
23536 /* We will apply our bias to whichever of the results doesn't have
23540 trial_invert = FALSE;
23545 trial_invert = TRUE;
23547 inverted_bias = bias;
23550 /* Now invert each of the lists that contribute to the output,
23551 * excluding from the result things outside the possible range */
23553 /* For the unconditional inversion list, we have to add in all the
23554 * conditional code points, so that when inverted, they will be gone
23556 _invlist_union(only_utf8, invlist, &invlist);
23557 _invlist_union(not_utf8, invlist, &invlist);
23558 _invlist_union(only_utf8_locale, invlist, &invlist);
23559 _invlist_invert(invlist);
23560 _invlist_intersection(invlist, PL_InBitmap, &invlist);
23563 _invlist_invert(only_utf8);
23564 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
23566 else if (not_utf8) {
23568 /* If a code point matches iff the target string is not in UTF-8,
23569 * then complementing the result has it not match iff not in UTF-8,
23570 * which is the same thing as matching iff it is UTF-8. */
23571 only_utf8 = not_utf8;
23575 if (only_utf8_locale) {
23576 _invlist_invert(only_utf8_locale);
23577 _invlist_intersection(only_utf8_locale,
23579 &only_utf8_locale);
23582 inverted_display = put_charclass_bitmap_innards_common(
23587 only_utf8_locale, trial_invert);
23589 /* Use the shortest representation, taking into account our bias
23590 * against showing it inverted */
23591 if ( inverted_display
23592 && ( ! as_is_display
23593 || ( SvCUR(inverted_display) + inverted_bias
23594 < SvCUR(as_is_display) + as_is_bias)))
23596 sv_catsv(sv, inverted_display);
23599 else if (as_is_display) {
23600 sv_catsv(sv, as_is_display);
23603 SvREFCNT_dec(as_is_display);
23604 SvREFCNT_dec(inverted_display);
23607 SvREFCNT_dec_NN(invlist);
23608 SvREFCNT_dec(only_utf8);
23609 SvREFCNT_dec(not_utf8);
23610 SvREFCNT_dec(posixes);
23611 SvREFCNT_dec(only_utf8_locale);
23613 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
23614 if (did_output_something) {
23615 /* Distinguish between non and inverted cases */
23616 did_output_something += invert;
23619 return did_output_something;
23622 #define CLEAR_OPTSTART \
23623 if (optstart) STMT_START { \
23624 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
23625 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
23629 #define DUMPUNTIL(b,e) \
23631 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
23633 STATIC const regnode *
23634 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
23635 const regnode *last, const regnode *plast,
23636 SV* sv, I32 indent, U32 depth)
23638 const regnode *next;
23639 const regnode *optstart= NULL;
23641 RXi_GET_DECL(r, ri);
23642 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23644 PERL_ARGS_ASSERT_DUMPUNTIL;
23646 #ifdef DEBUG_DUMPUNTIL
23647 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
23648 last ? last-start : 0, plast ? plast-start : 0);
23651 if (plast && plast < last)
23654 while (node && (!last || node < last)) {
23655 const U8 op = OP(node);
23657 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
23659 next = regnext((regnode *)node);
23660 const regnode *after = regnode_after((regnode *)node,0);
23663 if (op == OPTIMIZED) {
23664 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
23671 regprop(r, sv, node, NULL, NULL);
23672 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
23673 (int)(2*indent + 1), "", SvPVX_const(sv));
23675 if (op != OPTIMIZED) {
23676 if (next == NULL) /* Next ptr. */
23677 Perl_re_printf( aTHX_ " (0)");
23678 else if (REGNODE_TYPE(op) == BRANCH
23679 && REGNODE_TYPE(OP(next)) != BRANCH )
23680 Perl_re_printf( aTHX_ " (FAIL)");
23682 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
23683 Perl_re_printf( aTHX_ "\n");
23687 if (REGNODE_TYPE(op) == BRANCHJ) {
23689 const regnode *nnode = (OP(next) == LONGJMP
23690 ? regnext((regnode *)next)
23692 if (last && nnode > last)
23694 DUMPUNTIL(after, nnode);
23696 else if (REGNODE_TYPE(op) == BRANCH) {
23698 DUMPUNTIL(after, next);
23700 else if ( REGNODE_TYPE(op) == TRIE ) {
23701 const regnode *this_trie = node;
23702 const U32 n = ARG(node);
23703 const reg_ac_data * const ac = op>=AHOCORASICK ?
23704 (reg_ac_data *)ri->data->data[n] :
23706 const reg_trie_data * const trie =
23707 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
23709 AV *const trie_words
23710 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
23712 const regnode *nextbranch= NULL;
23715 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
23716 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
23718 Perl_re_indentf( aTHX_ "%s ",
23721 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
23722 SvCUR(*elem_ptr), PL_dump_re_max_len,
23723 PL_colors[0], PL_colors[1],
23725 ? PERL_PV_ESCAPE_UNI
23727 | PERL_PV_PRETTY_ELLIPSES
23728 | PERL_PV_PRETTY_LTGT
23733 U16 dist= trie->jump[word_idx+1];
23734 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
23735 (UV)((dist ? this_trie + dist : next) - start));
23738 nextbranch= this_trie + trie->jump[0];
23739 DUMPUNTIL(this_trie + dist, nextbranch);
23741 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
23742 nextbranch= regnext((regnode *)nextbranch);
23744 Perl_re_printf( aTHX_ "\n");
23747 if (last && next > last)
23752 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
23753 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
23755 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
23757 DUMPUNTIL(after, next);
23759 else if ( op == PLUS || op == STAR) {
23760 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
23762 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
23763 /* Literal string, where present. */
23764 node = (const regnode *)REGNODE_AFTER_varies(node);
23767 node = REGNODE_AFTER_opcode(node,op);
23769 if (op == CURLYX || op == OPEN || op == SROPEN)
23771 if (REGNODE_TYPE(op) == END)
23775 #ifdef DEBUG_DUMPUNTIL
23776 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
23781 #endif /* DEBUGGING */
23783 #ifndef PERL_IN_XSUB_RE
23785 # include "uni_keywords.h"
23788 Perl_init_uniprops(pTHX)
23792 char * dump_len_string;
23794 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
23795 if ( ! dump_len_string
23796 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
23798 PL_dump_re_max_len = 60; /* A reasonable default */
23802 PL_user_def_props = newHV();
23804 # ifdef USE_ITHREADS
23806 HvSHAREKEYS_off(PL_user_def_props);
23807 PL_user_def_props_aTHX = aTHX;
23811 /* Set up the inversion list interpreter-level variables */
23813 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23814 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23815 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23816 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23817 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23818 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23819 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23820 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23821 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23822 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23823 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23824 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23825 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23826 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23827 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23828 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23830 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23831 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23832 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23833 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23834 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
23835 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23836 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23837 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23838 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23839 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23840 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23841 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23842 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23843 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
23844 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23845 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23847 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23848 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23849 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23850 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23851 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23853 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23854 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23855 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23856 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23858 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23860 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23861 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23863 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23864 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23866 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23867 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23868 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23869 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23870 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23871 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23872 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23873 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23874 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23875 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23876 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23877 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23878 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23879 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23882 /* The below are used only by deprecated functions. They could be removed */
23883 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23884 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23885 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23889 /* These four functions are compiled only in regcomp.c, where they have access
23890 * to the data they return. They are a way for re_comp.c to get access to that
23891 * data without having to compile the whole data structures. */
23894 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23896 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23898 return match_uniprop((U8 *) key, key_len);
23902 Perl_get_prop_definition(pTHX_ const int table_index)
23904 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23906 /* Create and return the inversion list */
23907 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23910 const char * const *
23911 Perl_get_prop_values(const int table_index)
23913 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23915 return UNI_prop_value_ptrs[table_index];
23919 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23921 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23923 return deprecated_property_msgs[warning_offset];
23928 This code was mainly added for backcompat to give a warning for non-portable
23929 code points in user-defined properties. But experiments showed that the
23930 warning in earlier perls were only omitted on overflow, which should be an
23931 error, so there really isnt a backcompat issue, and actually adding the
23932 warning when none was present before might cause breakage, for little gain. So
23933 khw left this code in, but not enabled. Tests were never added.
23936 Ei |const char *|get_extended_utf8_msg|const UV cp
23938 PERL_STATIC_INLINE const char *
23939 S_get_extended_utf8_msg(pTHX_ const UV cp)
23941 U8 dummy[UTF8_MAXBYTES + 1];
23945 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23948 msg = hv_fetchs(msgs, "text", 0);
23951 (void) sv_2mortal((SV *) msgs);
23953 return SvPVX(*msg);
23957 #endif /* end of ! PERL_IN_XSUB_RE */
23960 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23961 const bool ignore_case)
23963 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23964 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23965 * because nothing outside of ASCII will match. Use /m because the input
23966 * string may be a bunch of lines strung together.
23968 * Also sets up the debugging info */
23970 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23972 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
23973 REGEXP * subpattern_re;
23974 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23976 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23981 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23983 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23984 rx_flags = flags & RXf_PMf_COMPILETIME;
23986 #ifndef PERL_IN_XSUB_RE
23987 /* Use the core engine if this file is regcomp.c. That means no
23988 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23989 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23990 &PL_core_reg_engine,
23994 if (isDEBUG_WILDCARD) {
23995 /* Use the special debugging engine if this file is re_comp.c and wants
23996 * to output the wildcard matching. This uses whatever
23997 * 'use re "Debug ..." is in effect */
23998 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
24004 /* Use the special wildcard engine if this file is re_comp.c and
24005 * doesn't want to output the wildcard matching. This uses whatever
24006 * 'use re "Debug ..." is in effect for compilation, but this engine
24007 * structure has been set up so that it uses the core engine for
24008 * execution, so no execution debugging as a result of re.pm will be
24010 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
24014 /* XXX The above has the effect that any user-supplied regex engine
24015 * won't be called for matching wildcards. That might be good, or bad.
24016 * It could be changed in several ways. The reason it is done the
24017 * current way is to avoid having to save and restore
24018 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
24019 * could be used. Another suggestion is to keep the authoritative
24020 * value of the debug flags in a thread-local variable and add set/get
24021 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
24022 * Still another is to pass a flag, say in the engine's intflags that
24023 * would be checked each time before doing the debug output */
24027 assert(subpattern_re); /* Should have died if didn't compile successfully */
24028 return subpattern_re;
24032 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
24033 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
24036 DECLARE_AND_GET_RE_DEBUG_FLAGS;
24038 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
24042 /* The compilation has set things up so that if the program doesn't want to
24043 * see the wildcard matching procedure, it will get the core execution
24044 * engine, which is subject only to -Dr. So we have to turn that off
24045 * around this procedure */
24046 if (! isDEBUG_WILDCARD) {
24047 /* Note! Casts away 'volatile' */
24049 PL_debug &= ~ DEBUG_r_FLAG;
24052 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
24060 S_handle_user_defined_property(pTHX_
24062 /* Parses the contents of a user-defined property definition; returning the
24063 * expanded definition if possible. If so, the return is an inversion
24066 * If there are subroutines that are part of the expansion and which aren't
24067 * known at the time of the call to this function, this returns what
24068 * parse_uniprop_string() returned for the first one encountered.
24070 * If an error was found, NULL is returned, and 'msg' gets a suitable
24071 * message appended to it. (Appending allows the back trace of how we got
24072 * to the faulty definition to be displayed through nested calls of
24073 * user-defined subs.)
24075 * The caller IS responsible for freeing any returned SV.
24077 * The syntax of the contents is pretty much described in perlunicode.pod,
24078 * but we also allow comments on each line */
24080 const char * name, /* Name of property */
24081 const STRLEN name_len, /* The name's length in bytes */
24082 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24083 const bool to_fold, /* ? Is this under /i */
24084 const bool runtime, /* ? Are we in compile- or run-time */
24085 const bool deferrable, /* Is it ok for this property's full definition
24086 to be deferred until later? */
24087 SV* contents, /* The property's definition */
24088 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
24089 getting called unless this is thought to be
24090 a user-defined property */
24091 SV * msg, /* Any error or warning msg(s) are appended to
24093 const STRLEN level) /* Recursion level of this call */
24096 const char * string = SvPV_const(contents, len);
24097 const char * const e = string + len;
24098 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
24099 const STRLEN msgs_length_on_entry = SvCUR(msg);
24101 const char * s0 = string; /* Points to first byte in the current line
24102 being parsed in 'string' */
24103 const char overflow_msg[] = "Code point too large in \"";
24104 SV* running_definition = NULL;
24106 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
24108 *user_defined_ptr = TRUE;
24110 /* Look at each line */
24112 const char * s; /* Current byte */
24113 char op = '+'; /* Default operation is 'union' */
24114 IV min = 0; /* range begin code point */
24115 IV max = -1; /* and range end */
24116 SV* this_definition;
24118 /* Skip comment lines */
24120 s0 = strchr(s0, '\n');
24128 /* For backcompat, allow an empty first line */
24134 /* First character in the line may optionally be the operation */
24143 /* If the line is one or two hex digits separated by blank space, its
24144 * a range; otherwise it is either another user-defined property or an
24149 if (! isXDIGIT(*s)) {
24150 goto check_if_property;
24153 do { /* Each new hex digit will add 4 bits. */
24154 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
24155 s = strchr(s, '\n');
24159 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24160 sv_catpv(msg, overflow_msg);
24161 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24162 UTF8fARG(is_contents_utf8, s - s0, s0));
24163 sv_catpvs(msg, "\"");
24164 goto return_failure;
24167 /* Accumulate this digit into the value */
24168 min = (min << 4) + READ_XDIGIT(s);
24169 } while (isXDIGIT(*s));
24171 while (isBLANK(*s)) { s++; }
24173 /* We allow comments at the end of the line */
24175 s = strchr(s, '\n');
24181 else if (s < e && *s != '\n') {
24182 if (! isXDIGIT(*s)) {
24183 goto check_if_property;
24186 /* Look for the high point of the range */
24189 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
24190 s = strchr(s, '\n');
24194 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24195 sv_catpv(msg, overflow_msg);
24196 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24197 UTF8fARG(is_contents_utf8, s - s0, s0));
24198 sv_catpvs(msg, "\"");
24199 goto return_failure;
24202 max = (max << 4) + READ_XDIGIT(s);
24203 } while (isXDIGIT(*s));
24205 while (isBLANK(*s)) { s++; }
24208 s = strchr(s, '\n');
24213 else if (s < e && *s != '\n') {
24214 goto check_if_property;
24218 if (max == -1) { /* The line only had one entry */
24221 else if (max < min) {
24222 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24223 sv_catpvs(msg, "Illegal range in \"");
24224 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24225 UTF8fARG(is_contents_utf8, s - s0, s0));
24226 sv_catpvs(msg, "\"");
24227 goto return_failure;
24230 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
24232 if ( UNICODE_IS_PERL_EXTENDED(min)
24233 || UNICODE_IS_PERL_EXTENDED(max))
24235 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24237 /* If both code points are non-portable, warn only on the lower
24239 sv_catpv(msg, get_extended_utf8_msg(
24240 (UNICODE_IS_PERL_EXTENDED(min))
24242 sv_catpvs(msg, " in \"");
24243 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24244 UTF8fARG(is_contents_utf8, s - s0, s0));
24245 sv_catpvs(msg, "\"");
24250 /* Here, this line contains a legal range */
24251 this_definition = sv_2mortal(_new_invlist(2));
24252 this_definition = _add_range_to_invlist(this_definition, min, max);
24257 /* Here it isn't a legal range line. See if it is a legal property
24258 * line. First find the end of the meat of the line */
24259 s = strpbrk(s, "#\n");
24264 /* Ignore trailing blanks in keeping with the requirements of
24265 * parse_uniprop_string() */
24267 while (s > s0 && isBLANK_A(*s)) {
24272 this_definition = parse_uniprop_string(s0, s - s0,
24273 is_utf8, to_fold, runtime,
24276 user_defined_ptr, msg,
24278 ? level /* Don't increase level
24279 if input is empty */
24282 if (this_definition == NULL) {
24283 goto return_failure; /* 'msg' should have had the reason
24284 appended to it by the above call */
24287 if (! is_invlist(this_definition)) { /* Unknown at this time */
24288 return newSVsv(this_definition);
24292 s = strchr(s, '\n');
24302 _invlist_union(running_definition, this_definition,
24303 &running_definition);
24306 _invlist_subtract(running_definition, this_definition,
24307 &running_definition);
24310 _invlist_intersection(running_definition, this_definition,
24311 &running_definition);
24314 _invlist_union_complement_2nd(running_definition,
24315 this_definition, &running_definition);
24318 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
24319 __FILE__, __LINE__, op);
24323 /* Position past the '\n' */
24325 } /* End of loop through the lines of 'contents' */
24327 /* Here, we processed all the lines in 'contents' without error. If we
24328 * didn't add any warnings, simply return success */
24329 if (msgs_length_on_entry == SvCUR(msg)) {
24331 /* If the expansion was empty, the answer isn't nothing: its an empty
24332 * inversion list */
24333 if (running_definition == NULL) {
24334 running_definition = _new_invlist(1);
24337 return running_definition;
24340 /* Otherwise, add some explanatory text, but we will return success */
24344 running_definition = NULL;
24348 if (name_len > 0) {
24349 sv_catpvs(msg, " in expansion of ");
24350 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24353 return running_definition;
24356 /* As explained below, certain operations need to take place in the first
24357 * thread created. These macros switch contexts */
24358 # ifdef USE_ITHREADS
24359 # define DECLARATION_FOR_GLOBAL_CONTEXT \
24360 PerlInterpreter * save_aTHX = aTHX;
24361 # define SWITCH_TO_GLOBAL_CONTEXT \
24362 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
24363 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
24364 # define CUR_CONTEXT aTHX
24365 # define ORIGINAL_CONTEXT save_aTHX
24367 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
24368 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
24369 # define RESTORE_CONTEXT NOOP
24370 # define CUR_CONTEXT NULL
24371 # define ORIGINAL_CONTEXT NULL
24375 S_delete_recursion_entry(pTHX_ void *key)
24377 /* Deletes the entry used to detect recursion when expanding user-defined
24378 * properties. This is a function so it can be set up to be called even if
24379 * the program unexpectedly quits */
24381 SV ** current_entry;
24382 const STRLEN key_len = strlen((const char *) key);
24383 DECLARATION_FOR_GLOBAL_CONTEXT;
24385 SWITCH_TO_GLOBAL_CONTEXT;
24387 /* If the entry is one of these types, it is a permanent entry, and not the
24388 * one used to detect recursions. This function should delete only the
24389 * recursion entry */
24390 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
24392 && ! is_invlist(*current_entry)
24393 && ! SvPOK(*current_entry))
24395 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
24403 S_get_fq_name(pTHX_
24404 const char * const name, /* The first non-blank in the \p{}, \P{} */
24405 const Size_t name_len, /* Its length in bytes, not including any trailing space */
24406 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24407 const bool has_colon_colon
24410 /* Returns a mortal SV containing the fully qualified version of the input
24415 fq_name = newSVpvs_flags("", SVs_TEMP);
24417 /* Use the current package if it wasn't included in our input */
24418 if (! has_colon_colon) {
24419 const HV * pkg = (IN_PERL_COMPILETIME)
24421 : CopSTASH(PL_curcop);
24422 const char* pkgname = HvNAME(pkg);
24424 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
24425 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
24426 sv_catpvs(fq_name, "::");
24429 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
24430 UTF8fARG(is_utf8, name_len, name));
24435 S_parse_uniprop_string(pTHX_
24437 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
24438 * now. If so, the return is an inversion list.
24440 * If the property is user-defined, it is a subroutine, which in turn
24441 * may call other subroutines. This function will call the whole nest of
24442 * them to get the definition they return; if some aren't known at the time
24443 * of the call to this function, the fully qualified name of the highest
24444 * level sub is returned. It is an error to call this function at runtime
24445 * without every sub defined.
24447 * If an error was found, NULL is returned, and 'msg' gets a suitable
24448 * message appended to it. (Appending allows the back trace of how we got
24449 * to the faulty definition to be displayed through nested calls of
24450 * user-defined subs.)
24452 * The caller should NOT try to free any returned inversion list.
24454 * Other parameters will be set on return as described below */
24456 const char * const name, /* The first non-blank in the \p{}, \P{} */
24457 Size_t name_len, /* Its length in bytes, not including any
24459 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24460 const bool to_fold, /* ? Is this under /i */
24461 const bool runtime, /* TRUE if this is being called at run time */
24462 const bool deferrable, /* TRUE if it's ok for the definition to not be
24463 known at this call */
24464 AV ** strings, /* To return string property values, like named
24466 bool *user_defined_ptr, /* Upon return from this function it will be
24467 set to TRUE if any component is a
24468 user-defined property */
24469 SV * msg, /* Any error or warning msg(s) are appended to
24471 const STRLEN level) /* Recursion level of this call */
24473 char* lookup_name; /* normalized name for lookup in our tables */
24474 unsigned lookup_len; /* Its length */
24475 enum { Not_Strict = 0, /* Some properties have stricter name */
24476 Strict, /* normalization rules, which we decide */
24477 As_Is /* upon based on parsing */
24478 } stricter = Not_Strict;
24480 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
24481 * (though it requires extra effort to download them from Unicode and
24482 * compile perl to know about them) */
24483 bool is_nv_type = FALSE;
24485 unsigned int i, j = 0;
24486 int equals_pos = -1; /* Where the '=' is found, or negative if none */
24487 int slash_pos = -1; /* Where the '/' is found, or negative if none */
24488 int table_index = 0; /* The entry number for this property in the table
24489 of all Unicode property names */
24490 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
24491 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
24492 the normalized name in certain situations */
24493 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
24494 part of a package name */
24495 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
24496 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
24497 property rather than a Unicode
24499 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
24500 if an error. If it is an inversion list,
24501 it is the definition. Otherwise it is a
24502 string containing the fully qualified sub
24504 SV * fq_name = NULL; /* For user-defined properties, the fully
24506 bool invert_return = FALSE; /* ? Do we need to complement the result before
24508 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
24509 explicit utf8:: package that we strip
24511 /* The expansion of properties that could be either user-defined or
24512 * official unicode ones is deferred until runtime, including a marker for
24513 * those that might be in the latter category. This boolean indicates if
24514 * we've seen that marker. If not, what we're parsing can't be such an
24515 * official Unicode property whose expansion was deferred */
24516 bool could_be_deferred_official = FALSE;
24518 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
24520 /* The input will be normalized into 'lookup_name' */
24521 Newx(lookup_name, name_len, char);
24522 SAVEFREEPV(lookup_name);
24524 /* Parse the input. */
24525 for (i = 0; i < name_len; i++) {
24526 char cur = name[i];
24528 /* Most of the characters in the input will be of this ilk, being parts
24530 if (isIDCONT_A(cur)) {
24532 /* Case differences are ignored. Our lookup routine assumes
24533 * everything is lowercase, so normalize to that */
24534 if (isUPPER_A(cur)) {
24535 lookup_name[j++] = toLOWER_A(cur);
24539 if (cur == '_') { /* Don't include these in the normalized name */
24543 lookup_name[j++] = cur;
24545 /* The first character in a user-defined name must be of this type.
24547 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
24548 could_be_user_defined = FALSE;
24554 /* Here, the character is not something typically in a name, But these
24555 * two types of characters (and the '_' above) can be freely ignored in
24556 * most situations. Later it may turn out we shouldn't have ignored
24557 * them, and we have to reparse, but we don't have enough information
24558 * yet to make that decision */
24559 if (cur == '-' || isSPACE_A(cur)) {
24560 could_be_user_defined = FALSE;
24564 /* An equals sign or single colon mark the end of the first part of
24565 * the property name */
24567 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
24569 lookup_name[j++] = '='; /* Treat the colon as an '=' */
24570 equals_pos = j; /* Note where it occurred in the input */
24571 could_be_user_defined = FALSE;
24575 /* If this looks like it is a marker we inserted at compile time,
24576 * set a flag and otherwise ignore it. If it isn't in the final
24577 * position, keep it as it would have been user input. */
24578 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
24580 && could_be_user_defined
24581 && i == name_len - 1)
24584 could_be_deferred_official = TRUE;
24588 /* Otherwise, this character is part of the name. */
24589 lookup_name[j++] = cur;
24591 /* Here it isn't a single colon, so if it is a colon, it must be a
24595 /* A double colon should be a package qualifier. We note its
24596 * position and continue. Note that one could have
24597 * pkg1::pkg2::...::foo
24598 * so that the position at the end of the loop will be just after
24599 * the final qualifier */
24602 non_pkg_begin = i + 1;
24603 lookup_name[j++] = ':';
24604 lun_non_pkg_begin = j;
24606 else { /* Only word chars (and '::') can be in a user-defined name */
24607 could_be_user_defined = FALSE;
24609 } /* End of parsing through the lhs of the property name (or all of it if
24612 /* If there is a single package name 'utf8::', it is ambiguous. It could
24613 * be for a user-defined property, or it could be a Unicode property, as
24614 * all of them are considered to be for that package. For the purposes of
24615 * parsing the rest of the property, strip it off */
24616 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
24617 lookup_name += STRLENs("utf8::");
24618 j -= STRLENs("utf8::");
24619 equals_pos -= STRLENs("utf8::");
24620 stripped_utf8_pkg = TRUE;
24623 /* Here, we are either done with the whole property name, if it was simple;
24624 * or are positioned just after the '=' if it is compound. */
24626 if (equals_pos >= 0) {
24627 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
24629 /* Space immediately after the '=' is ignored */
24631 for (; i < name_len; i++) {
24632 if (! isSPACE_A(name[i])) {
24637 /* Most punctuation after the equals indicates a subpattern, like
24639 if ( isPUNCT_A(name[i])
24644 /* A backslash means the real delimitter is the next character,
24645 * but it must be punctuation */
24646 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
24648 bool special_property = memEQs(lookup_name, j - 1, "name")
24649 || memEQs(lookup_name, j - 1, "na");
24650 if (! special_property) {
24651 /* Find the property. The table includes the equals sign, so
24652 * we use 'j' as-is */
24653 table_index = do_uniprop_match(lookup_name, j);
24655 if (special_property || table_index) {
24656 REGEXP * subpattern_re;
24657 char open = name[i++];
24659 const char * pos_in_brackets;
24660 const char * const * prop_values;
24663 /* Backslash => delimitter is the character following. We
24664 * already checked that it is punctuation */
24665 if (open == '\\') {
24670 /* This data structure is constructed so that the matching
24671 * closing bracket is 3 past its matching opening. The second
24672 * set of closing is so that if the opening is something like
24673 * ']', the closing will be that as well. Something similar is
24674 * done in toke.c */
24675 pos_in_brackets = memCHRs("([<)]>)]>", open);
24676 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
24679 || name[name_len-1] != close
24680 || (escaped && name[name_len-2] != '\\')
24681 /* Also make sure that there are enough characters.
24682 * e.g., '\\\' would show up incorrectly as legal even
24683 * though it is too short */
24684 || (SSize_t) (name_len - i - 1 - escaped) < 0)
24686 sv_catpvs(msg, "Unicode property wildcard not terminated");
24687 goto append_name_to_msg;
24690 Perl_ck_warner_d(aTHX_
24691 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
24692 "The Unicode property wildcards feature is experimental");
24694 if (special_property) {
24695 const char * error_msg;
24696 const char * revised_name = name + i;
24697 Size_t revised_name_len = name_len - (i + 1 + escaped);
24699 /* Currently, the only 'special_property' is name, which we
24700 * lookup in _charnames.pm */
24702 if (! load_charnames(newSVpvs("placeholder"),
24703 revised_name, revised_name_len,
24706 sv_catpv(msg, error_msg);
24707 goto append_name_to_msg;
24710 /* Farm this out to a function just to make the current
24711 * function less unwieldy */
24712 if (handle_names_wildcard(revised_name, revised_name_len,
24716 return prop_definition;
24722 prop_values = get_prop_values(table_index);
24724 /* Now create and compile the wildcard subpattern. Use /i
24725 * because the property values are supposed to match with case
24727 subpattern_re = compile_wildcard(name + i,
24728 name_len - i - 1 - escaped,
24732 /* For each legal property value, see if the supplied pattern
24734 while (*prop_values) {
24735 const char * const entry = *prop_values;
24736 const Size_t len = strlen(entry);
24737 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
24739 if (execute_wildcard(subpattern_re,
24741 (char *) entry + len,
24745 { /* Here, matched. Add to the returned list */
24746 Size_t total_len = j + len;
24747 SV * sub_invlist = NULL;
24748 char * this_string;
24750 /* We know this is a legal \p{property=value}. Call
24751 * the function to return the list of code points that
24753 Newxz(this_string, total_len + 1, char);
24754 Copy(lookup_name, this_string, j, char);
24755 my_strlcat(this_string, entry, total_len + 1);
24756 SAVEFREEPV(this_string);
24757 sub_invlist = parse_uniprop_string(this_string,
24767 _invlist_union(prop_definition, sub_invlist,
24771 prop_values++; /* Next iteration, look at next propvalue */
24772 } /* End of looking through property values; (the data
24773 structure is terminated by a NULL ptr) */
24775 SvREFCNT_dec_NN(subpattern_re);
24777 if (prop_definition) {
24778 return prop_definition;
24781 sv_catpvs(msg, "No Unicode property value wildcard matches:");
24782 goto append_name_to_msg;
24785 /* Here's how khw thinks we should proceed to handle the properties
24786 * not yet done: Bidi Mirroring Glyph can map to ""
24787 Bidi Paired Bracket can map to ""
24788 Case Folding (both full and simple)
24789 Shouldn't /i be good enough for Full
24790 Decomposition Mapping
24791 Equivalent Unified Ideograph can map to ""
24792 Lowercase Mapping (both full and simple)
24793 NFKC Case Fold can map to ""
24794 Titlecase Mapping (both full and simple)
24795 Uppercase Mapping (both full and simple)
24796 * Handle these the same way Name is done, using say, _wild.pm, but
24797 * having both loose and full, like in charclass_invlists.h.
24798 * Perhaps move block and script to that as they are somewhat large
24799 * in charclass_invlists.h.
24800 * For properties where the default is the code point itself, such
24801 * as any of the case changing mappings, the string would otherwise
24802 * consist of all Unicode code points in UTF-8 strung together.
24803 * This would be impractical. So instead, examine their compiled
24804 * pattern, looking at the ssc. If none, reject the pattern as an
24805 * error. Otherwise run the pattern against every code point in
24806 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
24807 * And it might be good to create an API to return the ssc.
24808 * Or handle them like the algorithmic names are done
24810 } /* End of is a wildcard subppattern */
24812 /* \p{name=...} is handled specially. Instead of using the normal
24813 * mechanism involving charclass_invlists.h, it uses _charnames.pm
24814 * which has the necessary (huge) data accessible to it, and which
24815 * doesn't get loaded unless necessary. The legal syntax for names is
24816 * somewhat different than other properties due both to the vagaries of
24817 * a few outlier official names, and the fact that only a few ASCII
24818 * characters are permitted in them */
24819 if ( memEQs(lookup_name, j - 1, "name")
24820 || memEQs(lookup_name, j - 1, "na"))
24825 const char * error_msg;
24827 SV * character_name;
24828 STRLEN character_len;
24833 /* Since the RHS (after skipping initial space) is passed unchanged
24834 * to charnames, and there are different criteria for what are
24835 * legal characters in the name, just parse it here. A character
24836 * name must begin with an ASCII alphabetic */
24837 if (! isALPHA(name[i])) {
24840 lookup_name[j++] = name[i];
24842 for (++i; i < name_len; i++) {
24843 /* Official names can only be in the ASCII range, and only
24844 * certain characters */
24845 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24848 lookup_name[j++] = name[i];
24851 /* Finished parsing, save the name into an SV */
24852 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24854 /* Make sure _charnames is loaded. (The parameters give context
24855 * for any errors generated */
24856 table = load_charnames(character_name, name, name_len, &error_msg);
24857 if (table == NULL) {
24858 sv_catpv(msg, error_msg);
24859 goto append_name_to_msg;
24862 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24863 if (! lookup_loose) {
24865 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24868 PUSHSTACKi(PERLSI_REGCOMP);
24874 XPUSHs(character_name);
24876 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24881 SvREFCNT_inc_simple_void_NN(character);
24888 if (! SvOK(character)) {
24892 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24893 if (character_len == SvCUR(character)) {
24894 prop_definition = add_cp_to_invlist(NULL, cp);
24899 /* First of the remaining characters in the string. */
24900 char * remaining = SvPVX(character) + character_len;
24902 if (strings == NULL) {
24903 goto failed; /* XXX Perhaps a specific msg instead, like
24904 'not available here' */
24907 if (*strings == NULL) {
24908 *strings = newAV();
24911 this_string = newAV();
24912 av_push(this_string, newSVuv(cp));
24915 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24916 av_push(this_string, newSVuv(cp));
24917 remaining += character_len;
24918 } while (remaining < SvEND(character));
24920 av_push(*strings, (SV *) this_string);
24923 return prop_definition;
24926 /* Certain properties whose values are numeric need special handling.
24927 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24928 * purposes of checking if this is one of those properties */
24929 if (memBEGINPs(lookup_name, j, "is")) {
24933 /* Then check if it is one of these specially-handled properties. The
24934 * possibilities are hard-coded because easier this way, and the list
24935 * is unlikely to change.
24937 * All numeric value type properties are of this ilk, and are also
24938 * special in a different way later on. So find those first. There
24939 * are several numeric value type properties in the Unihan DB (which is
24940 * unlikely to be compiled with perl, but we handle it here in case it
24941 * does get compiled). They all end with 'numeric'. The interiors
24942 * aren't checked for the precise property. This would stop working if
24943 * a cjk property were to be created that ended with 'numeric' and
24944 * wasn't a numeric type */
24945 is_nv_type = memEQs(lookup_name + lookup_offset,
24946 j - 1 - lookup_offset, "numericvalue")
24947 || memEQs(lookup_name + lookup_offset,
24948 j - 1 - lookup_offset, "nv")
24949 || ( memENDPs(lookup_name + lookup_offset,
24950 j - 1 - lookup_offset, "numeric")
24951 && ( memBEGINPs(lookup_name + lookup_offset,
24952 j - 1 - lookup_offset, "cjk")
24953 || memBEGINPs(lookup_name + lookup_offset,
24954 j - 1 - lookup_offset, "k")));
24956 || memEQs(lookup_name + lookup_offset,
24957 j - 1 - lookup_offset, "canonicalcombiningclass")
24958 || memEQs(lookup_name + lookup_offset,
24959 j - 1 - lookup_offset, "ccc")
24960 || memEQs(lookup_name + lookup_offset,
24961 j - 1 - lookup_offset, "age")
24962 || memEQs(lookup_name + lookup_offset,
24963 j - 1 - lookup_offset, "in")
24964 || memEQs(lookup_name + lookup_offset,
24965 j - 1 - lookup_offset, "presentin"))
24969 /* Since the stuff after the '=' is a number, we can't throw away
24970 * '-' willy-nilly, as those could be a minus sign. Other stricter
24971 * rules also apply. However, these properties all can have the
24972 * rhs not be a number, in which case they contain at least one
24973 * alphabetic. In those cases, the stricter rules don't apply.
24974 * But the numeric type properties can have the alphas [Ee] to
24975 * signify an exponent, and it is still a number with stricter
24976 * rules. So look for an alpha that signifies not-strict */
24978 for (k = i; k < name_len; k++) {
24979 if ( isALPHA_A(name[k])
24980 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24982 stricter = Not_Strict;
24990 /* A number may have a leading '+' or '-'. The latter is retained
24992 if (name[i] == '+') {
24995 else if (name[i] == '-') {
24996 lookup_name[j++] = '-';
25000 /* Skip leading zeros including single underscores separating the
25001 * zeros, or between the final leading zero and the first other
25003 for (; i < name_len - 1; i++) {
25004 if ( name[i] != '0'
25005 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
25011 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
25012 * underling libc implementation. */
25013 if ( i == name_len - 1
25014 && name[name_len-1] == '0'
25015 && lookup_name[j-1] == '-')
25021 else { /* No '=' */
25023 /* Only a few properties without an '=' should be parsed with stricter
25024 * rules. The list is unlikely to change. */
25025 if ( memBEGINPs(lookup_name, j, "perl")
25026 && memNEs(lookup_name + 4, j - 4, "space")
25027 && memNEs(lookup_name + 4, j - 4, "word"))
25031 /* We set the inputs back to 0 and the code below will reparse,
25037 /* Here, we have either finished the property, or are positioned to parse
25038 * the remainder, and we know if stricter rules apply. Finish out, if not
25040 for (; i < name_len; i++) {
25041 char cur = name[i];
25043 /* In all instances, case differences are ignored, and we normalize to
25045 if (isUPPER_A(cur)) {
25046 lookup_name[j++] = toLOWER(cur);
25050 /* An underscore is skipped, but not under strict rules unless it
25051 * separates two digits */
25054 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
25055 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
25057 lookup_name[j++] = '_';
25062 /* Hyphens are skipped except under strict */
25063 if (cur == '-' && ! stricter) {
25067 /* XXX Bug in documentation. It says white space skipped adjacent to
25068 * non-word char. Maybe we should, but shouldn't skip it next to a dot
25070 if (isSPACE_A(cur) && ! stricter) {
25074 lookup_name[j++] = cur;
25076 /* Unless this is a non-trailing slash, we are done with it */
25077 if (i >= name_len - 1 || cur != '/') {
25083 /* A slash in the 'numeric value' property indicates that what follows
25084 * is a denominator. It can have a leading '+' and '0's that should be
25085 * skipped. But we have never allowed a negative denominator, so treat
25086 * a minus like every other character. (No need to rule out a second
25087 * '/', as that won't match anything anyway */
25090 if (i < name_len && name[i] == '+') {
25094 /* Skip leading zeros including underscores separating digits */
25095 for (; i < name_len - 1; i++) {
25096 if ( name[i] != '0'
25097 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
25103 /* Store the first real character in the denominator */
25104 if (i < name_len) {
25105 lookup_name[j++] = name[i];
25110 /* Here are completely done parsing the input 'name', and 'lookup_name'
25111 * contains a copy, normalized.
25113 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
25114 * different from without the underscores. */
25115 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
25116 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
25117 && UNLIKELY(name[name_len-1] == '_'))
25119 lookup_name[j++] = '&';
25122 /* If the original input began with 'In' or 'Is', it could be a subroutine
25123 * call to a user-defined property instead of a Unicode property name. */
25124 if ( name_len - non_pkg_begin > 2
25125 && name[non_pkg_begin+0] == 'I'
25126 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
25128 /* Names that start with In have different characterstics than those
25129 * that start with Is */
25130 if (name[non_pkg_begin+1] == 's') {
25131 starts_with_Is = TRUE;
25135 could_be_user_defined = FALSE;
25138 if (could_be_user_defined) {
25141 /* If the user defined property returns the empty string, it could
25142 * easily be because the pattern is being compiled before the data it
25143 * actually needs to compile is available. This could be argued to be
25144 * a bug in the perl code, but this is a change of behavior for Perl,
25145 * so we handle it. This means that intentionally returning nothing
25146 * will not be resolved until runtime */
25147 bool empty_return = FALSE;
25149 /* Here, the name could be for a user defined property, which are
25150 * implemented as subs. */
25151 user_sub = get_cvn_flags(name, name_len, 0);
25154 /* Here, the property name could be a user-defined one, but there
25155 * is no subroutine to handle it (as of now). Defer handling it
25156 * until runtime. Otherwise, a block defined by Unicode in a later
25157 * release would get the synonym InFoo added for it, and existing
25158 * code that used that name would suddenly break if it referred to
25159 * the property before the sub was declared. See [perl #134146] */
25161 goto definition_deferred;
25164 /* Here, we are at runtime, and didn't find the user property. It
25165 * could be an official property, but only if no package was
25166 * specified, or just the utf8:: package. */
25167 if (could_be_deferred_official) {
25168 lookup_name += lun_non_pkg_begin;
25169 j -= lun_non_pkg_begin;
25171 else if (! stripped_utf8_pkg) {
25172 goto unknown_user_defined;
25175 /* Drop down to look up in the official properties */
25178 const char insecure[] = "Insecure user-defined property";
25180 /* Here, there is a sub by the correct name. Normally we call it
25181 * to get the property definition */
25183 SV * user_sub_sv = MUTABLE_SV(user_sub);
25184 SV * error; /* Any error returned by calling 'user_sub' */
25185 SV * key; /* The key into the hash of user defined sub names
25188 SV ** saved_user_prop_ptr; /* Hash entry for this property */
25190 /* How many times to retry when another thread is in the middle of
25191 * expanding the same definition we want */
25192 PERL_INT_FAST8_T retry_countdown = 10;
25194 DECLARATION_FOR_GLOBAL_CONTEXT;
25196 /* If we get here, we know this property is user-defined */
25197 *user_defined_ptr = TRUE;
25199 /* We refuse to call a potentially tainted subroutine; returning an
25202 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25203 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
25204 goto append_name_to_msg;
25207 /* In principal, we only call each subroutine property definition
25208 * once during the life of the program. This guarantees that the
25209 * property definition never changes. The results of the single
25210 * sub call are stored in a hash, which is used instead for future
25211 * references to this property. The property definition is thus
25212 * immutable. But, to allow the user to have a /i-dependent
25213 * definition, we call the sub once for non-/i, and once for /i,
25214 * should the need arise, passing the /i status as a parameter.
25216 * We start by constructing the hash key name, consisting of the
25217 * fully qualified subroutine name, preceded by the /i status, so
25218 * that there is a key for /i and a different key for non-/i */
25219 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
25220 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25221 non_pkg_begin != 0);
25222 sv_catsv(key, fq_name);
25224 /* We only call the sub once throughout the life of the program
25225 * (with the /i, non-/i exception noted above). That means the
25226 * hash must be global and accessible to all threads. It is
25227 * created at program start-up, before any threads are created, so
25228 * is accessible to all children. But this creates some
25231 * 1) The keys can't be shared, or else problems arise; sharing is
25232 * turned off at hash creation time
25233 * 2) All SVs in it are there for the remainder of the life of the
25234 * program, and must be created in the same interpreter context
25235 * as the hash, or else they will be freed from the wrong pool
25236 * at global destruction time. This is handled by switching to
25237 * the hash's context to create each SV going into it, and then
25238 * immediately switching back
25239 * 3) All accesses to the hash must be controlled by a mutex, to
25240 * prevent two threads from getting an unstable state should
25241 * they simultaneously be accessing it. The code below is
25242 * crafted so that the mutex is locked whenever there is an
25243 * access and unlocked only when the next stable state is
25246 * The hash stores either the definition of the property if it was
25247 * valid, or, if invalid, the error message that was raised. We
25248 * use the type of SV to distinguish.
25250 * There's also the need to guard against the definition expansion
25251 * from infinitely recursing. This is handled by storing the aTHX
25252 * of the expanding thread during the expansion. Again the SV type
25253 * is used to distinguish this from the other two cases. If we
25254 * come to here and the hash entry for this property is our aTHX,
25255 * it means we have recursed, and the code assumes that we would
25256 * infinitely recurse, so instead stops and raises an error.
25257 * (Any recursion has always been treated as infinite recursion in
25260 * If instead, the entry is for a different aTHX, it means that
25261 * that thread has gotten here first, and hasn't finished expanding
25262 * the definition yet. We just have to wait until it is done. We
25263 * sleep and retry a few times, returning an error if the other
25264 * thread doesn't complete. */
25267 USER_PROP_MUTEX_LOCK;
25269 /* If we have an entry for this key, the subroutine has already
25270 * been called once with this /i status. */
25271 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
25272 SvPVX(key), SvCUR(key), 0);
25273 if (saved_user_prop_ptr) {
25275 /* If the saved result is an inversion list, it is the valid
25276 * definition of this property */
25277 if (is_invlist(*saved_user_prop_ptr)) {
25278 prop_definition = *saved_user_prop_ptr;
25280 /* The SV in the hash won't be removed until global
25281 * destruction, so it is stable and we can unlock */
25282 USER_PROP_MUTEX_UNLOCK;
25284 /* The caller shouldn't try to free this SV */
25285 return prop_definition;
25288 /* Otherwise, if it is a string, it is the error message
25289 * that was returned when we first tried to evaluate this
25290 * property. Fail, and append the message */
25291 if (SvPOK(*saved_user_prop_ptr)) {
25292 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25293 sv_catsv(msg, *saved_user_prop_ptr);
25295 /* The SV in the hash won't be removed until global
25296 * destruction, so it is stable and we can unlock */
25297 USER_PROP_MUTEX_UNLOCK;
25302 assert(SvIOK(*saved_user_prop_ptr));
25304 /* Here, we have an unstable entry in the hash. Either another
25305 * thread is in the middle of expanding the property's
25306 * definition, or we are ourselves recursing. We use the aTHX
25307 * in it to distinguish */
25308 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
25310 /* Here, it's another thread doing the expanding. We've
25311 * looked as much as we are going to at the contents of the
25312 * hash entry. It's safe to unlock. */
25313 USER_PROP_MUTEX_UNLOCK;
25315 /* Retry a few times */
25316 if (retry_countdown-- > 0) {
25321 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25322 sv_catpvs(msg, "Timeout waiting for another thread to "
25324 goto append_name_to_msg;
25327 /* Here, we are recursing; don't dig any deeper */
25328 USER_PROP_MUTEX_UNLOCK;
25330 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25332 "Infinite recursion in user-defined property");
25333 goto append_name_to_msg;
25336 /* Here, this thread has exclusive control, and there is no entry
25337 * for this property in the hash. So we have the go ahead to
25338 * expand the definition ourselves. */
25340 PUSHSTACKi(PERLSI_REGCOMP);
25343 /* Create a temporary placeholder in the hash to detect recursion
25345 SWITCH_TO_GLOBAL_CONTEXT;
25346 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
25347 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
25350 /* Now that we have a placeholder, we can let other threads
25352 USER_PROP_MUTEX_UNLOCK;
25354 /* Make sure the placeholder always gets destroyed */
25355 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
25360 /* Call the user's function, with the /i status as a parameter.
25361 * Note that we have gone to a lot of trouble to keep this call
25362 * from being within the locked mutex region. */
25363 XPUSHs(boolSV(to_fold));
25366 /* The following block was taken from swash_init(). Presumably
25367 * they apply to here as well, though we no longer use a swash --
25371 /* We might get here via a subroutine signature which uses a utf8
25372 * parameter name, at which point PL_subname will have been set
25373 * but not yet used. */
25374 save_item(PL_subname);
25376 /* G_SCALAR guarantees a single return value */
25377 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
25382 if (TAINT_get || SvTRUE(error)) {
25383 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25384 if (SvTRUE(error)) {
25385 sv_catpvs(msg, "Error \"");
25386 sv_catsv(msg, error);
25387 sv_catpvs(msg, "\"");
25390 if (SvTRUE(error)) sv_catpvs(msg, "; ");
25391 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
25394 if (name_len > 0) {
25395 sv_catpvs(msg, " in expansion of ");
25396 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
25402 prop_definition = NULL;
25405 SV * contents = POPs;
25407 /* The contents is supposed to be the expansion of the property
25408 * definition. If the definition is deferrable, and we got an
25409 * empty string back, set a flag to later defer it (after clean
25412 && (! SvPOK(contents) || SvCUR(contents) == 0))
25414 empty_return = TRUE;
25416 else { /* Otherwise, call a function to check for valid syntax,
25419 prop_definition = handle_user_defined_property(
25421 is_utf8, to_fold, runtime,
25423 contents, user_defined_ptr,
25429 /* Here, we have the results of the expansion. Delete the
25430 * placeholder, and if the definition is now known, replace it with
25431 * that definition. We need exclusive access to the hash, and we
25432 * can't let anyone else in, between when we delete the placeholder
25433 * and add the permanent entry */
25434 USER_PROP_MUTEX_LOCK;
25436 S_delete_recursion_entry(aTHX_ SvPVX(key));
25438 if ( ! empty_return
25439 && (! prop_definition || is_invlist(prop_definition)))
25441 /* If we got success we use the inversion list defining the
25442 * property; otherwise use the error message */
25443 SWITCH_TO_GLOBAL_CONTEXT;
25444 (void) hv_store_ent(PL_user_def_props,
25447 ? newSVsv(prop_definition)
25453 /* All done, and the hash now has a permanent entry for this
25454 * property. Give up exclusive control */
25455 USER_PROP_MUTEX_UNLOCK;
25461 if (empty_return) {
25462 goto definition_deferred;
25465 if (prop_definition) {
25467 /* If the definition is for something not known at this time,
25468 * we toss it, and go return the main property name, as that's
25469 * the one the user will be aware of */
25470 if (! is_invlist(prop_definition)) {
25471 SvREFCNT_dec_NN(prop_definition);
25472 goto definition_deferred;
25475 sv_2mortal(prop_definition);
25479 return prop_definition;
25481 } /* End of calling the subroutine for the user-defined property */
25482 } /* End of it could be a user-defined property */
25484 /* Here it wasn't a user-defined property that is known at this time. See
25485 * if it is a Unicode property */
25487 lookup_len = j; /* This is a more mnemonic name than 'j' */
25489 /* Get the index into our pointer table of the inversion list corresponding
25490 * to the property */
25491 table_index = do_uniprop_match(lookup_name, lookup_len);
25493 /* If it didn't find the property ... */
25494 if (table_index == 0) {
25496 /* Try again stripping off any initial 'Is'. This is because we
25497 * promise that an initial Is is optional. The same isn't true of
25498 * names that start with 'In'. Those can match only blocks, and the
25499 * lookup table already has those accounted for. The lookup table also
25500 * has already accounted for Perl extensions (without and = sign)
25501 * starting with 'i's'. */
25502 if (starts_with_Is && equals_pos >= 0) {
25508 table_index = do_uniprop_match(lookup_name, lookup_len);
25511 if (table_index == 0) {
25514 /* Here, we didn't find it. If not a numeric type property, and
25515 * can't be a user-defined one, it isn't a legal property */
25516 if (! is_nv_type) {
25517 if (! could_be_user_defined) {
25521 /* Here, the property name is legal as a user-defined one. At
25522 * compile time, it might just be that the subroutine for that
25523 * property hasn't been encountered yet, but at runtime, it's
25524 * an error to try to use an undefined one */
25525 if (! deferrable) {
25526 goto unknown_user_defined;;
25529 goto definition_deferred;
25530 } /* End of isn't a numeric type property */
25532 /* The numeric type properties need more work to decide. What we
25533 * do is make sure we have the number in canonical form and look
25536 if (slash_pos < 0) { /* No slash */
25538 /* When it isn't a rational, take the input, convert it to a
25539 * NV, then create a canonical string representation of that
25543 SSize_t value_len = lookup_len - equals_pos;
25545 /* Get the value */
25546 if ( value_len <= 0
25547 || my_atof3(lookup_name + equals_pos, &value,
25549 != lookup_name + lookup_len)
25554 /* If the value is an integer, the canonical value is integral
25556 if (Perl_ceil(value) == value) {
25557 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
25558 equals_pos, lookup_name, value);
25560 else { /* Otherwise, it is %e with a known precision */
25563 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
25564 equals_pos, lookup_name,
25565 PL_E_FORMAT_PRECISION, value);
25567 /* The exponent generated is expecting two digits, whereas
25568 * %e on some systems will generate three. Remove leading
25569 * zeros in excess of 2 from the exponent. We start
25570 * looking for them after the '=' */
25571 exp_ptr = strchr(canonical + equals_pos, 'e');
25573 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
25574 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
25576 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
25578 if (excess_exponent_len > 0) {
25579 SSize_t leading_zeros = strspn(cur_ptr, "0");
25580 SSize_t excess_leading_zeros
25581 = MIN(leading_zeros, excess_exponent_len);
25582 if (excess_leading_zeros > 0) {
25583 Move(cur_ptr + excess_leading_zeros,
25585 strlen(cur_ptr) - excess_leading_zeros
25586 + 1, /* Copy the NUL as well */
25593 else { /* Has a slash. Create a rational in canonical form */
25594 UV numerator, denominator, gcd, trial;
25595 const char * end_ptr;
25596 const char * sign = "";
25598 /* We can't just find the numerator, denominator, and do the
25599 * division, then use the method above, because that is
25600 * inexact. And the input could be a rational that is within
25601 * epsilon (given our precision) of a valid rational, and would
25602 * then incorrectly compare valid.
25604 * We're only interested in the part after the '=' */
25605 const char * this_lookup_name = lookup_name + equals_pos;
25606 lookup_len -= equals_pos;
25607 slash_pos -= equals_pos;
25609 /* Handle any leading minus */
25610 if (this_lookup_name[0] == '-') {
25612 this_lookup_name++;
25617 /* Convert the numerator to numeric */
25618 end_ptr = this_lookup_name + slash_pos;
25619 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
25623 /* It better have included all characters before the slash */
25624 if (*end_ptr != '/') {
25628 /* Set to look at just the denominator */
25629 this_lookup_name += slash_pos;
25630 lookup_len -= slash_pos;
25631 end_ptr = this_lookup_name + lookup_len;
25633 /* Convert the denominator to numeric */
25634 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
25638 /* It better be the rest of the characters, and don't divide by
25640 if ( end_ptr != this_lookup_name + lookup_len
25641 || denominator == 0)
25646 /* Get the greatest common denominator using
25647 http://en.wikipedia.org/wiki/Euclidean_algorithm */
25649 trial = denominator;
25650 while (trial != 0) {
25652 trial = gcd % trial;
25656 /* If already in lowest possible terms, we have already tried
25657 * looking this up */
25662 /* Reduce the rational, which should put it in canonical form
25665 denominator /= gcd;
25667 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
25668 equals_pos, lookup_name, sign, numerator, denominator);
25671 /* Here, we have the number in canonical form. Try that */
25672 table_index = do_uniprop_match(canonical, strlen(canonical));
25673 if (table_index == 0) {
25676 } /* End of still didn't find the property in our table */
25677 } /* End of didn't find the property in our table */
25679 /* Here, we have a non-zero return, which is an index into a table of ptrs.
25680 * A negative return signifies that the real index is the absolute value,
25681 * but the result needs to be inverted */
25682 if (table_index < 0) {
25683 invert_return = TRUE;
25684 table_index = -table_index;
25687 /* Out-of band indices indicate a deprecated property. The proper index is
25688 * modulo it with the table size. And dividing by the table size yields
25689 * an offset into a table constructed by regen/mk_invlists.pl to contain
25690 * the corresponding warning message */
25691 if (table_index > MAX_UNI_KEYWORD_INDEX) {
25692 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
25693 table_index %= MAX_UNI_KEYWORD_INDEX;
25694 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
25695 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
25696 (int) name_len, name,
25697 get_deprecated_property_msg(warning_offset));
25700 /* In a few properties, a different property is used under /i. These are
25701 * unlikely to change, so are hard-coded here. */
25703 if ( table_index == UNI_XPOSIXUPPER
25704 || table_index == UNI_XPOSIXLOWER
25705 || table_index == UNI_TITLE)
25707 table_index = UNI_CASED;
25709 else if ( table_index == UNI_UPPERCASELETTER
25710 || table_index == UNI_LOWERCASELETTER
25711 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
25712 || table_index == UNI_TITLECASELETTER
25715 table_index = UNI_CASEDLETTER;
25717 else if ( table_index == UNI_POSIXUPPER
25718 || table_index == UNI_POSIXLOWER)
25720 table_index = UNI_POSIXALPHA;
25724 /* Create and return the inversion list */
25725 prop_definition = get_prop_definition(table_index);
25726 sv_2mortal(prop_definition);
25728 /* See if there is a private use override to add to this definition */
25730 COPHH * hinthash = (IN_PERL_COMPILETIME)
25731 ? CopHINTHASH_get(&PL_compiling)
25732 : CopHINTHASH_get(PL_curcop);
25733 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
25735 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
25737 /* See if there is an element in the hints hash for this table */
25738 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
25739 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
25743 SV * pu_definition;
25745 SV * expanded_prop_definition =
25746 sv_2mortal(invlist_clone(prop_definition, NULL));
25748 /* If so, it's definition is the string from here to the next
25749 * \a character. And its format is the same as a user-defined
25751 pos += SvCUR(pu_lookup);
25752 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
25753 pu_invlist = handle_user_defined_property(lookup_name,
25756 0, /* Not folded */
25764 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25765 sv_catpvs(msg, "Insecure private-use override");
25766 goto append_name_to_msg;
25769 /* For now, as a safety measure, make sure that it doesn't
25770 * override non-private use code points */
25771 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
25773 /* Add it to the list to be returned */
25774 _invlist_union(prop_definition, pu_invlist,
25775 &expanded_prop_definition);
25776 prop_definition = expanded_prop_definition;
25777 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
25782 if (invert_return) {
25783 _invlist_invert(prop_definition);
25785 return prop_definition;
25787 unknown_user_defined:
25788 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25789 sv_catpvs(msg, "Unknown user-defined property name");
25790 goto append_name_to_msg;
25793 if (non_pkg_begin != 0) {
25794 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25795 sv_catpvs(msg, "Illegal user-defined property name");
25798 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25799 sv_catpvs(msg, "Can't find Unicode property definition");
25803 append_name_to_msg:
25805 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
25806 const char * suffix = (runtime && level == 0) ? "}" : "\"";
25808 sv_catpv(msg, prefix);
25809 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
25810 sv_catpv(msg, suffix);
25815 definition_deferred:
25818 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
25820 /* Here it could yet to be defined, so defer evaluation of this until
25821 * its needed at runtime. We need the fully qualified property name to
25822 * avoid ambiguity */
25824 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25828 /* If it didn't come with a package, or the package is utf8::, this
25829 * actually could be an official Unicode property whose inclusion we
25830 * are deferring until runtime to make sure that it isn't overridden by
25831 * a user-defined property of the same name (which we haven't
25832 * encountered yet). Add a marker to indicate this possibility, for
25833 * use at such time when we first need the definition during pattern
25834 * matching execution */
25835 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25836 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25839 /* We also need a trailing newline */
25840 sv_catpvs(fq_name, "\n");
25842 *user_defined_ptr = TRUE;
25848 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25849 const STRLEN wname_len, /* Its length */
25850 SV ** prop_definition,
25853 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25854 * any matches, adding them to prop_definition */
25858 CV * get_names_info; /* entry to charnames.pm to get info we need */
25859 SV * names_string; /* Contains all character names, except algo */
25860 SV * algorithmic_names; /* Contains info about algorithmically
25861 generated character names */
25862 REGEXP * subpattern_re; /* The user's pattern to match with */
25863 struct regexp * prog; /* The compiled pattern */
25864 char * all_names_start; /* lib/unicore/Name.pl string of every
25865 (non-algorithmic) character name */
25866 char * cur_pos; /* We match, effectively using /gc; this is
25867 where we are now */
25868 bool found_matches = FALSE; /* Did any name match so far? */
25869 SV * empty; /* For matching zero length names */
25870 SV * must_sv; /* Contains the substring, if any, that must be
25871 in a name for the subpattern to match */
25872 const char * must; /* The PV of 'must' */
25873 STRLEN must_len; /* And its length */
25874 SV * syllable_name = NULL; /* For Hangul syllables */
25875 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25876 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25878 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25879 * syllable name, and these are immutable and guaranteed by the Unicode
25880 * standard to never be extended */
25881 const STRLEN syl_max_len = hangul_prefix_len + 7;
25885 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25887 /* Make sure _charnames is loaded. (The parameters give context
25888 * for any errors generated */
25889 get_names_info = get_cv("_charnames::_get_names_info", 0);
25890 if (! get_names_info) {
25891 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25894 /* Get the charnames data */
25895 PUSHSTACKi(PERLSI_REGCOMP);
25903 /* Special _charnames entry point that returns the info this routine
25905 call_sv(MUTABLE_SV(get_names_info), G_LIST);
25909 /* Data structure for names which end in their very own code points */
25910 algorithmic_names = POPs;
25911 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25913 /* The lib/unicore/Name.pl string */
25914 names_string = POPs;
25915 SvREFCNT_inc_simple_void_NN(names_string);
25922 if ( ! SvROK(names_string)
25923 || ! SvROK(algorithmic_names))
25924 { /* Perhaps should panic instead XXX */
25925 SvREFCNT_dec(names_string);
25926 SvREFCNT_dec(algorithmic_names);
25930 names_string = sv_2mortal(SvRV(names_string));
25931 all_names_start = SvPVX(names_string);
25932 cur_pos = all_names_start;
25934 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25936 /* Compile the subpattern consisting of the name being looked for */
25937 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25939 must_sv = re_intuit_string(subpattern_re);
25941 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25942 must_sv = sv_2mortal(newSVsv(must_sv));
25943 must = SvPV(must_sv, must_len);
25950 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25951 * This works because the NUL causes the function to return early, thus
25952 * showing that there are characters in it other than the acceptable ones,
25953 * which is our desired result.) */
25955 prog = ReANY(subpattern_re);
25957 /* If only nothing is matched, skip to where empty names are looked for */
25958 if (prog->maxlen == 0) {
25962 /* And match against the string of all names /gc. Don't even try if it
25963 * must match a character not found in any name. */
25964 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25966 while (execute_wildcard(subpattern_re,
25968 SvEND(names_string),
25969 all_names_start, 0,
25972 { /* Here, matched. */
25974 /* Note the string entries look like
25975 * 00001\nSTART OF HEADING\n\n
25976 * so we could match anywhere in that string. We have to rule out
25977 * matching a code point line */
25978 char * this_name_start = all_names_start
25979 + RX_OFFS(subpattern_re)->start;
25980 char * this_name_end = all_names_start
25981 + RX_OFFS(subpattern_re)->end;
25984 UV cp = 0; /* Silences some compilers */
25985 AV * this_string = NULL;
25986 bool is_multi = FALSE;
25988 /* If matched nothing, advance to next possible match */
25989 if (this_name_start == this_name_end) {
25990 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25991 SvEND(names_string) - this_name_end);
25992 if (cur_pos == NULL) {
25997 /* Position the next match to start beyond the current returned
25999 cur_pos = (char *) memchr(this_name_end, '\n',
26000 SvEND(names_string) - this_name_end);
26003 /* Back up to the \n just before the beginning of the character. */
26004 cp_end = (char *) my_memrchr(all_names_start,
26006 this_name_start - all_names_start);
26008 /* If we didn't find a \n, it means it matched somewhere in the
26009 * initial '00000' in the string, so isn't a real match */
26010 if (cp_end == NULL) {
26014 this_name_start = cp_end + 1; /* The name starts just after */
26015 cp_end--; /* the \n, and the code point */
26016 /* ends just before it */
26018 /* All code points are 5 digits long */
26019 cp_start = cp_end - 4;
26021 /* This shouldn't happen, as we found a \n, and the first \n is
26022 * further along than what we subtracted */
26023 assert(cp_start >= all_names_start);
26025 if (cp_start == all_names_start) {
26026 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
26030 /* If the character is a blank, we either have a named sequence, or
26031 * something is wrong */
26032 if (*(cp_start - 1) == ' ') {
26033 cp_start = (char *) my_memrchr(all_names_start,
26035 cp_start - all_names_start);
26039 assert(cp_start != NULL && cp_start >= all_names_start + 2);
26041 /* Except for the first line in the string, the sequence before the
26042 * code point is \n\n. If that isn't the case here, we didn't
26043 * match the name of a character. (We could have matched a named
26044 * sequence, not currently handled */
26045 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
26049 /* We matched! Add this to the list */
26050 found_matches = TRUE;
26052 /* Loop through all the code points in the sequence */
26053 while (cp_start < cp_end) {
26055 /* Calculate this code point from its 5 digits */
26056 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
26057 + (XDIGIT_VALUE(cp_start[1]) << 12)
26058 + (XDIGIT_VALUE(cp_start[2]) << 8)
26059 + (XDIGIT_VALUE(cp_start[3]) << 4)
26060 + XDIGIT_VALUE(cp_start[4]);
26062 cp_start += 6; /* Go past any blank */
26064 if (cp_start < cp_end || is_multi) {
26065 if (this_string == NULL) {
26066 this_string = newAV();
26070 av_push(this_string, newSVuv(cp));
26074 if (is_multi) { /* Was more than one code point */
26075 if (*strings == NULL) {
26076 *strings = newAV();
26079 av_push(*strings, (SV *) this_string);
26081 else { /* Only a single code point */
26082 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
26084 } /* End of loop through the non-algorithmic names string */
26087 /* There are also character names not in 'names_string'. These are
26088 * algorithmically generatable. Try this pattern on each possible one.
26089 * (khw originally planned to leave this out given the large number of
26090 * matches attempted; but the speed turned out to be quite acceptable
26092 * There are plenty of opportunities to optimize to skip many of the tests.
26093 * beyond the rudimentary ones already here */
26095 /* First see if the subpattern matches any of the algorithmic generatable
26096 * Hangul syllable names.
26098 * We know none of these syllable names will match if the input pattern
26099 * requires more bytes than any syllable has, or if the input pattern only
26100 * matches an empty name, or if the pattern has something it must match and
26101 * one of the characters in that isn't in any Hangul syllable. */
26102 if ( prog->minlen <= (SSize_t) syl_max_len
26103 && prog->maxlen > 0
26104 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
26106 /* These constants, names, values, and algorithm are adapted from the
26107 * Unicode standard, version 5.1, section 3.12, and should never
26109 const char * JamoL[] = {
26110 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
26111 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
26113 const int LCount = C_ARRAY_LENGTH(JamoL);
26115 const char * JamoV[] = {
26116 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
26117 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
26120 const int VCount = C_ARRAY_LENGTH(JamoV);
26122 const char * JamoT[] = {
26123 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
26124 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
26125 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
26127 const int TCount = C_ARRAY_LENGTH(JamoT);
26131 /* This is the initial Hangul syllable code point; each time through the
26132 * inner loop, it maps to the next higher code point. For more info,
26133 * see the Hangul syllable section of the Unicode standard. */
26136 syllable_name = sv_2mortal(newSV(syl_max_len));
26137 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
26139 for (L = 0; L < LCount; L++) {
26140 for (V = 0; V < VCount; V++) {
26141 for (T = 0; T < TCount; T++) {
26143 /* Truncate back to the prefix, which is unvarying */
26144 SvCUR_set(syllable_name, hangul_prefix_len);
26146 sv_catpv(syllable_name, JamoL[L]);
26147 sv_catpv(syllable_name, JamoV[V]);
26148 sv_catpv(syllable_name, JamoT[T]);
26150 if (execute_wildcard(subpattern_re,
26151 SvPVX(syllable_name),
26152 SvEND(syllable_name),
26153 SvPVX(syllable_name), 0,
26157 *prop_definition = add_cp_to_invlist(*prop_definition,
26159 found_matches = TRUE;
26168 /* The rest of the algorithmically generatable names are of the form
26169 * "PREFIX-code_point". The prefixes and the code point limits of each
26170 * were returned to us in the array 'algorithmic_names' from data in
26171 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
26172 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
26175 /* Each element of the array is a hash, giving the details for the
26176 * series of names it covers. There is the base name of the characters
26177 * in the series, and the low and high code points in the series. And,
26178 * for optimization purposes a string containing all the legal
26179 * characters that could possibly be in a name in this series. */
26180 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
26181 SV * prefix = * hv_fetchs(this_series, "name", 0);
26182 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
26183 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
26184 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
26186 /* Pre-allocate an SV with enough space */
26187 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
26189 if (high >= 0x10000) {
26190 sv_catpvs(algo_name, "0");
26193 /* This series can be skipped entirely if the pattern requires
26194 * something longer than any name in the series, or can only match an
26195 * empty name, or contains a character not found in any name in the
26197 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
26198 && prog->maxlen > 0
26199 && (strspn(must, legal) == must_len))
26201 for (j = low; j <= high; j++) { /* For each code point in the series */
26203 /* Get its name, and see if it matches the subpattern */
26204 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
26207 if (execute_wildcard(subpattern_re,
26210 SvPVX(algo_name), 0,
26214 *prop_definition = add_cp_to_invlist(*prop_definition, j);
26215 found_matches = TRUE;
26222 /* Finally, see if the subpattern matches an empty string */
26223 empty = newSVpvs("");
26224 if (execute_wildcard(subpattern_re,
26231 /* Many code points have empty names. Currently these are the \p{GC=C}
26232 * ones, minus CC and CF */
26234 SV * empty_names_ref = get_prop_definition(UNI_C);
26235 SV * empty_names = invlist_clone(empty_names_ref, NULL);
26237 SV * subtract = get_prop_definition(UNI_CC);
26239 _invlist_subtract(empty_names, subtract, &empty_names);
26240 SvREFCNT_dec_NN(empty_names_ref);
26241 SvREFCNT_dec_NN(subtract);
26243 subtract = get_prop_definition(UNI_CF);
26244 _invlist_subtract(empty_names, subtract, &empty_names);
26245 SvREFCNT_dec_NN(subtract);
26247 _invlist_union(*prop_definition, empty_names, prop_definition);
26248 found_matches = TRUE;
26249 SvREFCNT_dec_NN(empty_names);
26251 SvREFCNT_dec_NN(empty);
26254 /* If we ever were to accept aliases for, say private use names, we would
26255 * need to do something fancier to find empty names. The code below works
26256 * (at the time it was written), and is slower than the above */
26257 const char empties_pat[] = "^.";
26258 if (strNE(name, empties_pat)) {
26259 SV * empty = newSVpvs("");
26260 if (execute_wildcard(subpattern_re,
26267 SV * empties = NULL;
26269 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
26271 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
26272 SvREFCNT_dec_NN(empties);
26274 found_matches = TRUE;
26276 SvREFCNT_dec_NN(empty);
26280 SvREFCNT_dec_NN(subpattern_re);
26281 return found_matches;
26285 * ex: set ts=8 sts=4 sw=4 et: