5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
147 #define STATIC static
150 /* this is a chain of data about sub patterns we are processing that
151 need to be handled separately/specially in study_chunk. Its so
152 we can simulate recursion without losing state. */
154 typedef struct scan_frame {
155 regnode *last_regnode; /* last node to process in this frame */
156 regnode *next_regnode; /* next node to process when last is reached */
157 U32 prev_recursed_depth;
158 I32 stopparen; /* what stopparen do we use */
159 bool in_gosub; /* this or an outer frame is for GOSUB */
161 struct scan_frame *this_prev_frame; /* this previous frame */
162 struct scan_frame *prev_frame; /* previous frame */
163 struct scan_frame *next_frame; /* next frame */
166 /* Certain characters are output as a sequence with the first being a
168 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
171 struct RExC_state_t {
172 U32 flags; /* RXf_* are we folding, multilining? */
173 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
174 char *precomp; /* uncompiled string. */
175 char *precomp_end; /* pointer to end of uncompiled string. */
176 REGEXP *rx_sv; /* The SV that is the regexp. */
177 regexp *rx; /* perl core regexp structure */
178 regexp_internal *rxi; /* internal data for regexp object
180 char *start; /* Start of input for compile */
181 char *end; /* End of input for compile */
182 char *parse; /* Input-scan pointer. */
183 char *copy_start; /* start of copy of input within
184 constructed parse string */
185 char *save_copy_start; /* Provides one level of saving
186 and restoring 'copy_start' */
187 char *copy_start_in_input; /* Position in input string
188 corresponding to copy_start */
189 SSize_t whilem_seen; /* number of WHILEM in this expr */
190 regnode *emit_start; /* Start of emitted-code area */
191 regnode_offset emit; /* Code-emit pointer */
192 I32 naughty; /* How bad is this pattern? */
193 I32 sawback; /* Did we see \1, ...? */
194 SSize_t size; /* Number of regnode equivalents in
196 Size_t sets_depth; /* Counts recursion depth of already-
197 compiled regex set patterns */
200 I32 parens_buf_size; /* #slots malloced open/close_parens */
201 regnode_offset *open_parens; /* offsets to open parens */
202 regnode_offset *close_parens; /* offsets to close parens */
203 HV *paren_names; /* Paren names */
205 /* position beyond 'precomp' of the warning message furthest away from
206 * 'precomp'. During the parse, no warnings are raised for any problems
207 * earlier in the parse than this position. This works if warnings are
208 * raised the first time a given spot is parsed, and if only one
209 * independent warning is raised for any given spot */
210 Size_t latest_warn_offset;
212 I32 npar; /* Capture buffer count so far in the
213 parse, (OPEN) plus one. ("par" 0 is
215 I32 total_par; /* During initial parse, is either 0,
216 or -1; the latter indicating a
217 reparse is needed. After that pass,
218 it is what 'npar' became after the
219 pass. Hence, it being > 0 indicates
220 we are in a reparse situation */
221 I32 nestroot; /* root parens we are in - used by
224 regnode *end_op; /* END node in program */
225 I32 utf8; /* whether the pattern is utf8 or not */
226 I32 orig_utf8; /* whether the pattern was originally in utf8 */
227 /* XXX use this for future optimisation of case
228 * where pattern must be upgraded to utf8. */
229 I32 uni_semantics; /* If a d charset modifier should use unicode
230 rules, even if the pattern is not in
233 I32 recurse_count; /* Number of recurse regops we have generated */
234 regnode **recurse; /* Recurse regops */
235 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
237 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
240 I32 override_recoding;
241 I32 recode_x_to_native;
242 I32 in_multi_char_class;
243 int code_index; /* next code_blocks[] slot */
244 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
246 SSize_t maxlen; /* mininum possible number of chars in string to match */
247 scan_frame *frame_head;
248 scan_frame *frame_last;
252 SV *runtime_code_qr; /* qr with the runtime code blocks */
254 const char *lastparse;
256 U32 study_chunk_recursed_count;
257 AV *paren_name_list; /* idx -> name */
261 #define RExC_lastparse (pRExC_state->lastparse)
262 #define RExC_lastnum (pRExC_state->lastnum)
263 #define RExC_paren_name_list (pRExC_state->paren_name_list)
264 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
265 #define RExC_mysv (pRExC_state->mysv1)
266 #define RExC_mysv1 (pRExC_state->mysv1)
267 #define RExC_mysv2 (pRExC_state->mysv2)
275 bool sWARN_EXPERIMENTAL__VLB;
276 bool sWARN_EXPERIMENTAL__REGEX_SETS;
279 #define RExC_flags (pRExC_state->flags)
280 #define RExC_pm_flags (pRExC_state->pm_flags)
281 #define RExC_precomp (pRExC_state->precomp)
282 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
283 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
284 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
285 #define RExC_precomp_end (pRExC_state->precomp_end)
286 #define RExC_rx_sv (pRExC_state->rx_sv)
287 #define RExC_rx (pRExC_state->rx)
288 #define RExC_rxi (pRExC_state->rxi)
289 #define RExC_start (pRExC_state->start)
290 #define RExC_end (pRExC_state->end)
291 #define RExC_parse (pRExC_state->parse)
292 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
293 #define RExC_whilem_seen (pRExC_state->whilem_seen)
294 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
295 under /d from /u ? */
297 #define RExC_emit (pRExC_state->emit)
298 #define RExC_emit_start (pRExC_state->emit_start)
299 #define RExC_sawback (pRExC_state->sawback)
300 #define RExC_seen (pRExC_state->seen)
301 #define RExC_size (pRExC_state->size)
302 #define RExC_maxlen (pRExC_state->maxlen)
303 #define RExC_npar (pRExC_state->npar)
304 #define RExC_total_parens (pRExC_state->total_par)
305 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
306 #define RExC_nestroot (pRExC_state->nestroot)
307 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
308 #define RExC_utf8 (pRExC_state->utf8)
309 #define RExC_uni_semantics (pRExC_state->uni_semantics)
310 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
311 #define RExC_open_parens (pRExC_state->open_parens)
312 #define RExC_close_parens (pRExC_state->close_parens)
313 #define RExC_end_op (pRExC_state->end_op)
314 #define RExC_paren_names (pRExC_state->paren_names)
315 #define RExC_recurse (pRExC_state->recurse)
316 #define RExC_recurse_count (pRExC_state->recurse_count)
317 #define RExC_sets_depth (pRExC_state->sets_depth)
318 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
319 #define RExC_study_chunk_recursed_bytes \
320 (pRExC_state->study_chunk_recursed_bytes)
321 #define RExC_in_lookaround (pRExC_state->in_lookaround)
322 #define RExC_contains_locale (pRExC_state->contains_locale)
323 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
326 # define SET_recode_x_to_native(x) \
327 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
329 # define SET_recode_x_to_native(x) NOOP
332 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
333 #define RExC_frame_head (pRExC_state->frame_head)
334 #define RExC_frame_last (pRExC_state->frame_last)
335 #define RExC_frame_count (pRExC_state->frame_count)
336 #define RExC_strict (pRExC_state->strict)
337 #define RExC_study_started (pRExC_state->study_started)
338 #define RExC_warn_text (pRExC_state->warn_text)
339 #define RExC_in_script_run (pRExC_state->in_script_run)
340 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
341 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
342 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
343 #define RExC_unlexed_names (pRExC_state->unlexed_names)
346 /***********************************************************************/
347 /* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse
349 * All of these macros depend on the above RExC_ accessor macros, which
350 * in turns depend on a variable pRExC_state being in scope where they
351 * are used. This is the standard regexp parser context variable which is
352 * passed into every non-trivial parse function in this file.
354 * Note that the UTF macro is itself a wrapper around RExC_utf8, so all
355 * of the macros which do not take an argument will operate on the
356 * pRExC_state structure *only*.
358 * Please do NOT modify RExC_parse without using these macros. In the
359 * future these macros will be extended for enhanced debugging and trace
360 * output during the parse process.
363 /* RExC_parse_incf(flag)
365 * Increment RExC_parse to point at the next codepoint, while doing
366 * the right thing depending on whether we are parsing UTF-8 strings
367 * or not. The 'flag' argument determines if content is UTF-8 or not,
368 * intended for cases where this is NOT governed by the UTF macro.
370 * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro.
372 * WARNING: Does NOT take into account RExC_end; it is the callers
373 * responsibility to make sure there are enough octets left in
374 * RExC_parse to ensure that when processing UTF-8 we would not read
375 * past the end of the string.
377 #define RExC_parse_incf(flag) STMT_START { \
378 RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \
381 /* RExC_parse_inc_safef(flag)
383 * Safely increment RExC_parse to point at the next codepoint,
384 * doing the right thing depending on whether we are parsing
385 * UTF-8 strings or not and NOT reading past the end of the buffer.
386 * The 'flag' argument determines if content is UTF-8 or not,
387 * intended for cases where this is NOT governed by the UTF macro.
389 * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro.
391 * NOTE: Will NOT read past RExC_end when content is UTF-8.
393 #define RExC_parse_inc_safef(flag) STMT_START { \
394 RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \
399 * Increment RExC_parse to point at the next codepoint,
400 * doing the right thing depending on whether we are parsing
401 * UTF-8 strings or not.
403 * WARNING: Does NOT take into account RExC_end, it is the callers
404 * responsibility to make sure there are enough octets left in
405 * RExC_parse to ensure that when processing UTF-8 we would not read
406 * past the end of the string.
408 * NOTE: whether we are parsing UTF-8 or not is determined by the
409 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
410 * macro operates on the pRExC_state structure only.
412 #define RExC_parse_inc() RExC_parse_incf(UTF)
414 /* RExC_parse_inc_safe()
416 * Safely increment RExC_parse to point at the next codepoint,
417 * doing the right thing depending on whether we are parsing
418 * UTF-8 strings or not and NOT reading past the end of the buffer.
420 * NOTE: whether we are parsing UTF-8 or not is determined by the
421 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
422 * macro operates on the pRExC_state structure only.
424 #define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF)
426 /* RExC_parse_inc_utf8()
428 * Increment RExC_parse to point at the next utf8 codepoint,
429 * assumes content is UTF-8.
431 * WARNING: Does NOT take into account RExC_end; it is the callers
432 * responsibility to make sure there are enough octets left in RExC_parse
433 * to ensure that when processing UTF-8 we would not read past the end
436 #define RExC_parse_inc_utf8() STMT_START { \
437 RExC_parse += UTF8SKIP(RExC_parse); \
440 /* RExC_parse_inc_if_char()
442 * Increment RExC_parse to point at the next codepoint, if and only
443 * if the current parse point is NOT a NULL, while doing the right thing
444 * depending on whether we are parsing UTF-8 strings or not.
446 * WARNING: Does NOT take into account RExC_end, it is the callers
447 * responsibility to make sure there are enough octets left in RExC_parse
448 * to ensure that when processing UTF-8 we would not read past the end
451 * NOTE: whether we are parsing UTF-8 or not is determined by the
452 * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this
453 * macro operates on the pRExC_state structure only.
455 #define RExC_parse_inc_if_char() STMT_START { \
456 RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \
459 /* RExC_parse_inc_by(n_octets)
461 * Increment the parse cursor by the number of octets specified by
462 * the 'n_octets' argument.
464 * NOTE: Does NOT check ANY constraints. It is the callers responsibility
465 * that this will not move past the end of the string, or leave the
466 * pointer in the middle of a UTF-8 sequence.
468 * Typically used to advanced past previously analyzed content.
470 #define RExC_parse_inc_by(n_octets) STMT_START { \
471 RExC_parse += (n_octets); \
474 /* RExC_parse_set(to_ptr)
476 * Sets the RExC_parse pointer to the pointer specified by the 'to'
477 * argument. No validation whatsoever is performed on the to pointer.
479 #define RExC_parse_set(to_ptr) STMT_START { \
480 RExC_parse = (to_ptr); \
483 /**********************************************************************/
485 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
486 * a flag to disable back-off on the fixed/floating substrings - if it's
487 * a high complexity pattern we assume the benefit of avoiding a full match
488 * is worth the cost of checking for the substrings even if they rarely help.
490 #define RExC_naughty (pRExC_state->naughty)
491 #define TOO_NAUGHTY (10)
492 #define MARK_NAUGHTY(add) \
493 if (RExC_naughty < TOO_NAUGHTY) \
494 RExC_naughty += (add)
495 #define MARK_NAUGHTY_EXP(exp, add) \
496 if (RExC_naughty < TOO_NAUGHTY) \
497 RExC_naughty += RExC_naughty / (exp) + (add)
499 #define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
500 #define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \
501 || ((*s) == '{' && regcurly(s, e, NULL)))
504 * Flags to be passed up.
506 #define HASWIDTH 0x01 /* Known to not match null strings, could match
508 #define SIMPLE 0x02 /* Exactly one character wide */
509 /* (or LNBREAK as a special case) */
510 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
511 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
512 #define RESTART_PARSE 0x20 /* Need to redo the parse */
513 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
514 calcuate sizes as UTF-8 */
516 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
518 /* whether trie related optimizations are enabled */
519 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
520 #define TRIE_STUDY_OPT
521 #define FULL_TRIE_STUDY
525 /* About the term "restudy" and the var "restudied" and the defines
526 * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to
527 * doing multiple study_chunk() calls over the same set of opcodes for* the
528 * purpose of enhanced TRIE optimizations.
530 * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal
531 * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled
532 * which then causes the Perl_re_op_compile() to then call the optimizer
533 * S_study_chunk() a second time to perform additional optimizations,
534 * including the aho_corasick startclass optimization.
535 * This additional pass will only happen once, which is managed by the
536 * 'restudied' variable in Perl_re_op_compile().
538 * When this second pass is under way the flags passed into study_chunk() will
539 * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down
540 * to any recursive calls to S_study_chunk().
542 * IMPORTANT: Any logic in study_chunk() that emits warnings should check that
543 * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may
546 * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and
547 * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details.
551 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
552 #define PBITVAL(paren) (1 << ((paren) & 7))
553 #define PAREN_OFFSET(depth) \
554 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
555 #define PAREN_TEST(depth, paren) \
556 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
557 #define PAREN_SET(depth, paren) \
558 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
559 #define PAREN_UNSET(depth, paren) \
560 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
562 #define REQUIRE_UTF8(flagp) STMT_START { \
564 *flagp = RESTART_PARSE|NEED_UTF8; \
569 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
570 * pattern is in UTF-8. This latter condition is in case the outermost rules
571 * are locale. See GH #17278 */
572 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
574 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
575 * a flag that indicates we need to override /d with /u as a result of
576 * something in the pattern. It should only be used in regards to calling
577 * set_regex_charset() or get_regex_charset() */
578 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
580 if (DEPENDS_SEMANTICS) { \
581 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
582 RExC_uni_semantics = 1; \
583 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
584 /* No need to restart the parse if we haven't seen \
585 * anything that differs between /u and /d, and no need \
586 * to restart immediately if we're going to reparse \
587 * anyway to count parens */ \
588 *flagp |= RESTART_PARSE; \
589 return restart_retval; \
594 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
596 RExC_use_BRANCHJ = 1; \
597 *flagp |= RESTART_PARSE; \
598 return restart_retval; \
601 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
602 * less. After that, it must always be positive, because the whole re is
603 * considered to be surrounded by virtual parens. Setting it to negative
604 * indicates there is some construct that needs to know the actual number of
605 * parens to be properly handled. And that means an extra pass will be
606 * required after we've counted them all */
607 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
608 #define REQUIRE_PARENS_PASS \
609 STMT_START { /* No-op if have completed a pass */ \
610 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
612 #define IN_PARENS_PASS (RExC_total_parens < 0)
615 /* This is used to return failure (zero) early from the calling function if
616 * various flags in 'flags' are set. Two flags always cause a return:
617 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
618 * additional flags that should cause a return; 0 if none. If the return will
619 * be done, '*flagp' is first set to be all of the flags that caused the
621 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
623 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
624 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
629 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
631 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
632 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
633 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
634 if (MUST_RESTART(*(flagp))) return 0
636 /* This converts the named class defined in regcomp.h to its equivalent class
637 * number defined in handy.h. */
638 #define namedclass_to_classnum(class) ((int) ((class) / 2))
639 #define classnum_to_namedclass(classnum) ((classnum) * 2)
641 #define _invlist_union_complement_2nd(a, b, output) \
642 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
643 #define _invlist_intersection_complement_2nd(a, b, output) \
644 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
646 /* We add a marker if we are deferring expansion of a property that is both
647 * 1) potentiallly user-defined; and
648 * 2) could also be an official Unicode property.
650 * Without this marker, any deferred expansion can only be for a user-defined
651 * one. This marker shouldn't conflict with any that could be in a legal name,
652 * and is appended to its name to indicate this. There is a string and
654 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
655 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
657 /* What is infinity for optimization purposes */
658 #define OPTIMIZE_INFTY SSize_t_MAX
660 /* About scan_data_t.
662 During optimisation we recurse through the regexp program performing
663 various inplace (keyhole style) optimisations. In addition study_chunk
664 and scan_commit populate this data structure with information about
665 what strings MUST appear in the pattern. We look for the longest
666 string that must appear at a fixed location, and we look for the
667 longest string that may appear at a floating location. So for instance
672 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
673 strings (because they follow a .* construct). study_chunk will identify
674 both FOO and BAR as being the longest fixed and floating strings respectively.
676 The strings can be composites, for instance
680 will result in a composite fixed substring 'foo'.
682 For each string some basic information is maintained:
685 This is the position the string must appear at, or not before.
686 It also implicitly (when combined with minlenp) tells us how many
687 characters must match before the string we are searching for.
688 Likewise when combined with minlenp and the length of the string it
689 tells us how many characters must appear after the string we have
693 Only used for floating strings. This is the rightmost point that
694 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
695 string can occur infinitely far to the right.
696 For fixed strings, it is equal to min_offset.
699 A pointer to the minimum number of characters of the pattern that the
700 string was found inside. This is important as in the case of positive
701 lookahead or positive lookbehind we can have multiple patterns
706 The minimum length of the pattern overall is 3, the minimum length
707 of the lookahead part is 3, but the minimum length of the part that
708 will actually match is 1. So 'FOO's minimum length is 3, but the
709 minimum length for the F is 1. This is important as the minimum length
710 is used to determine offsets in front of and behind the string being
711 looked for. Since strings can be composites this is the length of the
712 pattern at the time it was committed with a scan_commit. Note that
713 the length is calculated by study_chunk, so that the minimum lengths
714 are not known until the full pattern has been compiled, thus the
715 pointer to the value.
719 In the case of lookbehind the string being searched for can be
720 offset past the start point of the final matching string.
721 If this value was just blithely removed from the min_offset it would
722 invalidate some of the calculations for how many chars must match
723 before or after (as they are derived from min_offset and minlen and
724 the length of the string being searched for).
725 When the final pattern is compiled and the data is moved from the
726 scan_data_t structure into the regexp structure the information
727 about lookbehind is factored in, with the information that would
728 have been lost precalculated in the end_shift field for the
731 The fields pos_min and pos_delta are used to store the minimum offset
732 and the delta to the maximum offset at the current point in the pattern.
736 struct scan_data_substrs {
737 SV *str; /* longest substring found in pattern */
738 SSize_t min_offset; /* earliest point in string it can appear */
739 SSize_t max_offset; /* latest point in string it can appear */
740 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
741 SSize_t lookbehind; /* is the pos of the string modified by LB */
742 I32 flags; /* per substring SF_* and SCF_* flags */
745 typedef struct scan_data_t {
746 /*I32 len_min; unused */
747 /*I32 len_delta; unused */
751 SSize_t last_end; /* min value, <0 unless valid. */
752 SSize_t last_start_min;
753 SSize_t last_start_max;
754 U8 cur_is_floating; /* whether the last_* values should be set as
755 * the next fixed (0) or floating (1)
758 /* [0] is longest fixed substring so far, [1] is longest float so far */
759 struct scan_data_substrs substrs[2];
761 I32 flags; /* common SF_* and SCF_* flags */
763 SSize_t *last_closep;
764 regnode **last_close_opp; /* pointer to pointer to last CLOSE regop
765 seen. DO NOT DEREFERENCE the regnode
766 pointer - the op may have been optimized
768 regnode_ssc *start_class;
772 * Forward declarations for pregcomp()'s friends.
775 static const scan_data_t zero_scan_data = {
776 0, 0, NULL, 0, 0, 0, 0,
778 { NULL, 0, 0, 0, 0, 0 },
779 { NULL, 0, 0, 0, 0, 0 },
781 0, 0, NULL, NULL, NULL
786 #define SF_BEFORE_SEOL 0x0001
787 #define SF_BEFORE_MEOL 0x0002
788 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
790 #define SF_IS_INF 0x0040
791 #define SF_HAS_PAR 0x0080
792 #define SF_IN_PAR 0x0100
793 #define SF_HAS_EVAL 0x0200
796 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
797 * longest substring in the pattern. When it is not set the optimiser keeps
798 * track of position, but does not keep track of the actual strings seen,
800 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
803 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
804 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
805 * turned off because of the alternation (BRANCH). */
806 #define SCF_DO_SUBSTR 0x0400
808 #define SCF_DO_STCLASS_AND 0x0800
809 #define SCF_DO_STCLASS_OR 0x1000
810 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
811 #define SCF_WHILEM_VISITED_POS 0x2000
813 #define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()?
814 Search for "restudy" in this file
815 to find a detailed explanation.*/
816 #define SCF_SEEN_ACCEPT 0x8000
817 #define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now?
818 Search for "restudy" in this file
819 to find a detailed explanation. */
820 #define SCF_IN_DEFINE 0x20000
824 #define UTF cBOOL(RExC_utf8)
826 /* The enums for all these are ordered so things work out correctly */
827 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
828 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
829 == REGEX_DEPENDS_CHARSET)
830 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
831 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
832 >= REGEX_UNICODE_CHARSET)
833 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
834 == REGEX_ASCII_RESTRICTED_CHARSET)
835 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
836 >= REGEX_ASCII_RESTRICTED_CHARSET)
837 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
838 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
840 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
842 /* For programs that want to be strictly Unicode compatible by dying if any
843 * attempt is made to match a non-Unicode code point against a Unicode
845 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
847 #define OOB_NAMEDCLASS -1
849 /* There is no code point that is out-of-bounds, so this is problematic. But
850 * its only current use is to initialize a variable that is always set before
852 #define OOB_UNICODE 0xDEADBEEF
854 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
857 /* length of regex to show in messages that don't mark a position within */
858 #define RegexLengthToShowInErrorMessages 127
861 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
862 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
863 * op/pragma/warn/regcomp.
865 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
866 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
868 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
869 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
871 /* The code in this file in places uses one level of recursion with parsing
872 * rebased to an alternate string constructed by us in memory. This can take
873 * the form of something that is completely different from the input, or
874 * something that uses the input as part of the alternate. In the first case,
875 * there should be no possibility of an error, as we are in complete control of
876 * the alternate string. But in the second case we don't completely control
877 * the input portion, so there may be errors in that. Here's an example:
879 * is handled specially because \x{df} folds to a sequence of more than one
880 * character: 'ss'. What is done is to create and parse an alternate string,
881 * which looks like this:
882 * /(?:\x{DF}|[abc\x{DF}def])/ui
883 * where it uses the input unchanged in the middle of something it constructs,
884 * which is a branch for the DF outside the character class, and clustering
885 * parens around the whole thing. (It knows enough to skip the DF inside the
886 * class while in this substitute parse.) 'abc' and 'def' may have errors that
887 * need to be reported. The general situation looks like this:
889 * |<------- identical ------>|
891 * Input: ---------------------------------------------------------------
892 * Constructed: ---------------------------------------------------
894 * |<------- identical ------>|
896 * sI..eI is the portion of the input pattern we are concerned with here.
897 * sC..EC is the constructed substitute parse string.
898 * sC..tC is constructed by us
899 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
900 * In the diagram, these are vertically aligned.
901 * eC..EC is also constructed by us.
902 * xC is the position in the substitute parse string where we found a
904 * xI is the position in the original pattern corresponding to xC.
906 * We want to display a message showing the real input string. Thus we need to
907 * translate from xC to xI. We know that xC >= tC, since the portion of the
908 * string sC..tC has been constructed by us, and so shouldn't have errors. We
910 * xI = tI + (xC - tC)
912 * When the substitute parse is constructed, the code needs to set:
915 * RExC_copy_start_in_input (tI)
916 * RExC_copy_start_in_constructed (tC)
917 * and restore them when done.
919 * During normal processing of the input pattern, both
920 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
921 * sI, so that xC equals xI.
924 #define sI RExC_precomp
925 #define eI RExC_precomp_end
926 #define sC RExC_start
928 #define tI RExC_copy_start_in_input
929 #define tC RExC_copy_start_in_constructed
930 #define xI(xC) (tI + (xC - tC))
931 #define xI_offset(xC) (xI(xC) - sI)
933 #define REPORT_LOCATION_ARGS(xC) \
935 (xI(xC) > eI) /* Don't run off end */ \
936 ? eI - sI /* Length before the <--HERE */ \
937 : ((xI_offset(xC) >= 0) \
939 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
940 IVdf " trying to output message for " \
942 __FILE__, __LINE__, (IV) xI_offset(xC), \
943 ((int) (eC - sC)), sC), 0)), \
944 sI), /* The input pattern printed up to the <--HERE */ \
946 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
947 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
949 /* Used to point after bad bytes for an error message, but avoid skipping
950 * past a nul byte. */
951 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
953 /* Set up to clean up after our imminent demise */
954 #define PREPARE_TO_DIE \
957 SAVEFREESV(RExC_rx_sv); \
958 if (RExC_open_parens) \
959 SAVEFREEPV(RExC_open_parens); \
960 if (RExC_close_parens) \
961 SAVEFREEPV(RExC_close_parens); \
965 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
966 * arg. Show regex, up to a maximum length. If it's too long, chop and add
969 #define _FAIL(code) STMT_START { \
970 const char *ellipses = ""; \
971 IV len = RExC_precomp_end - RExC_precomp; \
974 if (len > RegexLengthToShowInErrorMessages) { \
975 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
976 len = RegexLengthToShowInErrorMessages - 10; \
982 #define FAIL(msg) _FAIL( \
983 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
984 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
986 #define FAIL2(msg,arg) _FAIL( \
987 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
988 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
990 #define FAIL3(msg,arg1,arg2) _FAIL( \
991 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
992 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
995 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
997 #define Simple_vFAIL(m) STMT_START { \
998 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
999 m, REPORT_LOCATION_ARGS(RExC_parse)); \
1003 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
1005 #define vFAIL(m) STMT_START { \
1011 * Like Simple_vFAIL(), but accepts two arguments.
1013 #define Simple_vFAIL2(m,a1) STMT_START { \
1014 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
1015 REPORT_LOCATION_ARGS(RExC_parse)); \
1019 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
1021 #define vFAIL2(m,a1) STMT_START { \
1023 Simple_vFAIL2(m, a1); \
1028 * Like Simple_vFAIL(), but accepts three arguments.
1030 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
1031 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
1032 REPORT_LOCATION_ARGS(RExC_parse)); \
1036 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
1038 #define vFAIL3(m,a1,a2) STMT_START { \
1040 Simple_vFAIL3(m, a1, a2); \
1044 * Like Simple_vFAIL(), but accepts four arguments.
1046 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
1047 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
1048 REPORT_LOCATION_ARGS(RExC_parse)); \
1051 #define vFAIL4(m,a1,a2,a3) STMT_START { \
1053 Simple_vFAIL4(m, a1, a2, a3); \
1056 /* A specialized version of vFAIL2 that works with UTF8f */
1057 #define vFAIL2utf8f(m, a1) STMT_START { \
1059 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
1060 REPORT_LOCATION_ARGS(RExC_parse)); \
1063 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
1065 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
1066 REPORT_LOCATION_ARGS(RExC_parse)); \
1069 /* Setting this to NULL is a signal to not output warnings */
1070 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
1072 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
1073 RExC_copy_start_in_constructed = NULL; \
1075 #define RESTORE_WARNINGS \
1076 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
1078 /* Since a warning can be generated multiple times as the input is reparsed, we
1079 * output it the first time we come to that point in the parse, but suppress it
1080 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
1081 * generate any warnings */
1082 #define TO_OUTPUT_WARNINGS(loc) \
1083 ( RExC_copy_start_in_constructed \
1084 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
1086 /* After we've emitted a warning, we save the position in the input so we don't
1087 * output it again */
1088 #define UPDATE_WARNINGS_LOC(loc) \
1090 if (TO_OUTPUT_WARNINGS(loc)) { \
1091 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
1096 /* 'warns' is the output of the packWARNx macro used in 'code' */
1097 #define _WARN_HELPER(loc, warns, code) \
1099 if (! RExC_copy_start_in_constructed) { \
1100 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
1101 " expected at '%s'", \
1102 __FILE__, __LINE__, loc); \
1104 if (TO_OUTPUT_WARNINGS(loc)) { \
1105 if (ckDEAD(warns)) \
1108 UPDATE_WARNINGS_LOC(loc); \
1112 /* m is not necessarily a "literal string", in this macro */
1113 #define warn_non_literal_string(loc, packed_warn, m) \
1114 _WARN_HELPER(loc, packed_warn, \
1115 Perl_warner(aTHX_ packed_warn, \
1116 "%s" REPORT_LOCATION, \
1117 m, REPORT_LOCATION_ARGS(loc)))
1118 #define reg_warn_non_literal_string(loc, m) \
1119 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
1121 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
1124 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
1125 Newx(format, format_size, char); \
1126 my_strlcpy(format, m, format_size); \
1127 my_strlcat(format, REPORT_LOCATION, format_size); \
1128 SAVEFREEPV(format); \
1129 _WARN_HELPER(loc, packwarn, \
1130 Perl_ck_warner(aTHX_ packwarn, \
1132 a1, REPORT_LOCATION_ARGS(loc))); \
1135 #define ckWARNreg(loc,m) \
1136 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1137 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1138 m REPORT_LOCATION, \
1139 REPORT_LOCATION_ARGS(loc)))
1141 #define vWARN(loc, m) \
1142 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1143 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1144 m REPORT_LOCATION, \
1145 REPORT_LOCATION_ARGS(loc))) \
1147 #define vWARN_dep(loc, m) \
1148 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
1149 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
1150 m REPORT_LOCATION, \
1151 REPORT_LOCATION_ARGS(loc)))
1153 #define ckWARNdep(loc,m) \
1154 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
1155 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
1156 m REPORT_LOCATION, \
1157 REPORT_LOCATION_ARGS(loc)))
1159 #define ckWARNregdep(loc,m) \
1160 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1161 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1163 m REPORT_LOCATION, \
1164 REPORT_LOCATION_ARGS(loc)))
1166 #define ckWARN2reg_d(loc,m, a1) \
1167 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1168 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1169 m REPORT_LOCATION, \
1170 a1, REPORT_LOCATION_ARGS(loc)))
1172 #define ckWARN2reg(loc, m, a1) \
1173 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1174 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1175 m REPORT_LOCATION, \
1176 a1, REPORT_LOCATION_ARGS(loc)))
1178 #define vWARN3(loc, m, a1, a2) \
1179 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1180 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1181 m REPORT_LOCATION, \
1182 a1, a2, REPORT_LOCATION_ARGS(loc)))
1184 #define ckWARN3reg(loc, m, a1, a2) \
1185 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1186 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1187 m REPORT_LOCATION, \
1189 REPORT_LOCATION_ARGS(loc)))
1191 #define vWARN4(loc, m, a1, a2, a3) \
1192 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1193 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1194 m REPORT_LOCATION, \
1196 REPORT_LOCATION_ARGS(loc)))
1198 #define ckWARN4reg(loc, m, a1, a2, a3) \
1199 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1200 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1201 m REPORT_LOCATION, \
1203 REPORT_LOCATION_ARGS(loc)))
1205 #define vWARN5(loc, m, a1, a2, a3, a4) \
1206 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1207 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1208 m REPORT_LOCATION, \
1210 REPORT_LOCATION_ARGS(loc)))
1212 #define ckWARNexperimental(loc, class, m) \
1214 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1215 RExC_warned_ ## class = 1; \
1216 _WARN_HELPER(loc, packWARN(class), \
1217 Perl_ck_warner_d(aTHX_ packWARN(class), \
1218 m REPORT_LOCATION, \
1219 REPORT_LOCATION_ARGS(loc)));\
1223 #define ckWARNexperimental_with_arg(loc, class, m, arg) \
1225 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1226 RExC_warned_ ## class = 1; \
1227 _WARN_HELPER(loc, packWARN(class), \
1228 Perl_ck_warner_d(aTHX_ packWARN(class), \
1229 m REPORT_LOCATION, \
1230 arg, REPORT_LOCATION_ARGS(loc)));\
1234 /* Convert between a pointer to a node and its offset from the beginning of the
1236 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1237 #define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \
1238 (SSize_t) ((node) - RExC_emit_start))
1240 #define ProgLen(ri) ri->proglen
1241 #define SetProgLen(ri,x) ri->proglen = x
1243 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1244 #define EXPERIMENTAL_INPLACESCAN
1245 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1248 S_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
1250 PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
1252 /* As the name says. The zeroth bit corresponds to the code point given by
1257 Zero(bitmap, len, U8);
1259 invlist_iterinit(invlist);
1260 while (invlist_iternext(invlist, &start, &end)) {
1261 assert(start >= offset);
1263 for (UV i = start; i <= end; i++) {
1264 UV adjusted = i - offset;
1266 BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
1269 invlist_iterfinish(invlist);
1273 S_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
1275 PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
1277 /* As the name says. The zeroth bit corresponds to the code point given by
1282 for (i = 0; i < bitmap_len; i++) {
1283 if (BITMAP_TEST(bitmap, i)) {
1286 /* Save a little work by adding a range all at once instead of bit
1288 while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
1292 *invlist = _add_range_to_invlist(*invlist,
1301 Perl_re_printf(pTHX_ const char *fmt, ...)
1305 PerlIO *f= Perl_debug_log;
1306 PERL_ARGS_ASSERT_RE_PRINTF;
1308 result = PerlIO_vprintf(f, fmt, ap);
1314 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1318 PerlIO *f= Perl_debug_log;
1319 PERL_ARGS_ASSERT_RE_INDENTF;
1320 va_start(ap, depth);
1321 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1322 result = PerlIO_vprintf(f, fmt, ap);
1326 #endif /* DEBUGGING */
1328 #define DEBUG_RExC_seen() \
1329 DEBUG_OPTIMISE_MORE_r({ \
1330 Perl_re_printf( aTHX_ "RExC_seen: "); \
1332 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1333 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1335 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1336 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1338 if (RExC_seen & REG_GPOS_SEEN) \
1339 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1341 if (RExC_seen & REG_RECURSE_SEEN) \
1342 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1344 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1345 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1347 if (RExC_seen & REG_VERBARG_SEEN) \
1348 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1350 if (RExC_seen & REG_CUTGROUP_SEEN) \
1351 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1353 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1354 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1356 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1357 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1359 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1360 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1362 Perl_re_printf( aTHX_ "\n"); \
1365 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1366 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1371 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1372 const char *close_str)
1377 Perl_re_printf( aTHX_ "%s", open_str);
1378 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1379 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1380 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1381 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1382 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1383 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1384 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1385 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1386 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1387 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1388 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1389 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1390 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1391 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1392 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1393 Perl_re_printf( aTHX_ "%s", close_str);
1398 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1399 U32 depth, int is_inf,
1400 SSize_t min, SSize_t stopmin, SSize_t delta)
1402 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1404 DEBUG_OPTIMISE_MORE_r({
1407 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1410 min, stopmin, delta,
1412 (IV)data->pos_delta,
1416 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1418 Perl_re_printf( aTHX_
1419 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1421 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1422 is_inf ? "INF " : ""
1425 if (data->last_found) {
1427 Perl_re_printf(aTHX_
1428 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1429 SvPVX_const(data->last_found),
1431 (IV)data->last_start_min,
1432 (IV)data->last_start_max
1435 for (i = 0; i < 2; i++) {
1436 Perl_re_printf(aTHX_
1437 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1438 data->cur_is_floating == i ? "*" : "",
1439 i ? "Float" : "Fixed",
1440 SvPVX_const(data->substrs[i].str),
1441 (IV)data->substrs[i].min_offset,
1442 (IV)data->substrs[i].max_offset
1444 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1448 Perl_re_printf( aTHX_ "\n");
1454 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1455 regnode *scan, U32 depth, U32 flags)
1457 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1464 Next = regnext(scan);
1465 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1466 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1469 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1470 Next ? (REG_NODE_NUM(Next)) : 0 );
1471 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1472 Perl_re_printf( aTHX_ "\n");
1477 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \
1478 S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta)
1480 # define DEBUG_PEEP(str, scan, depth, flags) \
1481 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1484 # define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP
1485 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1489 /* =========================================================
1490 * BEGIN edit_distance stuff.
1492 * This calculates how many single character changes of any type are needed to
1493 * transform a string into another one. It is taken from version 3.1 of
1495 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1498 /* Our unsorted dictionary linked list. */
1499 /* Note we use UVs, not chars. */
1504 struct dictionary* next;
1506 typedef struct dictionary item;
1509 PERL_STATIC_INLINE item*
1510 push(UV key, item* curr)
1513 Newx(head, 1, item);
1521 PERL_STATIC_INLINE item*
1522 find(item* head, UV key)
1524 item* iterator = head;
1526 if (iterator->key == key){
1529 iterator = iterator->next;
1535 PERL_STATIC_INLINE item*
1536 uniquePush(item* head, UV key)
1538 item* iterator = head;
1541 if (iterator->key == key) {
1544 iterator = iterator->next;
1547 return push(key, head);
1550 PERL_STATIC_INLINE void
1551 dict_free(item* head)
1553 item* iterator = head;
1556 item* temp = iterator;
1557 iterator = iterator->next;
1564 /* End of Dictionary Stuff */
1566 /* All calculations/work are done here */
1568 S_edit_distance(const UV* src,
1570 const STRLEN x, /* length of src[] */
1571 const STRLEN y, /* length of tgt[] */
1572 const SSize_t maxDistance
1576 UV swapCount, swapScore, targetCharCount, i, j;
1578 UV score_ceil = x + y;
1580 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1582 /* intialize matrix start values */
1583 Newx(scores, ( (x + 2) * (y + 2)), UV);
1584 scores[0] = score_ceil;
1585 scores[1 * (y + 2) + 0] = score_ceil;
1586 scores[0 * (y + 2) + 1] = score_ceil;
1587 scores[1 * (y + 2) + 1] = 0;
1588 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1593 for (i=1;i<=x;i++) {
1595 head = uniquePush(head, src[i]);
1596 scores[(i+1) * (y + 2) + 1] = i;
1597 scores[(i+1) * (y + 2) + 0] = score_ceil;
1600 for (j=1;j<=y;j++) {
1603 head = uniquePush(head, tgt[j]);
1604 scores[1 * (y + 2) + (j + 1)] = j;
1605 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1608 targetCharCount = find(head, tgt[j-1])->value;
1609 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1611 if (src[i-1] != tgt[j-1]){
1612 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1616 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1620 find(head, src[i-1])->value = i;
1624 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1627 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1631 /* END of edit_distance() stuff
1632 * ========================================================= */
1634 /* Mark that we cannot extend a found fixed substring at this point.
1635 Update the longest found anchored substring or the longest found
1636 floating substrings if needed. */
1639 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1640 SSize_t *minlenp, int is_inf)
1642 const STRLEN l = CHR_SVLEN(data->last_found);
1643 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1644 const STRLEN old_l = CHR_SVLEN(longest_sv);
1645 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1647 PERL_ARGS_ASSERT_SCAN_COMMIT;
1649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1650 const U8 i = data->cur_is_floating;
1651 SvSetMagicSV(longest_sv, data->last_found);
1652 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1655 data->substrs[0].max_offset = data->substrs[0].min_offset;
1657 data->substrs[1].max_offset =
1661 ? data->last_start_max
1662 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1664 : data->pos_min + data->pos_delta));
1667 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1668 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1669 data->substrs[i].minlenp = minlenp;
1670 data->substrs[i].lookbehind = 0;
1673 SvCUR_set(data->last_found, 0);
1675 SV * const sv = data->last_found;
1676 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1677 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1682 data->last_end = -1;
1683 data->flags &= ~SF_BEFORE_EOL;
1684 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
1687 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1688 * list that describes which code points it matches */
1691 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1693 /* Set the SSC 'ssc' to match an empty string or any code point */
1695 PERL_ARGS_ASSERT_SSC_ANYTHING;
1697 assert(is_ANYOF_SYNTHETIC(ssc));
1699 /* mortalize so won't leak */
1700 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1701 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1705 S_ssc_is_anything(const regnode_ssc *ssc)
1707 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1708 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1709 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1710 * in any way, so there's no point in using it */
1712 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
1715 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1717 assert(is_ANYOF_SYNTHETIC(ssc));
1719 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1723 /* See if the list consists solely of the range 0 - Infinity */
1724 invlist_iterinit(ssc->invlist);
1725 ret = invlist_iternext(ssc->invlist, &start, &end)
1729 invlist_iterfinish(ssc->invlist);
1735 /* If e.g., both \w and \W are set, matches everything */
1736 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1738 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1739 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1749 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1751 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1752 * string, any code point, or any posix class under locale */
1754 PERL_ARGS_ASSERT_SSC_INIT;
1756 Zero(ssc, 1, regnode_ssc);
1757 set_ANYOF_SYNTHETIC(ssc);
1758 ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
1761 /* If any portion of the regex is to operate under locale rules that aren't
1762 * fully known at compile time, initialization includes it. The reason
1763 * this isn't done for all regexes is that the optimizer was written under
1764 * the assumption that locale was all-or-nothing. Given the complexity and
1765 * lack of documentation in the optimizer, and that there are inadequate
1766 * test cases for locale, many parts of it may not work properly, it is
1767 * safest to avoid locale unless necessary. */
1768 if (RExC_contains_locale) {
1769 ANYOF_POSIXL_SETALL(ssc);
1772 ANYOF_POSIXL_ZERO(ssc);
1777 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1778 const regnode_ssc *ssc)
1780 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1781 * to the list of code points matched, and locale posix classes; hence does
1782 * not check its flags) */
1784 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
1787 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1789 assert(is_ANYOF_SYNTHETIC(ssc));
1791 invlist_iterinit(ssc->invlist);
1792 ret = invlist_iternext(ssc->invlist, &start, &end)
1796 invlist_iterfinish(ssc->invlist);
1802 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1809 #define INVLIST_INDEX 0
1810 #define ONLY_LOCALE_MATCHES_INDEX 1
1811 #define DEFERRED_USER_DEFINED_INDEX 2
1814 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1815 const regnode_charclass* const node)
1817 /* Returns a mortal inversion list defining which code points are matched
1818 * by 'node', which is of ANYOF-ish type . Handles complementing the
1819 * result if appropriate. If some code points aren't knowable at this
1820 * time, the returned list must, and will, contain every code point that is
1824 SV* only_utf8_locale_invlist = NULL;
1825 bool new_node_has_latin1 = FALSE;
1826 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
1830 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1832 /* Look at the data structure created by S_set_ANYOF_arg() */
1833 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
1834 invlist = sv_2mortal(_new_invlist(1));
1835 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
1837 else if (ANYOF_HAS_AUX(node)) {
1838 const U32 n = ARG(node);
1839 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1840 AV * const av = MUTABLE_AV(SvRV(rv));
1841 SV **const ary = AvARRAY(av);
1843 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1845 /* Here there are things that won't be known until runtime -- we
1846 * have to assume it could be anything */
1847 invlist = sv_2mortal(_new_invlist(1));
1848 return _add_range_to_invlist(invlist, 0, UV_MAX);
1850 else if (ary[INVLIST_INDEX]) {
1852 /* Use the node's inversion list */
1853 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1856 /* Get the code points valid only under UTF-8 locales */
1857 if ( (flags & ANYOFL_FOLD)
1858 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1860 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1865 invlist = sv_2mortal(_new_invlist(0));
1868 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1869 * code points, and an inversion list for the others, but if there are code
1870 * points that should match only conditionally on the target string being
1871 * UTF-8, those are placed in the inversion list, and not the bitmap.
1872 * Since there are circumstances under which they could match, they are
1873 * included in the SSC. But if the ANYOF node is to be inverted, we have
1874 * to exclude them here, so that when we invert below, the end result
1875 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1876 * have to do this here before we add the unconditionally matched code
1878 if (flags & ANYOF_INVERT) {
1879 _invlist_intersection_complement_2nd(invlist,
1884 /* Add in the points from the bit map */
1885 if (REGNODE_TYPE(OP(node)) == ANYOF){
1886 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1887 if (ANYOF_BITMAP_TEST(node, i)) {
1888 unsigned int start = i++;
1890 for (; i < NUM_ANYOF_CODE_POINTS
1891 && ANYOF_BITMAP_TEST(node, i); ++i)
1895 invlist = _add_range_to_invlist(invlist, start, i-1);
1896 new_node_has_latin1 = TRUE;
1901 /* If this can match all upper Latin1 code points, have to add them
1902 * as well. But don't add them if inverting, as when that gets done below,
1903 * it would exclude all these characters, including the ones it shouldn't
1904 * that were added just above */
1905 if ( ! (flags & ANYOF_INVERT)
1906 && OP(node) == ANYOFD
1907 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
1909 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1912 /* Similarly for these */
1913 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
1914 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1917 if (flags & ANYOF_INVERT) {
1918 _invlist_invert(invlist);
1920 else if (flags & ANYOFL_FOLD) {
1921 if (new_node_has_latin1) {
1923 /* These folds are potential in Turkic locales */
1924 if (_invlist_contains_cp(invlist, 'i')) {
1925 invlist = add_cp_to_invlist(invlist,
1926 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1928 if (_invlist_contains_cp(invlist, 'I')) {
1929 invlist = add_cp_to_invlist(invlist,
1930 LATIN_SMALL_LETTER_DOTLESS_I);
1933 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1934 * the locale. We can skip this if there are no 0-255 at all. */
1935 _invlist_union(invlist, PL_Latin1, &invlist);
1938 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1939 invlist = add_cp_to_invlist(invlist, 'I');
1941 if (_invlist_contains_cp(invlist,
1942 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1944 invlist = add_cp_to_invlist(invlist, 'i');
1949 /* Similarly add the UTF-8 locale possible matches. These have to be
1950 * deferred until after the non-UTF-8 locale ones are taken care of just
1951 * above, or it leads to wrong results under ANYOF_INVERT */
1952 if (only_utf8_locale_invlist) {
1953 _invlist_union_maybe_complement_2nd(invlist,
1954 only_utf8_locale_invlist,
1955 flags & ANYOF_INVERT,
1962 /* These two functions currently do the exact same thing */
1963 #define ssc_init_zero ssc_init
1965 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1966 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1968 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1969 * should not be inverted. */
1972 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1973 const regnode_charclass *and_with)
1975 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1976 * another SSC or a regular ANYOF class. Can create false positives. */
1979 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
1980 ? ANYOF_FLAGS(and_with)
1984 PERL_ARGS_ASSERT_SSC_AND;
1986 assert(is_ANYOF_SYNTHETIC(ssc));
1988 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1989 * the code point inversion list and just the relevant flags */
1990 if (is_ANYOF_SYNTHETIC(and_with)) {
1991 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1992 anded_flags = and_with_flags;
1994 /* XXX This is a kludge around what appears to be deficiencies in the
1995 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1996 * there are paths through the optimizer where it doesn't get weeded
1997 * out when it should. And if we don't make some extra provision for
1998 * it like the code just below, it doesn't get added when it should.
1999 * This solution is to add it only when AND'ing, which is here, and
2000 * only when what is being AND'ed is the pristine, original node
2001 * matching anything. Thus it is like adding it to ssc_anything() but
2002 * only when the result is to be AND'ed. Probably the same solution
2003 * could be adopted for the same problem we have with /l matching,
2004 * which is solved differently in S_ssc_init(), and that would lead to
2005 * fewer false positives than that solution has. But if this solution
2006 * creates bugs, the consequences are only that a warning isn't raised
2007 * that should be; while the consequences for having /l bugs is
2008 * incorrect matches */
2009 if (ssc_is_anything((regnode_ssc *)and_with)) {
2010 anded_flags |= ANYOF_WARN_SUPER__shared;
2014 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
2015 if (OP(and_with) == ANYOFD) {
2016 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
2019 anded_flags = and_with_flags
2020 & ( ANYOF_COMMON_FLAGS
2021 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2022 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
2023 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
2024 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
2029 ANYOF_FLAGS(ssc) &= anded_flags;
2031 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2032 * C2 is the list of code points in 'and-with'; P2, its posix classes.
2033 * 'and_with' may be inverted. When not inverted, we have the situation of
2035 * (C1 | P1) & (C2 | P2)
2036 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
2037 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
2038 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
2039 * <= ((C1 & C2) | P1 | P2)
2040 * Alternatively, the last few steps could be:
2041 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
2042 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
2043 * <= (C1 | C2 | (P1 & P2))
2044 * We favor the second approach if either P1 or P2 is non-empty. This is
2045 * because these components are a barrier to doing optimizations, as what
2046 * they match cannot be known until the moment of matching as they are
2047 * dependent on the current locale, 'AND"ing them likely will reduce or
2049 * But we can do better if we know that C1,P1 are in their initial state (a
2050 * frequent occurrence), each matching everything:
2051 * (<everything>) & (C2 | P2) = C2 | P2
2052 * Similarly, if C2,P2 are in their initial state (again a frequent
2053 * occurrence), the result is a no-op
2054 * (C1 | P1) & (<everything>) = C1 | P1
2057 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
2058 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
2059 * <= (C1 & ~C2) | (P1 & ~P2)
2062 if ((and_with_flags & ANYOF_INVERT)
2063 && ! is_ANYOF_SYNTHETIC(and_with))
2067 ssc_intersection(ssc,
2069 FALSE /* Has already been inverted */
2072 /* If either P1 or P2 is empty, the intersection will be also; can skip
2074 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
2075 ANYOF_POSIXL_ZERO(ssc);
2077 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2079 /* Note that the Posix class component P from 'and_with' actually
2081 * P = Pa | Pb | ... | Pn
2082 * where each component is one posix class, such as in [\w\s].
2084 * ~P = ~(Pa | Pb | ... | Pn)
2085 * = ~Pa & ~Pb & ... & ~Pn
2086 * <= ~Pa | ~Pb | ... | ~Pn
2087 * The last is something we can easily calculate, but unfortunately
2088 * is likely to have many false positives. We could do better
2089 * in some (but certainly not all) instances if two classes in
2090 * P have known relationships. For example
2091 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
2093 * :lower: & :print: = :lower:
2094 * And similarly for classes that must be disjoint. For example,
2095 * since \s and \w can have no elements in common based on rules in
2096 * the POSIX standard,
2097 * \w & ^\S = nothing
2098 * Unfortunately, some vendor locales do not meet the Posix
2099 * standard, in particular almost everything by Microsoft.
2100 * The loop below just changes e.g., \w into \W and vice versa */
2102 regnode_charclass_posixl temp;
2103 int add = 1; /* To calculate the index of the complement */
2105 Zero(&temp, 1, regnode_charclass_posixl);
2106 ANYOF_POSIXL_ZERO(&temp);
2107 for (i = 0; i < ANYOF_MAX; i++) {
2109 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
2110 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
2112 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
2113 ANYOF_POSIXL_SET(&temp, i + add);
2115 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
2117 ANYOF_POSIXL_AND(&temp, ssc);
2119 } /* else ssc already has no posixes */
2120 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
2121 in its initial state */
2122 else if (! is_ANYOF_SYNTHETIC(and_with)
2123 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
2125 /* But if 'ssc' is in its initial state, the result is just 'and_with';
2126 * copy it over 'ssc' */
2127 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
2128 if (is_ANYOF_SYNTHETIC(and_with)) {
2129 StructCopy(and_with, ssc, regnode_ssc);
2132 ssc->invlist = anded_cp_list;
2133 ANYOF_POSIXL_ZERO(ssc);
2134 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
2135 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
2139 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
2140 || (and_with_flags & ANYOF_MATCHES_POSIXL))
2142 /* One or the other of P1, P2 is non-empty. */
2143 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
2144 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
2146 ssc_union(ssc, anded_cp_list, FALSE);
2148 else { /* P1 = P2 = empty */
2149 ssc_intersection(ssc, anded_cp_list, FALSE);
2155 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
2156 const regnode_charclass *or_with)
2158 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
2159 * another SSC or a regular ANYOF class. Can create false positives if
2160 * 'or_with' is to be inverted. */
2164 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
2165 ? ANYOF_FLAGS(or_with)
2168 PERL_ARGS_ASSERT_SSC_OR;
2170 assert(is_ANYOF_SYNTHETIC(ssc));
2172 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2173 * the code point inversion list and just the relevant flags */
2174 if (is_ANYOF_SYNTHETIC(or_with)) {
2175 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2176 ored_flags = or_with_flags;
2179 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2180 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2181 if (OP(or_with) != ANYOFD) {
2183 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2184 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
2185 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
2186 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
2191 ANYOF_FLAGS(ssc) |= ored_flags;
2193 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2194 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2195 * 'or_with' may be inverted. When not inverted, we have the simple
2196 * situation of computing:
2197 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2198 * If P1|P2 yields a situation with both a class and its complement are
2199 * set, like having both \w and \W, this matches all code points, and we
2200 * can delete these from the P component of the ssc going forward. XXX We
2201 * might be able to delete all the P components, but I (khw) am not certain
2202 * about this, and it is better to be safe.
2205 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2206 * <= (C1 | P1) | ~C2
2207 * <= (C1 | ~C2) | P1
2208 * (which results in actually simpler code than the non-inverted case)
2211 if ((or_with_flags & ANYOF_INVERT)
2212 && ! is_ANYOF_SYNTHETIC(or_with))
2214 /* We ignore P2, leaving P1 going forward */
2215 } /* else Not inverted */
2216 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2217 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2218 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2220 for (i = 0; i < ANYOF_MAX; i += 2) {
2221 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2223 ssc_match_all_cp(ssc);
2224 ANYOF_POSIXL_CLEAR(ssc, i);
2225 ANYOF_POSIXL_CLEAR(ssc, i+1);
2233 FALSE /* Already has been inverted */
2238 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2240 PERL_ARGS_ASSERT_SSC_UNION;
2242 assert(is_ANYOF_SYNTHETIC(ssc));
2244 _invlist_union_maybe_complement_2nd(ssc->invlist,
2251 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2253 const bool invert2nd)
2255 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2257 assert(is_ANYOF_SYNTHETIC(ssc));
2259 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2266 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2268 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2270 assert(is_ANYOF_SYNTHETIC(ssc));
2272 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2276 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2278 /* AND just the single code point 'cp' into the SSC 'ssc' */
2280 SV* cp_list = _new_invlist(2);
2282 PERL_ARGS_ASSERT_SSC_CP_AND;
2284 assert(is_ANYOF_SYNTHETIC(ssc));
2286 cp_list = add_cp_to_invlist(cp_list, cp);
2287 ssc_intersection(ssc, cp_list,
2288 FALSE /* Not inverted */
2290 SvREFCNT_dec_NN(cp_list);
2294 S_ssc_clear_locale(regnode_ssc *ssc)
2296 /* Set the SSC 'ssc' to not match any locale things */
2297 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2299 assert(is_ANYOF_SYNTHETIC(ssc));
2301 ANYOF_POSIXL_ZERO(ssc);
2302 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2306 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2308 /* The synthetic start class is used to hopefully quickly winnow down
2309 * places where a pattern could start a match in the target string. If it
2310 * doesn't really narrow things down that much, there isn't much point to
2311 * having the overhead of using it. This function uses some very crude
2312 * heuristics to decide if to use the ssc or not.
2314 * It returns TRUE if 'ssc' rules out more than half what it considers to
2315 * be the "likely" possible matches, but of course it doesn't know what the
2316 * actual things being matched are going to be; these are only guesses
2318 * For /l matches, it assumes that the only likely matches are going to be
2319 * in the 0-255 range, uniformly distributed, so half of that is 127
2320 * For /a and /d matches, it assumes that the likely matches will be just
2321 * the ASCII range, so half of that is 63
2322 * For /u and there isn't anything matching above the Latin1 range, it
2323 * assumes that that is the only range likely to be matched, and uses
2324 * half that as the cut-off: 127. If anything matches above Latin1,
2325 * it assumes that all of Unicode could match (uniformly), except for
2326 * non-Unicode code points and things in the General Category "Other"
2327 * (unassigned, private use, surrogates, controls and formats). This
2328 * is a much large number. */
2330 U32 count = 0; /* Running total of number of code points matched by
2332 UV start, end; /* Start and end points of current range in inversion
2333 XXX outdated. UTF-8 locales are common, what about invert? list */
2334 const U32 max_code_points = (LOC)
2336 : (( ! UNI_SEMANTICS
2337 || invlist_highest(ssc->invlist) < 256)
2340 const U32 max_match = max_code_points / 2;
2342 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2344 invlist_iterinit(ssc->invlist);
2345 while (invlist_iternext(ssc->invlist, &start, &end)) {
2346 if (start >= max_code_points) {
2349 end = MIN(end, max_code_points - 1);
2350 count += end - start + 1;
2351 if (count >= max_match) {
2352 invlist_iterfinish(ssc->invlist);
2362 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2364 /* The inversion list in the SSC is marked mortal; now we need a more
2365 * permanent copy, which is stored the same way that is done in a regular
2366 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2369 SV* invlist = invlist_clone(ssc->invlist, NULL);
2371 PERL_ARGS_ASSERT_SSC_FINALIZE;
2373 assert(is_ANYOF_SYNTHETIC(ssc));
2375 /* The code in this file assumes that all but these flags aren't relevant
2376 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2377 * by the time we reach here */
2378 assert(! (ANYOF_FLAGS(ssc)
2379 & ~( ANYOF_COMMON_FLAGS
2380 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
2381 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
2383 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
2385 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2386 SvREFCNT_dec(invlist);
2388 /* Make sure is clone-safe */
2389 ssc->invlist = NULL;
2391 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2392 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2393 OP(ssc) = ANYOFPOSIXL;
2395 else if (RExC_contains_locale) {
2399 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2402 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2403 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2404 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2405 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2406 ? (TRIE_LIST_CUR( idx ) - 1) \
2412 dump_trie(trie,widecharmap,revcharmap)
2413 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2414 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2416 These routines dump out a trie in a somewhat readable format.
2417 The _interim_ variants are used for debugging the interim
2418 tables that are used to generate the final compressed
2419 representation which is what dump_trie expects.
2421 Part of the reason for their existence is to provide a form
2422 of documentation as to how the different representations function.
2427 Dumps the final compressed table form of the trie to Perl_debug_log.
2428 Used for debugging make_trie().
2432 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2433 AV *revcharmap, U32 depth)
2436 SV *sv=sv_newmortal();
2437 int colwidth= widecharmap ? 6 : 4;
2439 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2441 PERL_ARGS_ASSERT_DUMP_TRIE;
2443 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2444 depth+1, "Match","Base","Ofs" );
2446 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2447 SV ** const tmp = av_fetch( revcharmap, state, 0);
2449 Perl_re_printf( aTHX_ "%*s",
2451 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2452 PL_colors[0], PL_colors[1],
2453 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2454 PERL_PV_ESCAPE_FIRSTCHAR
2459 Perl_re_printf( aTHX_ "\n");
2460 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2462 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2463 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2464 Perl_re_printf( aTHX_ "\n");
2466 for( state = 1 ; state < trie->statecount ; state++ ) {
2467 const U32 base = trie->states[ state ].trans.base;
2469 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2471 if ( trie->states[ state ].wordnum ) {
2472 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2474 Perl_re_printf( aTHX_ "%6s", "" );
2477 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2482 while( ( base + ofs < trie->uniquecharcount ) ||
2483 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2484 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2488 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2490 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2491 if ( ( base + ofs >= trie->uniquecharcount )
2492 && ( base + ofs - trie->uniquecharcount
2494 && trie->trans[ base + ofs
2495 - trie->uniquecharcount ].check == state )
2497 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2498 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2501 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2505 Perl_re_printf( aTHX_ "]");
2508 Perl_re_printf( aTHX_ "\n" );
2510 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2512 for (word=1; word <= trie->wordcount; word++) {
2513 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2514 (int)word, (int)(trie->wordinfo[word].prev),
2515 (int)(trie->wordinfo[word].len));
2517 Perl_re_printf( aTHX_ "\n" );
2520 Dumps a fully constructed but uncompressed trie in list form.
2521 List tries normally only are used for construction when the number of
2522 possible chars (trie->uniquecharcount) is very high.
2523 Used for debugging make_trie().
2526 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2527 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2531 SV *sv=sv_newmortal();
2532 int colwidth= widecharmap ? 6 : 4;
2533 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2535 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2537 /* print out the table precompression. */
2538 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2540 Perl_re_indentf( aTHX_ "%s",
2541 depth+1, "------:-----+-----------------\n" );
2543 for( state=1 ; state < next_alloc ; state ++ ) {
2546 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2547 depth+1, (UV)state );
2548 if ( ! trie->states[ state ].wordnum ) {
2549 Perl_re_printf( aTHX_ "%5s| ","");
2551 Perl_re_printf( aTHX_ "W%4x| ",
2552 trie->states[ state ].wordnum
2555 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2556 SV ** const tmp = av_fetch( revcharmap,
2557 TRIE_LIST_ITEM(state, charid).forid, 0);
2559 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2561 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2563 PL_colors[0], PL_colors[1],
2564 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2565 | PERL_PV_ESCAPE_FIRSTCHAR
2567 TRIE_LIST_ITEM(state, charid).forid,
2568 (UV)TRIE_LIST_ITEM(state, charid).newstate
2571 Perl_re_printf( aTHX_ "\n%*s| ",
2572 (int)((depth * 2) + 14), "");
2575 Perl_re_printf( aTHX_ "\n");
2580 Dumps a fully constructed but uncompressed trie in table form.
2581 This is the normal DFA style state transition table, with a few
2582 twists to facilitate compression later.
2583 Used for debugging make_trie().
2586 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2587 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2592 SV *sv=sv_newmortal();
2593 int colwidth= widecharmap ? 6 : 4;
2594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2596 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2599 print out the table precompression so that we can do a visual check
2600 that they are identical.
2603 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2605 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2606 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2608 Perl_re_printf( aTHX_ "%*s",
2610 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2611 PL_colors[0], PL_colors[1],
2612 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2613 PERL_PV_ESCAPE_FIRSTCHAR
2619 Perl_re_printf( aTHX_ "\n");
2620 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2622 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2623 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2626 Perl_re_printf( aTHX_ "\n" );
2628 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2630 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2632 (UV)TRIE_NODENUM( state ) );
2634 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2635 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2637 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2639 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2641 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2642 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2643 (UV)trie->trans[ state ].check );
2645 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2646 (UV)trie->trans[ state ].check,
2647 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2655 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2656 startbranch: the first branch in the whole branch sequence
2657 first : start branch of sequence of branch-exact nodes.
2658 May be the same as startbranch
2659 last : Thing following the last branch.
2660 May be the same as tail.
2661 tail : item following the branch sequence
2662 count : words in the sequence
2663 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2664 depth : indent depth
2666 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2668 A trie is an N'ary tree where the branches are determined by digital
2669 decomposition of the key. IE, at the root node you look up the 1st character and
2670 follow that branch repeat until you find the end of the branches. Nodes can be
2671 marked as "accepting" meaning they represent a complete word. Eg:
2675 would convert into the following structure. Numbers represent states, letters
2676 following numbers represent valid transitions on the letter from that state, if
2677 the number is in square brackets it represents an accepting state, otherwise it
2678 will be in parenthesis.
2680 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2684 (1) +-i->(6)-+-s->[7]
2686 +-s->(3)-+-h->(4)-+-e->[5]
2688 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2690 This shows that when matching against the string 'hers' we will begin at state 1
2691 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2692 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2693 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2694 single traverse. We store a mapping from accepting to state to which word was
2695 matched, and then when we have multiple possibilities we try to complete the
2696 rest of the regex in the order in which they occurred in the alternation.
2698 The only prior NFA like behaviour that would be changed by the TRIE support is
2699 the silent ignoring of duplicate alternations which are of the form:
2701 / (DUPE|DUPE) X? (?{ ... }) Y /x
2703 Thus EVAL blocks following a trie may be called a different number of times with
2704 and without the optimisation. With the optimisations dupes will be silently
2705 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2706 the following demonstrates:
2708 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2710 which prints out 'word' three times, but
2712 'words'=~/(word|word|word)(?{ print $1 })S/
2714 which doesnt print it out at all. This is due to other optimisations kicking in.
2716 Example of what happens on a structural level:
2718 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2720 1: CURLYM[1] {1,32767}(18)
2731 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2732 and should turn into:
2734 1: CURLYM[1] {1,32767}(18)
2736 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2744 Cases where tail != last would be like /(?foo|bar)baz/:
2754 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2755 and would end up looking like:
2758 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2765 d = uvchr_to_utf8_flags(d, uv, 0);
2767 is the recommended Unicode-aware way of saying
2772 #define TRIE_STORE_REVCHAR(val) \
2775 SV *zlopp = newSV(UTF8_MAXBYTES); \
2776 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2777 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2779 SvCUR_set(zlopp, kapow - flrbbbbb); \
2782 av_push(revcharmap, zlopp); \
2784 char ooooff = (char)val; \
2785 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2789 /* This gets the next character from the input, folding it if not already
2791 #define TRIE_READ_CHAR STMT_START { \
2794 /* if it is UTF then it is either already folded, or does not need \
2796 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2798 else if (folder == PL_fold_latin1) { \
2799 /* This folder implies Unicode rules, which in the range expressible \
2800 * by not UTF is the lower case, with the two exceptions, one of \
2801 * which should have been taken care of before calling this */ \
2802 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2803 uvc = toLOWER_L1(*uc); \
2804 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2807 /* raw data, will be folded later if needed */ \
2815 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2816 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2817 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2818 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2819 TRIE_LIST_LEN( state ) = ging; \
2821 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2823 TRIE_LIST_CUR( state )++; \
2826 #define TRIE_LIST_NEW(state) STMT_START { \
2827 Newx( trie->states[ state ].trans.list, \
2828 4, reg_trie_trans_le ); \
2829 TRIE_LIST_CUR( state ) = 1; \
2830 TRIE_LIST_LEN( state ) = 4; \
2833 #define TRIE_HANDLE_WORD(state) STMT_START { \
2834 U16 dupe= trie->states[ state ].wordnum; \
2835 regnode * const noper_next = regnext( noper ); \
2838 /* store the word for dumping */ \
2840 if (OP(noper) != NOTHING) \
2841 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2843 tmp = newSVpvn_utf8( "", 0, UTF ); \
2844 av_push( trie_words, tmp ); \
2848 trie->wordinfo[curword].prev = 0; \
2849 trie->wordinfo[curword].len = wordlen; \
2850 trie->wordinfo[curword].accept = state; \
2852 if ( noper_next < tail ) { \
2854 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2856 trie->jump[curword] = (U16)(noper_next - convert); \
2858 jumper = noper_next; \
2860 nextbranch= regnext(cur); \
2864 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2865 /* chain, so that when the bits of chain are later */\
2866 /* linked together, the dups appear in the chain */\
2867 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2868 trie->wordinfo[dupe].prev = curword; \
2870 /* we haven't inserted this word yet. */ \
2871 trie->states[ state ].wordnum = curword; \
2876 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2877 ( ( base + charid >= ucharcount \
2878 && base + charid < ubound \
2879 && state == trie->trans[ base - ucharcount + charid ].check \
2880 && trie->trans[ base - ucharcount + charid ].next ) \
2881 ? trie->trans[ base - ucharcount + charid ].next \
2882 : ( state==1 ? special : 0 ) \
2885 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2887 TRIE_BITMAP_SET(trie, uvc); \
2888 /* store the folded codepoint */ \
2890 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2893 /* store first byte of utf8 representation of */ \
2894 /* variant codepoints */ \
2895 if (! UVCHR_IS_INVARIANT(uvc)) { \
2896 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2901 #define MADE_JUMP_TRIE 2
2902 #define MADE_EXACT_TRIE 4
2905 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2906 regnode *first, regnode *last, regnode *tail,
2907 U32 word_count, U32 flags, U32 depth)
2909 /* first pass, loop through and scan words */
2910 reg_trie_data *trie;
2911 HV *widecharmap = NULL;
2912 AV *revcharmap = newAV();
2918 regnode *jumper = NULL;
2919 regnode *nextbranch = NULL;
2920 regnode *convert = NULL;
2921 U32 *prev_states; /* temp array mapping each state to previous one */
2922 /* we just use folder as a flag in utf8 */
2923 const U8 * folder = NULL;
2925 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2926 * which stands for one trie structure, one hash, optionally followed
2929 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2930 AV *trie_words = NULL;
2931 /* along with revcharmap, this only used during construction but both are
2932 * useful during debugging so we store them in the struct when debugging.
2935 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2936 STRLEN trie_charcount=0;
2938 SV *re_trie_maxbuff;
2939 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2941 PERL_ARGS_ASSERT_MAKE_TRIE;
2943 PERL_UNUSED_ARG(depth);
2947 case EXACT: case EXACT_REQ8: case EXACTL: break;
2951 case EXACTFLU8: folder = PL_fold_latin1; break;
2952 case EXACTF: folder = PL_fold; break;
2953 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
2956 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2958 trie->startstate = 1;
2959 trie->wordcount = word_count;
2960 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2961 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2962 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2963 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2964 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2965 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2968 trie_words = newAV();
2971 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2972 assert(re_trie_maxbuff);
2973 if (!SvIOK(re_trie_maxbuff)) {
2974 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2976 DEBUG_TRIE_COMPILE_r({
2977 Perl_re_indentf( aTHX_
2978 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2980 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2981 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2984 /* Find the node we are going to overwrite */
2985 if ( first == startbranch && OP( last ) != BRANCH ) {
2986 /* whole branch chain */
2989 /* branch sub-chain */
2990 convert = REGNODE_AFTER( first );
2993 /* -- First loop and Setup --
2995 We first traverse the branches and scan each word to determine if it
2996 contains widechars, and how many unique chars there are, this is
2997 important as we have to build a table with at least as many columns as we
3000 We use an array of integers to represent the character codes 0..255
3001 (trie->charmap) and we use a an HV* to store Unicode characters. We use
3002 the native representation of the character value as the key and IV's for
3005 *TODO* If we keep track of how many times each character is used we can
3006 remap the columns so that the table compression later on is more
3007 efficient in terms of memory by ensuring the most common value is in the
3008 middle and the least common are on the outside. IMO this would be better
3009 than a most to least common mapping as theres a decent chance the most
3010 common letter will share a node with the least common, meaning the node
3011 will not be compressible. With a middle is most common approach the worst
3012 case is when we have the least common nodes twice.
3016 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3017 regnode *noper = REGNODE_AFTER( cur );
3021 U32 wordlen = 0; /* required init */
3022 STRLEN minchars = 0;
3023 STRLEN maxchars = 0;
3024 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
3027 if (OP(noper) == NOTHING) {
3028 /* skip past a NOTHING at the start of an alternation
3029 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
3031 * If the next node is not something we are supposed to process
3032 * we will just ignore it due to the condition guarding the
3036 regnode *noper_next= regnext(noper);
3037 if (noper_next < tail)
3042 && ( OP(noper) == flags
3043 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3044 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3045 || OP(noper) == EXACTFUP))))
3047 uc= (U8*)STRING(noper);
3048 e= uc + STR_LEN(noper);
3055 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
3056 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
3057 regardless of encoding */
3058 if (OP( noper ) == EXACTFUP) {
3059 /* false positives are ok, so just set this */
3060 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
3064 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
3066 TRIE_CHARCOUNT(trie)++;
3069 /* TRIE_READ_CHAR returns the current character, or its fold if /i
3070 * is in effect. Under /i, this character can match itself, or
3071 * anything that folds to it. If not under /i, it can match just
3072 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
3073 * all fold to k, and all are single characters. But some folds
3074 * expand to more than one character, so for example LATIN SMALL
3075 * LIGATURE FFI folds to the three character sequence 'ffi'. If
3076 * the string beginning at 'uc' is 'ffi', it could be matched by
3077 * three characters, or just by the one ligature character. (It
3078 * could also be matched by two characters: LATIN SMALL LIGATURE FF
3079 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
3080 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
3081 * match.) The trie needs to know the minimum and maximum number
3082 * of characters that could match so that it can use size alone to
3083 * quickly reject many match attempts. The max is simple: it is
3084 * the number of folded characters in this branch (since a fold is
3085 * never shorter than what folds to it. */
3089 /* And the min is equal to the max if not under /i (indicated by
3090 * 'folder' being NULL), or there are no multi-character folds. If
3091 * there is a multi-character fold, the min is incremented just
3092 * once, for the character that folds to the sequence. Each
3093 * character in the sequence needs to be added to the list below of
3094 * characters in the trie, but we count only the first towards the
3095 * min number of characters needed. This is done through the
3096 * variable 'foldlen', which is returned by the macros that look
3097 * for these sequences as the number of bytes the sequence
3098 * occupies. Each time through the loop, we decrement 'foldlen' by
3099 * how many bytes the current char occupies. Only when it reaches
3100 * 0 do we increment 'minchars' or look for another multi-character
3102 if (folder == NULL) {
3105 else if (foldlen > 0) {
3106 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
3111 /* See if *uc is the beginning of a multi-character fold. If
3112 * so, we decrement the length remaining to look at, to account
3113 * for the current character this iteration. (We can use 'uc'
3114 * instead of the fold returned by TRIE_READ_CHAR because the
3115 * macro is smart enough to account for any unfolded
3118 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
3119 foldlen -= UTF8SKIP(uc);
3122 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
3127 /* The current character (and any potential folds) should be added
3128 * to the possible matching characters for this position in this
3132 U8 folded= folder[ (U8) uvc ];
3133 if ( !trie->charmap[ folded ] ) {
3134 trie->charmap[ folded ]=( ++trie->uniquecharcount );
3135 TRIE_STORE_REVCHAR( folded );
3138 if ( !trie->charmap[ uvc ] ) {
3139 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
3140 TRIE_STORE_REVCHAR( uvc );
3143 /* store the codepoint in the bitmap, and its folded
3145 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
3146 set_bit = 0; /* We've done our bit :-) */
3150 /* XXX We could come up with the list of code points that fold
3151 * to this using PL_utf8_foldclosures, except not for
3152 * multi-char folds, as there may be multiple combinations
3153 * there that could work, which needs to wait until runtime to
3154 * resolve (The comment about LIGATURE FFI above is such an
3159 widecharmap = newHV();
3161 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
3164 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3166 if ( !SvTRUE( *svpp ) ) {
3167 sv_setiv( *svpp, ++trie->uniquecharcount );
3168 TRIE_STORE_REVCHAR(uvc);
3171 } /* end loop through characters in this branch of the trie */
3173 /* We take the min and max for this branch and combine to find the min
3174 * and max for all branches processed so far */
3175 if( cur == first ) {
3176 trie->minlen = minchars;
3177 trie->maxlen = maxchars;
3178 } else if (minchars < trie->minlen) {
3179 trie->minlen = minchars;
3180 } else if (maxchars > trie->maxlen) {
3181 trie->maxlen = maxchars;
3183 } /* end first pass */
3184 DEBUG_TRIE_COMPILE_r(
3185 Perl_re_indentf( aTHX_
3186 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3188 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3189 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3190 (int)trie->minlen, (int)trie->maxlen )
3194 We now know what we are dealing with in terms of unique chars and
3195 string sizes so we can calculate how much memory a naive
3196 representation using a flat table will take. If it's over a reasonable
3197 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3198 conservative but potentially much slower representation using an array
3201 At the end we convert both representations into the same compressed
3202 form that will be used in regexec.c for matching with. The latter
3203 is a form that cannot be used to construct with but has memory
3204 properties similar to the list form and access properties similar
3205 to the table form making it both suitable for fast searches and
3206 small enough that its feasable to store for the duration of a program.
3208 See the comment in the code where the compressed table is produced
3209 inplace from the flat tabe representation for an explanation of how
3210 the compression works.
3215 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3218 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3219 > SvIV(re_trie_maxbuff) )
3222 Second Pass -- Array Of Lists Representation
3224 Each state will be represented by a list of charid:state records
3225 (reg_trie_trans_le) the first such element holds the CUR and LEN
3226 points of the allocated array. (See defines above).
3228 We build the initial structure using the lists, and then convert
3229 it into the compressed table form which allows faster lookups
3230 (but cant be modified once converted).
3233 STRLEN transcount = 1;
3235 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3238 trie->states = (reg_trie_state *)
3239 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3240 sizeof(reg_trie_state) );
3244 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3246 regnode *noper = REGNODE_AFTER( cur );
3247 U32 state = 1; /* required init */
3248 U16 charid = 0; /* sanity init */
3249 U32 wordlen = 0; /* required init */
3251 if (OP(noper) == NOTHING) {
3252 regnode *noper_next= regnext(noper);
3253 if (noper_next < tail)
3255 /* we will undo this assignment if noper does not
3256 * point at a trieable type in the else clause of
3257 * the following statement. */
3261 && ( OP(noper) == flags
3262 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3263 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3264 || OP(noper) == EXACTFUP))))
3266 const U8 *uc= (U8*)STRING(noper);
3267 const U8 *e= uc + STR_LEN(noper);
3269 for ( ; uc < e ; uc += len ) {
3274 charid = trie->charmap[ uvc ];
3276 SV** const svpp = hv_fetch( widecharmap,
3283 charid=(U16)SvIV( *svpp );
3286 /* charid is now 0 if we dont know the char read, or
3287 * nonzero if we do */
3294 if ( !trie->states[ state ].trans.list ) {
3295 TRIE_LIST_NEW( state );
3298 check <= TRIE_LIST_USED( state );
3301 if ( TRIE_LIST_ITEM( state, check ).forid
3304 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3309 newstate = next_alloc++;
3310 prev_states[newstate] = state;
3311 TRIE_LIST_PUSH( state, charid, newstate );
3316 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3320 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3321 * on a trieable type. So we need to reset noper back to point at the first regop
3322 * in the branch before we call TRIE_HANDLE_WORD()
3324 noper= REGNODE_AFTER(cur);
3326 TRIE_HANDLE_WORD(state);
3328 } /* end second pass */
3330 /* next alloc is the NEXT state to be allocated */
3331 trie->statecount = next_alloc;
3332 trie->states = (reg_trie_state *)
3333 PerlMemShared_realloc( trie->states,
3335 * sizeof(reg_trie_state) );
3337 /* and now dump it out before we compress it */
3338 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3339 revcharmap, next_alloc,
3343 trie->trans = (reg_trie_trans *)
3344 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3351 for( state=1 ; state < next_alloc ; state ++ ) {
3355 DEBUG_TRIE_COMPILE_MORE_r(
3356 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3360 if (trie->states[state].trans.list) {
3361 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3365 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3366 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3367 if ( forid < minid ) {
3369 } else if ( forid > maxid ) {
3373 if ( transcount < tp + maxid - minid + 1) {
3375 trie->trans = (reg_trie_trans *)
3376 PerlMemShared_realloc( trie->trans,
3378 * sizeof(reg_trie_trans) );
3379 Zero( trie->trans + (transcount / 2),
3383 base = trie->uniquecharcount + tp - minid;
3384 if ( maxid == minid ) {
3386 for ( ; zp < tp ; zp++ ) {
3387 if ( ! trie->trans[ zp ].next ) {
3388 base = trie->uniquecharcount + zp - minid;
3389 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3391 trie->trans[ zp ].check = state;
3397 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3399 trie->trans[ tp ].check = state;
3404 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3405 const U32 tid = base
3406 - trie->uniquecharcount
3407 + TRIE_LIST_ITEM( state, idx ).forid;
3408 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3410 trie->trans[ tid ].check = state;
3412 tp += ( maxid - minid + 1 );
3414 Safefree(trie->states[ state ].trans.list);
3417 DEBUG_TRIE_COMPILE_MORE_r(
3418 Perl_re_printf( aTHX_ " base: %d\n",base);
3421 trie->states[ state ].trans.base=base;
3423 trie->lasttrans = tp + 1;
3427 Second Pass -- Flat Table Representation.
3429 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3430 each. We know that we will need Charcount+1 trans at most to store
3431 the data (one row per char at worst case) So we preallocate both
3432 structures assuming worst case.
3434 We then construct the trie using only the .next slots of the entry
3437 We use the .check field of the first entry of the node temporarily
3438 to make compression both faster and easier by keeping track of how
3439 many non zero fields are in the node.
3441 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3444 There are two terms at use here: state as a TRIE_NODEIDX() which is
3445 a number representing the first entry of the node, and state as a
3446 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3447 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3448 if there are 2 entrys per node. eg:
3456 The table is internally in the right hand, idx form. However as we
3457 also have to deal with the states array which is indexed by nodenum
3458 we have to use TRIE_NODENUM() to convert.
3461 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3464 trie->trans = (reg_trie_trans *)
3465 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3466 * trie->uniquecharcount + 1,
3467 sizeof(reg_trie_trans) );
3468 trie->states = (reg_trie_state *)
3469 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3470 sizeof(reg_trie_state) );
3471 next_alloc = trie->uniquecharcount + 1;
3474 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3476 regnode *noper = REGNODE_AFTER( cur );
3478 U32 state = 1; /* required init */
3480 U16 charid = 0; /* sanity init */
3481 U32 accept_state = 0; /* sanity init */
3483 U32 wordlen = 0; /* required init */
3485 if (OP(noper) == NOTHING) {
3486 regnode *noper_next= regnext(noper);
3487 if (noper_next < tail)
3489 /* we will undo this assignment if noper does not
3490 * point at a trieable type in the else clause of
3491 * the following statement. */
3495 && ( OP(noper) == flags
3496 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3497 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3498 || OP(noper) == EXACTFUP))))
3500 const U8 *uc= (U8*)STRING(noper);
3501 const U8 *e= uc + STR_LEN(noper);
3503 for ( ; uc < e ; uc += len ) {
3508 charid = trie->charmap[ uvc ];
3510 SV* const * const svpp = hv_fetch( widecharmap,
3514 charid = svpp ? (U16)SvIV(*svpp) : 0;
3518 if ( !trie->trans[ state + charid ].next ) {
3519 trie->trans[ state + charid ].next = next_alloc;
3520 trie->trans[ state ].check++;
3521 prev_states[TRIE_NODENUM(next_alloc)]
3522 = TRIE_NODENUM(state);
3523 next_alloc += trie->uniquecharcount;
3525 state = trie->trans[ state + charid ].next;
3527 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3529 /* charid is now 0 if we dont know the char read, or
3530 * nonzero if we do */
3533 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3534 * on a trieable type. So we need to reset noper back to point at the first regop
3535 * in the branch before we call TRIE_HANDLE_WORD().
3537 noper= REGNODE_AFTER(cur);
3539 accept_state = TRIE_NODENUM( state );
3540 TRIE_HANDLE_WORD(accept_state);
3542 } /* end second pass */
3544 /* and now dump it out before we compress it */
3545 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3547 next_alloc, depth+1));
3551 * Inplace compress the table.*
3553 For sparse data sets the table constructed by the trie algorithm will
3554 be mostly 0/FAIL transitions or to put it another way mostly empty.
3555 (Note that leaf nodes will not contain any transitions.)
3557 This algorithm compresses the tables by eliminating most such
3558 transitions, at the cost of a modest bit of extra work during lookup:
3560 - Each states[] entry contains a .base field which indicates the
3561 index in the state[] array wheres its transition data is stored.
3563 - If .base is 0 there are no valid transitions from that node.
3565 - If .base is nonzero then charid is added to it to find an entry in
3568 -If trans[states[state].base+charid].check!=state then the
3569 transition is taken to be a 0/Fail transition. Thus if there are fail
3570 transitions at the front of the node then the .base offset will point
3571 somewhere inside the previous nodes data (or maybe even into a node
3572 even earlier), but the .check field determines if the transition is
3576 The following process inplace converts the table to the compressed
3577 table: We first do not compress the root node 1,and mark all its
3578 .check pointers as 1 and set its .base pointer as 1 as well. This
3579 allows us to do a DFA construction from the compressed table later,
3580 and ensures that any .base pointers we calculate later are greater
3583 - We set 'pos' to indicate the first entry of the second node.
3585 - We then iterate over the columns of the node, finding the first and
3586 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3587 and set the .check pointers accordingly, and advance pos
3588 appropriately and repreat for the next node. Note that when we copy
3589 the next pointers we have to convert them from the original
3590 NODEIDX form to NODENUM form as the former is not valid post
3593 - If a node has no transitions used we mark its base as 0 and do not
3594 advance the pos pointer.
3596 - If a node only has one transition we use a second pointer into the
3597 structure to fill in allocated fail transitions from other states.
3598 This pointer is independent of the main pointer and scans forward
3599 looking for null transitions that are allocated to a state. When it
3600 finds one it writes the single transition into the "hole". If the
3601 pointer doesnt find one the single transition is appended as normal.
3603 - Once compressed we can Renew/realloc the structures to release the
3606 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3607 specifically Fig 3.47 and the associated pseudocode.
3611 const U32 laststate = TRIE_NODENUM( next_alloc );
3614 trie->statecount = laststate;
3616 for ( state = 1 ; state < laststate ; state++ ) {
3618 const U32 stateidx = TRIE_NODEIDX( state );
3619 const U32 o_used = trie->trans[ stateidx ].check;
3620 U32 used = trie->trans[ stateidx ].check;
3621 trie->trans[ stateidx ].check = 0;
3624 used && charid < trie->uniquecharcount;
3627 if ( flag || trie->trans[ stateidx + charid ].next ) {
3628 if ( trie->trans[ stateidx + charid ].next ) {
3630 for ( ; zp < pos ; zp++ ) {
3631 if ( ! trie->trans[ zp ].next ) {
3635 trie->states[ state ].trans.base
3637 + trie->uniquecharcount
3639 trie->trans[ zp ].next
3640 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3642 trie->trans[ zp ].check = state;
3643 if ( ++zp > pos ) pos = zp;
3650 trie->states[ state ].trans.base
3651 = pos + trie->uniquecharcount - charid ;
3653 trie->trans[ pos ].next
3654 = SAFE_TRIE_NODENUM(
3655 trie->trans[ stateidx + charid ].next );
3656 trie->trans[ pos ].check = state;
3661 trie->lasttrans = pos + 1;
3662 trie->states = (reg_trie_state *)
3663 PerlMemShared_realloc( trie->states, laststate
3664 * sizeof(reg_trie_state) );
3665 DEBUG_TRIE_COMPILE_MORE_r(
3666 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3668 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3672 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3675 } /* end table compress */
3677 DEBUG_TRIE_COMPILE_MORE_r(
3678 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3680 (UV)trie->statecount,
3681 (UV)trie->lasttrans)
3683 /* resize the trans array to remove unused space */
3684 trie->trans = (reg_trie_trans *)
3685 PerlMemShared_realloc( trie->trans, trie->lasttrans
3686 * sizeof(reg_trie_trans) );
3688 { /* Modify the program and insert the new TRIE node */
3689 U8 nodetype =(U8) flags;
3693 regnode *optimize = NULL;
3694 #endif /* DEBUGGING */
3696 This means we convert either the first branch or the first Exact,
3697 depending on whether the thing following (in 'last') is a branch
3698 or not and whther first is the startbranch (ie is it a sub part of
3699 the alternation or is it the whole thing.)
3700 Assuming its a sub part we convert the EXACT otherwise we convert
3701 the whole branch sequence, including the first.
3703 /* Find the node we are going to overwrite */
3704 if ( first != startbranch || OP( last ) == BRANCH ) {
3705 /* branch sub-chain */
3706 NEXT_OFF( first ) = (U16)(last - first);
3707 /* whole branch chain */
3709 /* But first we check to see if there is a common prefix we can
3710 split out as an EXACT and put in front of the TRIE node. */
3711 trie->startstate= 1;
3712 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3713 /* we want to find the first state that has more than
3714 * one transition, if that state is not the first state
3715 * then we have a common prefix which we can remove.
3718 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3720 I32 first_ofs = -1; /* keeps track of the ofs of the first
3721 transition, -1 means none */
3723 const U32 base = trie->states[ state ].trans.base;
3725 /* does this state terminate an alternation? */
3726 if ( trie->states[state].wordnum )
3729 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3730 if ( ( base + ofs >= trie->uniquecharcount ) &&
3731 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3732 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3734 if ( ++count > 1 ) {
3735 /* we have more than one transition */
3738 /* if this is the first state there is no common prefix
3739 * to extract, so we can exit */
3740 if ( state == 1 ) break;
3741 tmp = av_fetch( revcharmap, ofs, 0);
3742 ch = (U8*)SvPV_nolen_const( *tmp );
3744 /* if we are on count 2 then we need to initialize the
3745 * bitmap, and store the previous char if there was one
3748 /* clear the bitmap */
3749 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3751 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3754 if (first_ofs >= 0) {
3755 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3756 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3758 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3760 Perl_re_printf( aTHX_ "%s", (char*)ch)
3764 /* store the current firstchar in the bitmap */
3765 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3766 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3772 /* This state has only one transition, its transition is part
3773 * of a common prefix - we need to concatenate the char it
3774 * represents to what we have so far. */
3775 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3777 char *ch = SvPV( *tmp, len );
3779 SV *sv=sv_newmortal();
3780 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3782 (UV)state, (UV)first_ofs,
3783 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3784 PL_colors[0], PL_colors[1],
3785 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3786 PERL_PV_ESCAPE_FIRSTCHAR
3791 OP( convert ) = nodetype;
3792 str=STRING(convert);
3793 setSTR_LEN(convert, 0);
3795 assert( ( STR_LEN(convert) + len ) < 256 );
3796 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3802 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3807 trie->prefixlen = (state-1);
3809 regnode *n = REGNODE_AFTER(convert);
3810 assert( n - convert <= U16_MAX );
3811 NEXT_OFF(convert) = n - convert;
3812 trie->startstate = state;
3813 trie->minlen -= (state - 1);
3814 trie->maxlen -= (state - 1);
3816 /* At least the UNICOS C compiler choked on this
3817 * being argument to DEBUG_r(), so let's just have
3820 #ifdef PERL_EXT_RE_BUILD
3826 U32 word = trie->wordcount;
3828 SV ** const tmp = av_fetch( trie_words, word, 0 );
3830 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3831 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3833 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3841 NEXT_OFF(convert) = (U16)(tail - convert);
3842 DEBUG_r(optimize= n);
3848 if ( trie->maxlen ) {
3849 NEXT_OFF( convert ) = (U16)(tail - convert);
3850 ARG_SET( convert, data_slot );
3851 /* Store the offset to the first unabsorbed branch in
3852 jump[0], which is otherwise unused by the jump logic.
3853 We use this when dumping a trie and during optimisation. */
3855 trie->jump[0] = (U16)(nextbranch - convert);
3857 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3858 * and there is a bitmap
3859 * and the first "jump target" node we found leaves enough room
3860 * then convert the TRIE node into a TRIEC node, with the bitmap
3861 * embedded inline in the opcode - this is hypothetically faster.
3863 if ( !trie->states[trie->startstate].wordnum
3865 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3867 OP( convert ) = TRIEC;
3868 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3869 PerlMemShared_free(trie->bitmap);
3872 OP( convert ) = TRIE;
3874 /* store the type in the flags */
3875 convert->flags = nodetype;
3879 + REGNODE_ARG_LEN( OP( convert ) );
3881 /* XXX We really should free up the resource in trie now,
3882 as we won't use them - (which resources?) dmq */
3884 /* needed for dumping*/
3885 DEBUG_r(if (optimize) {
3887 Try to clean up some of the debris left after the
3890 while( optimize < jumper ) {
3891 OP( optimize ) = OPTIMIZED;
3895 } /* end node insert */
3897 /* Finish populating the prev field of the wordinfo array. Walk back
3898 * from each accept state until we find another accept state, and if
3899 * so, point the first word's .prev field at the second word. If the
3900 * second already has a .prev field set, stop now. This will be the
3901 * case either if we've already processed that word's accept state,
3902 * or that state had multiple words, and the overspill words were
3903 * already linked up earlier.
3910 for (word=1; word <= trie->wordcount; word++) {
3912 if (trie->wordinfo[word].prev)
3914 state = trie->wordinfo[word].accept;
3916 state = prev_states[state];
3919 prev = trie->states[state].wordnum;
3923 trie->wordinfo[word].prev = prev;
3925 Safefree(prev_states);
3929 /* and now dump out the compressed format */
3930 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3932 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3934 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3935 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3937 SvREFCNT_dec_NN(revcharmap);
3941 : trie->startstate>1
3947 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3949 /* The Trie is constructed and compressed now so we can build a fail array if
3952 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3954 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3958 We find the fail state for each state in the trie, this state is the longest
3959 proper suffix of the current state's 'word' that is also a proper prefix of
3960 another word in our trie. State 1 represents the word '' and is thus the
3961 default fail state. This allows the DFA not to have to restart after its
3962 tried and failed a word at a given point, it simply continues as though it
3963 had been matching the other word in the first place.
3965 'abcdgu'=~/abcdefg|cdgu/
3966 When we get to 'd' we are still matching the first word, we would encounter
3967 'g' which would fail, which would bring us to the state representing 'd' in
3968 the second word where we would try 'g' and succeed, proceeding to match
3971 /* add a fail transition */
3972 const U32 trie_offset = ARG(source);
3973 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3975 const U32 ucharcount = trie->uniquecharcount;
3976 const U32 numstates = trie->statecount;
3977 const U32 ubound = trie->lasttrans + ucharcount;
3981 U32 base = trie->states[ 1 ].trans.base;
3984 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3986 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3988 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3989 PERL_UNUSED_CONTEXT;
3991 PERL_UNUSED_ARG(depth);
3994 if ( OP(source) == TRIE ) {
3995 struct regnode_1 *op = (struct regnode_1 *)
3996 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3997 StructCopy(source, op, struct regnode_1);
3998 stclass = (regnode *)op;
4000 struct regnode_charclass *op = (struct regnode_charclass *)
4001 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4002 StructCopy(source, op, struct regnode_charclass);
4003 stclass = (regnode *)op;
4005 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
4007 ARG_SET( stclass, data_slot );
4008 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
4009 RExC_rxi->data->data[ data_slot ] = (void*)aho;
4010 aho->trie=trie_offset;
4011 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
4012 Copy( trie->states, aho->states, numstates, reg_trie_state );
4013 Newx( q, numstates, U32);
4014 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
4017 /* initialize fail[0..1] to be 1 so that we always have
4018 a valid final fail state */
4019 fail[ 0 ] = fail[ 1 ] = 1;
4021 for ( charid = 0; charid < ucharcount ; charid++ ) {
4022 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
4024 q[ q_write ] = newstate;
4025 /* set to point at the root */
4026 fail[ q[ q_write++ ] ]=1;
4029 while ( q_read < q_write) {
4030 const U32 cur = q[ q_read++ % numstates ];
4031 base = trie->states[ cur ].trans.base;
4033 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
4034 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
4036 U32 fail_state = cur;
4039 fail_state = fail[ fail_state ];
4040 fail_base = aho->states[ fail_state ].trans.base;
4041 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
4043 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
4044 fail[ ch_state ] = fail_state;
4045 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
4047 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
4049 q[ q_write++ % numstates] = ch_state;
4053 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
4054 when we fail in state 1, this allows us to use the
4055 charclass scan to find a valid start char. This is based on the principle
4056 that theres a good chance the string being searched contains lots of stuff
4057 that cant be a start char.
4059 fail[ 0 ] = fail[ 1 ] = 0;
4060 DEBUG_TRIE_COMPILE_r({
4061 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
4062 depth, (UV)numstates
4064 for( q_read=1; q_read<numstates; q_read++ ) {
4065 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
4067 Perl_re_printf( aTHX_ "\n");
4070 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
4075 /* The below joins as many adjacent EXACTish nodes as possible into a single
4076 * one. The regop may be changed if the node(s) contain certain sequences that
4077 * require special handling. The joining is only done if:
4078 * 1) there is room in the current conglomerated node to entirely contain the
4080 * 2) they are compatible node types
4082 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
4083 * these get optimized out
4085 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
4086 * as possible, even if that means splitting an existing node so that its first
4087 * part is moved to the preceding node. This would maximise the efficiency of
4088 * memEQ during matching.
4090 * If a node is to match under /i (folded), the number of characters it matches
4091 * can be different than its character length if it contains a multi-character
4092 * fold. *min_subtract is set to the total delta number of characters of the
4095 * And *unfolded_multi_char is set to indicate whether or not the node contains
4096 * an unfolded multi-char fold. This happens when it won't be known until
4097 * runtime whether the fold is valid or not; namely
4098 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
4099 * target string being matched against turns out to be UTF-8 is that fold
4101 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
4103 * (Multi-char folds whose components are all above the Latin1 range are not
4104 * run-time locale dependent, and have already been folded by the time this
4105 * function is called.)
4107 * This is as good a place as any to discuss the design of handling these
4108 * multi-character fold sequences. It's been wrong in Perl for a very long
4109 * time. There are three code points in Unicode whose multi-character folds
4110 * were long ago discovered to mess things up. The previous designs for
4111 * dealing with these involved assigning a special node for them. This
4112 * approach doesn't always work, as evidenced by this example:
4113 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
4114 * Both sides fold to "sss", but if the pattern is parsed to create a node that
4115 * would match just the \xDF, it won't be able to handle the case where a
4116 * successful match would have to cross the node's boundary. The new approach
4117 * that hopefully generally solves the problem generates an EXACTFUP node
4118 * that is "sss" in this case.
4120 * It turns out that there are problems with all multi-character folds, and not
4121 * just these three. Now the code is general, for all such cases. The
4122 * approach taken is:
4123 * 1) This routine examines each EXACTFish node that could contain multi-
4124 * character folded sequences. Since a single character can fold into
4125 * such a sequence, the minimum match length for this node is less than
4126 * the number of characters in the node. This routine returns in
4127 * *min_subtract how many characters to subtract from the actual
4128 * length of the string to get a real minimum match length; it is 0 if
4129 * there are no multi-char foldeds. This delta is used by the caller to
4130 * adjust the min length of the match, and the delta between min and max,
4131 * so that the optimizer doesn't reject these possibilities based on size
4134 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4135 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4136 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4137 * EXACTFU nodes. The node type of such nodes is then changed to
4138 * EXACTFUP, indicating it is problematic, and needs careful handling.
4139 * (The procedures in step 1) above are sufficient to handle this case in
4140 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4141 * the only case where there is a possible fold length change in non-UTF-8
4142 * patterns. By reserving a special node type for problematic cases, the
4143 * far more common regular EXACTFU nodes can be processed faster.
4144 * regexec.c takes advantage of this.
4146 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4147 * problematic cases. These all only occur when the pattern is not
4148 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4149 * length change, it handles the situation where the string cannot be
4150 * entirely folded. The strings in an EXACTFish node are folded as much
4151 * as possible during compilation in regcomp.c. This saves effort in
4152 * regex matching. By using an EXACTFUP node when it is not possible to
4153 * fully fold at compile time, regexec.c can know that everything in an
4154 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4155 * case where folding in EXACTFU nodes can't be done at compile time is
4156 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4157 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4158 * handle two very different cases. Alternatively, there could have been
4159 * a node type where there are length changes, one for unfolded, and one
4160 * for both. If yet another special case needed to be created, the number
4161 * of required node types would have to go to 7. khw figures that even
4162 * though there are plenty of node types to spare, that the maintenance
4163 * cost wasn't worth the small speedup of doing it that way, especially
4164 * since he thinks the MICRO SIGN is rarely encountered in practice.
4166 * There are other cases where folding isn't done at compile time, but
4167 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4168 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4169 * changes. Some folds in EXACTF depend on if the runtime target string
4170 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4171 * when no fold in it depends on the UTF-8ness of the target string.)
4173 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4174 * validity of the fold won't be known until runtime, and so must remain
4175 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4176 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4177 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4178 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4179 * The reason this is a problem is that the optimizer part of regexec.c
4180 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4181 * that a character in the pattern corresponds to at most a single
4182 * character in the target string. (And I do mean character, and not byte
4183 * here, unlike other parts of the documentation that have never been
4184 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4185 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4186 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4187 * EXACTFL nodes, violate the assumption, and they are the only instances
4188 * where it is violated. I'm reluctant to try to change the assumption,
4189 * as the code involved is impenetrable to me (khw), so instead the code
4190 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4191 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4192 * boolean indicating whether or not the node contains such a fold. When
4193 * it is true, the caller sets a flag that later causes the optimizer in
4194 * this file to not set values for the floating and fixed string lengths,
4195 * and thus avoids the optimizer code in regexec.c that makes the invalid
4196 * assumption. Thus, there is no optimization based on string lengths for
4197 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4198 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4199 * assumption is wrong only in these cases is that all other non-UTF-8
4200 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4201 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4202 * EXACTF nodes because we don't know at compile time if it actually
4203 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4204 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4205 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4206 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4207 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4208 * string would require the pattern to be forced into UTF-8, the overhead
4209 * of which we want to avoid. Similarly the unfolded multi-char folds in
4210 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4213 * Similarly, the code that generates tries doesn't currently handle
4214 * not-already-folded multi-char folds, and it looks like a pain to change
4215 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4216 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4217 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4218 * using /iaa matching will be doing so almost entirely with ASCII
4219 * strings, so this should rarely be encountered in practice */
4222 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4223 UV *min_subtract, bool *unfolded_multi_char,
4224 U32 flags, regnode *val, U32 depth)
4226 /* Merge several consecutive EXACTish nodes into one. */
4228 regnode *n = regnext(scan);
4230 regnode *next = REGNODE_AFTER_varies(scan);
4234 regnode *stop = scan;
4235 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4237 PERL_UNUSED_ARG(depth);
4240 PERL_ARGS_ASSERT_JOIN_EXACT;
4241 #ifndef EXPERIMENTAL_INPLACESCAN
4242 PERL_UNUSED_ARG(flags);
4243 PERL_UNUSED_ARG(val);
4245 DEBUG_PEEP("join", scan, depth, 0);
4247 assert(REGNODE_TYPE(OP(scan)) == EXACT);
4249 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4250 * EXACT ones that are mergeable to the current one. */
4252 && ( REGNODE_TYPE(OP(n)) == NOTHING
4253 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
4255 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4258 if (OP(n) == TAIL || n > next)
4260 if (REGNODE_TYPE(OP(n)) == NOTHING) {
4261 DEBUG_PEEP("skip:", n, depth, 0);
4262 NEXT_OFF(scan) += NEXT_OFF(n);
4263 next = n + NODE_STEP_REGNODE;
4270 else if (stringok) {
4271 const unsigned int oldl = STR_LEN(scan);
4272 regnode * const nnext = regnext(n);
4274 /* XXX I (khw) kind of doubt that this works on platforms (should
4275 * Perl ever run on one) where U8_MAX is above 255 because of lots
4276 * of other assumptions */
4277 /* Don't join if the sum can't fit into a single node */
4278 if (oldl + STR_LEN(n) > U8_MAX)
4281 /* Joining something that requires UTF-8 with something that
4282 * doesn't, means the result requires UTF-8. */
4283 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4284 OP(scan) = EXACT_REQ8;
4286 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4287 ; /* join is compatible, no need to change OP */
4289 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4290 OP(scan) = EXACTFU_REQ8;
4292 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4293 ; /* join is compatible, no need to change OP */
4295 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4296 ; /* join is compatible, no need to change OP */
4298 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4300 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4301 * which can join with EXACTFU ones. We check for this case
4302 * here. These need to be resolved to either EXACTFU or
4303 * EXACTF at joining time. They have nothing in them that
4304 * would forbid them from being the more desirable EXACTFU
4305 * nodes except that they begin and/or end with a single [Ss].
4306 * The reason this is problematic is because they could be
4307 * joined in this loop with an adjacent node that ends and/or
4308 * begins with [Ss] which would then form the sequence 'ss',
4309 * which matches differently under /di than /ui, in which case
4310 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4311 * formed, the nodes get absorbed into any adjacent EXACTFU
4312 * node. And if the only adjacent node is EXACTF, they get
4313 * absorbed into that, under the theory that a longer node is
4314 * better than two shorter ones, even if one is EXACTFU. Note
4315 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4316 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4318 if (STRING(n)[STR_LEN(n)-1] == 's') {
4320 /* Here the joined node would end with 's'. If the node
4321 * following the combination is an EXACTF one, it's better to
4322 * join this trailing edge 's' node with that one, leaving the
4323 * current one in 'scan' be the more desirable EXACTFU */
4324 if (OP(nnext) == EXACTF) {
4328 OP(scan) = EXACTFU_S_EDGE;
4330 } /* Otherwise, the beginning 's' of the 2nd node just
4331 becomes an interior 's' in 'scan' */
4333 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4334 ; /* join is compatible, no need to change OP */
4336 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4338 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4339 * nodes. But the latter nodes can be also joined with EXACTFU
4340 * ones, and that is a better outcome, so if the node following
4341 * 'n' is EXACTFU, quit now so that those two can be joined
4343 if (OP(nnext) == EXACTFU) {
4347 /* The join is compatible, and the combined node will be
4348 * EXACTF. (These don't care if they begin or end with 's' */
4350 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4351 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4352 && STRING(n)[0] == 's')
4354 /* When combined, we have the sequence 'ss', which means we
4355 * have to remain /di */
4359 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4360 if (STRING(n)[0] == 's') {
4361 ; /* Here the join is compatible and the combined node
4362 starts with 's', no need to change OP */
4364 else { /* Now the trailing 's' is in the interior */
4368 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4370 /* The join is compatible, and the combined node will be
4371 * EXACTF. (These don't care if they begin or end with 's' */
4374 else if (OP(scan) != OP(n)) {
4376 /* The only other compatible joinings are the same node type */
4380 DEBUG_PEEP("merg", n, depth, 0);
4383 next = REGNODE_AFTER_varies(n);
4384 NEXT_OFF(scan) += NEXT_OFF(n);
4385 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4386 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4387 /* Now we can overwrite *n : */
4388 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4396 #ifdef EXPERIMENTAL_INPLACESCAN
4397 if (flags && !NEXT_OFF(n)) {
4398 DEBUG_PEEP("atch", val, depth, 0);
4399 if (REGNODE_OFF_BY_ARG(OP(n))) {
4400 ARG_SET(n, val - n);
4403 NEXT_OFF(n) = val - n;
4410 /* This temporary node can now be turned into EXACTFU, and must, as
4411 * regexec.c doesn't handle it */
4412 if (OP(scan) == EXACTFU_S_EDGE) {
4417 *unfolded_multi_char = FALSE;
4419 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4420 * can now analyze for sequences of problematic code points. (Prior to
4421 * this final joining, sequences could have been split over boundaries, and
4422 * hence missed). The sequences only happen in folding, hence for any
4423 * non-EXACT EXACTish node */
4424 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4425 U8* s0 = (U8*) STRING(scan);
4427 U8* s_end = s0 + STR_LEN(scan);
4429 int total_count_delta = 0; /* Total delta number of characters that
4430 multi-char folds expand to */
4432 /* One pass is made over the node's string looking for all the
4433 * possibilities. To avoid some tests in the loop, there are two main
4434 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4439 if (OP(scan) == EXACTFL) {
4442 /* An EXACTFL node would already have been changed to another
4443 * node type unless there is at least one character in it that
4444 * is problematic; likely a character whose fold definition
4445 * won't be known until runtime, and so has yet to be folded.
4446 * For all but the UTF-8 locale, folds are 1-1 in length, but
4447 * to handle the UTF-8 case, we need to create a temporary
4448 * folded copy using UTF-8 locale rules in order to analyze it.
4449 * This is because our macros that look to see if a sequence is
4450 * a multi-char fold assume everything is folded (otherwise the
4451 * tests in those macros would be too complicated and slow).
4452 * Note that here, the non-problematic folds will have already
4453 * been done, so we can just copy such characters. We actually
4454 * don't completely fold the EXACTFL string. We skip the
4455 * unfolded multi-char folds, as that would just create work
4456 * below to figure out the size they already are */
4458 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4461 STRLEN s_len = UTF8SKIP(s);
4462 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4463 Copy(s, d, s_len, U8);
4466 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4467 *unfolded_multi_char = TRUE;
4468 Copy(s, d, s_len, U8);
4471 else if (isASCII(*s)) {
4472 *(d++) = toFOLD(*s);
4476 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4482 /* Point the remainder of the routine to look at our temporary
4486 } /* End of creating folded copy of EXACTFL string */
4488 /* Examine the string for a multi-character fold sequence. UTF-8
4489 * patterns have all characters pre-folded by the time this code is
4491 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4492 length sequence we are looking for is 2 */
4494 int count = 0; /* How many characters in a multi-char fold */
4495 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4496 if (! len) { /* Not a multi-char fold: get next char */
4501 { /* Here is a generic multi-char fold. */
4502 U8* multi_end = s + len;
4504 /* Count how many characters are in it. In the case of
4505 * /aa, no folds which contain ASCII code points are
4506 * allowed, so check for those, and skip if found. */
4507 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4508 count = utf8_length(s, multi_end);
4512 while (s < multi_end) {
4515 goto next_iteration;
4525 /* The delta is how long the sequence is minus 1 (1 is how long
4526 * the character that folds to the sequence is) */
4527 total_count_delta += count - 1;
4531 /* We created a temporary folded copy of the string in EXACTFL
4532 * nodes. Therefore we need to be sure it doesn't go below zero,
4533 * as the real string could be shorter */
4534 if (OP(scan) == EXACTFL) {
4535 int total_chars = utf8_length((U8*) STRING(scan),
4536 (U8*) STRING(scan) + STR_LEN(scan));
4537 if (total_count_delta > total_chars) {
4538 total_count_delta = total_chars;
4542 *min_subtract += total_count_delta;
4545 else if (OP(scan) == EXACTFAA) {
4547 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4548 * fold to the ASCII range (and there are no existing ones in the
4549 * upper latin1 range). But, as outlined in the comments preceding
4550 * this function, we need to flag any occurrences of the sharp s.
4551 * This character forbids trie formation (because of added
4553 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4554 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4555 || UNICODE_DOT_DOT_VERSION > 0)
4557 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4558 OP(scan) = EXACTFAA_NO_TRIE;
4559 *unfolded_multi_char = TRUE;
4565 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4567 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4568 * folds that are all Latin1. As explained in the comments
4569 * preceding this function, we look also for the sharp s in EXACTF
4570 * and EXACTFL nodes; it can be in the final position. Otherwise
4571 * we can stop looking 1 byte earlier because have to find at least
4572 * two characters for a multi-fold */
4573 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4578 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4579 if (! len) { /* Not a multi-char fold. */
4580 if (*s == LATIN_SMALL_LETTER_SHARP_S
4581 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4583 *unfolded_multi_char = TRUE;
4590 && isALPHA_FOLD_EQ(*s, 's')
4591 && isALPHA_FOLD_EQ(*(s+1), 's'))
4594 /* EXACTF nodes need to know that the minimum length
4595 * changed so that a sharp s in the string can match this
4596 * ss in the pattern, but they remain EXACTF nodes, as they
4597 * won't match this unless the target string is in UTF-8,
4598 * which we don't know until runtime. EXACTFL nodes can't
4599 * transform into EXACTFU nodes */
4600 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4601 OP(scan) = EXACTFUP;
4605 *min_subtract += len - 1;
4613 /* Allow dumping but overwriting the collection of skipped
4614 * ops and/or strings with fake optimized ops */
4615 n = REGNODE_AFTER_varies(scan);
4623 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4627 /* REx optimizer. Converts nodes into quicker variants "in place".
4628 Finds fixed substrings. */
4630 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4631 to the position after last scanned or to NULL. */
4633 #define INIT_AND_WITHP \
4634 assert(!and_withp); \
4635 Newx(and_withp, 1, regnode_ssc); \
4636 SAVEFREEPV(and_withp)
4640 S_unwind_scan_frames(pTHX_ const void *p)
4642 scan_frame *f= (scan_frame *)p;
4644 scan_frame *n= f->next_frame;
4650 /* Follow the next-chain of the current node and optimize away
4651 all the NOTHINGs from it.
4654 S_rck_elide_nothing(pTHX_ regnode *node)
4656 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4658 if (OP(node) != CURLYX) {
4659 const int max = (REGNODE_OFF_BY_ARG(OP(node))
4661 /* I32 may be smaller than U16 on CRAYs! */
4662 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4663 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node));
4667 /* Skip NOTHING and LONGJMP. */
4671 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
4672 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4678 if (REGNODE_OFF_BY_ARG(OP(node)))
4681 NEXT_OFF(node) = off;
4686 /* the return from this sub is the minimum length that could possibly match */
4689 RExC_state_t *pRExC_state,
4690 regnode **scanp, /* Start here (read-write). */
4691 SSize_t *minlenp, /* used for the minlen of substrings? */
4692 SSize_t *deltap, /* Write maxlen-minlen here. */
4693 regnode *last, /* Stop before this one. */
4694 scan_data_t *data, /* string data about the pattern */
4695 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
4696 U32 recursed_depth, /* how deep have we recursed via GOSUB */
4697 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
4698 U32 flags, /* flags controlling this call, see SCF_ flags */
4699 U32 depth, /* how deep have we recursed period */
4700 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
4701 FALSE only if the caller (recursively) was
4702 prohibited from modifying the regops, because
4703 a higher caller is holding a ptr to them. */
4706 /* vars about the regnodes we are working with */
4707 regnode *scan = *scanp; /* the current opcode we are inspecting */
4708 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
4709 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
4710 the first non open regop, if the init
4711 val IS an OPEN then we will skip past
4712 it just after the var decls section */
4713 I32 code = 0; /* temp var used to hold the optype of a regop */
4715 /* vars about the min and max length of the pattern */
4716 SSize_t min = 0; /* min length of this part of the pattern */
4717 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
4718 this is adjusted down if we find
4720 SSize_t delta = 0; /* difference between min and max length
4721 (not accounting for stopmin) */
4723 /* vars about capture buffers in the pattern */
4724 I32 pars = 0; /* count of OPEN opcodes */
4725 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; /* is this op an OPEN? */
4727 /* vars about whether this pattern contains something that can match
4728 * infinitely long strings, eg, X* or X+ */
4729 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4730 int is_inf_internal = 0; /* The studied chunk is infinite */
4732 /* scan_data_t (struct) is used to hold information about the substrings
4733 * and start class we have extracted from the string */
4734 scan_data_t data_fake; /* temp var used for recursing in some cases */
4736 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
4737 trie optimizations */
4739 scan_frame *frame = NULL; /* used as part of fake recursion */
4741 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4743 PERL_ARGS_ASSERT_STUDY_CHUNK;
4744 RExC_study_started= 1;
4746 Zero(&data_fake, 1, scan_data_t);
4749 while (first_non_open && OP(first_non_open) == OPEN)
4750 first_non_open=regnext(first_non_open);
4755 RExC_study_chunk_recursed_count++;
4757 DEBUG_OPTIMISE_MORE_r(
4759 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4760 depth, (long)stopparen,
4761 (unsigned long)RExC_study_chunk_recursed_count,
4762 (unsigned long)depth, (unsigned long)recursed_depth,
4765 if (recursed_depth) {
4768 for ( j = 0 ; j < recursed_depth ; j++ ) {
4769 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4770 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4771 Perl_re_printf( aTHX_ " %d",(int)i);
4775 if ( j + 1 < recursed_depth ) {
4776 Perl_re_printf( aTHX_ ",");
4780 Perl_re_printf( aTHX_ "\n");
4783 while ( scan && OP(scan) != END && scan < last ){
4784 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4785 node length to get a real minimum (because
4786 the folded version may be shorter) */
4787 bool unfolded_multi_char = FALSE;
4788 /* avoid mutating ops if we are anywhere within the recursed or
4789 * enframed handling for a GOSUB: the outermost level will handle it.
4791 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4792 /* Peephole optimizer: */
4793 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
4794 DEBUG_PEEP("Peep", scan, depth, flags);
4797 /* The reason we do this here is that we need to deal with things like
4798 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4799 * parsing code, as each (?:..) is handled by a different invocation of
4802 if (REGNODE_TYPE(OP(scan)) == EXACT
4803 && OP(scan) != LEXACT
4804 && OP(scan) != LEXACT_REQ8
4807 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4808 0, NULL, depth + 1);
4811 /* Follow the next-chain of the current node and optimize
4812 away all the NOTHINGs from it.
4814 rck_elide_nothing(scan);
4816 /* The principal pseudo-switch. Cannot be a switch, since we look into
4817 * several different things. */
4818 if ( OP(scan) == DEFINEP ) {
4820 SSize_t deltanext = 0;
4821 SSize_t fake_last_close = 0;
4822 regnode *fake_last_close_op = NULL;
4823 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
4825 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4826 scan = regnext(scan);
4827 assert( OP(scan) == IFTHEN );
4828 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4830 data_fake.last_closep= &fake_last_close;
4831 data_fake.last_close_opp= &fake_last_close_op;
4833 next = regnext(scan);
4834 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
4835 DEBUG_PEEP("scan", scan, depth, flags);
4836 DEBUG_PEEP("next", next, depth, flags);
4838 /* we suppose the run is continuous, last=next...
4839 * NOTE we dont use the return here! */
4840 /* DEFINEP study_chunk() recursion */
4841 (void)study_chunk(pRExC_state, &scan, &minlen,
4842 &deltanext, next, &data_fake, stopparen,
4843 recursed_depth, NULL, f, depth+1, mutate_ok);
4848 OP(scan) == BRANCH ||
4849 OP(scan) == BRANCHJ ||
4852 next = regnext(scan);
4855 /* The op(next)==code check below is to see if we
4856 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4857 * IFTHEN is special as it might not appear in pairs.
4858 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4859 * we dont handle it cleanly. */
4860 if (OP(next) == code || code == IFTHEN) {
4861 /* NOTE - There is similar code to this block below for
4862 * handling TRIE nodes on a re-study. If you change stuff here
4863 * check there too. */
4864 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4866 regnode * const startbranch=scan;
4868 if (flags & SCF_DO_SUBSTR) {
4869 /* Cannot merge strings after this. */
4870 scan_commit(pRExC_state, data, minlenp, is_inf);
4873 if (flags & SCF_DO_STCLASS)
4874 ssc_init_zero(pRExC_state, &accum);
4876 while (OP(scan) == code) {
4877 SSize_t deltanext, minnext, fake_last_close = 0;
4878 regnode *fake_last_close_op = NULL;
4879 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
4880 regnode_ssc this_class;
4882 DEBUG_PEEP("Branch", scan, depth, flags);
4885 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4887 data_fake.whilem_c = data->whilem_c;
4888 data_fake.last_closep = data->last_closep;
4889 data_fake.last_close_opp = data->last_close_opp;
4892 data_fake.last_closep = &fake_last_close;
4893 data_fake.last_close_opp = &fake_last_close_op;
4896 data_fake.pos_delta = delta;
4897 next = regnext(scan);
4899 scan = REGNODE_AFTER_opcode(scan, code);
4901 if (flags & SCF_DO_STCLASS) {
4902 ssc_init(pRExC_state, &this_class);
4903 data_fake.start_class = &this_class;
4904 f |= SCF_DO_STCLASS_AND;
4906 if (flags & SCF_WHILEM_VISITED_POS)
4907 f |= SCF_WHILEM_VISITED_POS;
4909 /* we suppose the run is continuous, last=next...*/
4910 /* recurse study_chunk() for each BRANCH in an alternation */
4911 minnext = study_chunk(pRExC_state, &scan, minlenp,
4912 &deltanext, next, &data_fake, stopparen,
4913 recursed_depth, NULL, f, depth+1,
4918 if (deltanext == OPTIMIZE_INFTY) {
4919 is_inf = is_inf_internal = 1;
4920 max1 = OPTIMIZE_INFTY;
4921 } else if (max1 < minnext + deltanext)
4922 max1 = minnext + deltanext;
4924 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4926 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4927 if ( stopmin > minnext)
4928 stopmin = min + min1;
4929 flags &= ~SCF_DO_SUBSTR;
4931 data->flags |= SCF_SEEN_ACCEPT;
4934 if (data_fake.flags & SF_HAS_EVAL)
4935 data->flags |= SF_HAS_EVAL;
4936 data->whilem_c = data_fake.whilem_c;
4938 if (flags & SCF_DO_STCLASS)
4939 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4940 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
4942 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4944 if (flags & SCF_DO_SUBSTR) {
4945 data->pos_min += min1;
4946 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4947 data->pos_delta = OPTIMIZE_INFTY;
4949 data->pos_delta += max1 - min1;
4950 if (max1 != min1 || is_inf)
4951 data->cur_is_floating = 1;
4954 if (delta == OPTIMIZE_INFTY
4955 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4956 delta = OPTIMIZE_INFTY;
4958 delta += max1 - min1;
4959 if (flags & SCF_DO_STCLASS_OR) {
4960 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4962 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4963 flags &= ~SCF_DO_STCLASS;
4966 else if (flags & SCF_DO_STCLASS_AND) {
4968 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4969 flags &= ~SCF_DO_STCLASS;
4972 /* Switch to OR mode: cache the old value of
4973 * data->start_class */
4975 StructCopy(data->start_class, and_withp, regnode_ssc);
4976 flags &= ~SCF_DO_STCLASS_AND;
4977 StructCopy(&accum, data->start_class, regnode_ssc);
4978 flags |= SCF_DO_STCLASS_OR;
4981 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
4983 if (PERL_ENABLE_TRIE_OPTIMISATION
4984 && OP(startbranch) == BRANCH
4989 Assuming this was/is a branch we are dealing with: 'scan'
4990 now points at the item that follows the branch sequence,
4991 whatever it is. We now start at the beginning of the
4992 sequence and look for subsequences of
4998 which would be constructed from a pattern like
5001 If we can find such a subsequence we need to turn the first
5002 element into a trie and then add the subsequent branch exact
5003 strings to the trie.
5007 1. patterns where the whole set of branches can be
5010 2. patterns where only a subset can be converted.
5012 In case 1 we can replace the whole set with a single regop
5013 for the trie. In case 2 we need to keep the start and end
5016 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
5017 becomes BRANCH TRIE; BRANCH X;
5019 There is an additional case, that being where there is a
5020 common prefix, which gets split out into an EXACT like node
5021 preceding the TRIE node.
5023 If X(1..n)==tail then we can do a simple trie, if not we make
5024 a "jump" trie, such that when we match the appropriate word
5025 we "jump" to the appropriate tail node. Essentially we turn
5026 a nested if into a case structure of sorts.
5031 if (!re_trie_maxbuff) {
5032 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
5033 if (!SvIOK(re_trie_maxbuff))
5034 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
5036 if ( SvIV(re_trie_maxbuff)>=0 ) {
5038 regnode *first = (regnode *)NULL;
5039 regnode *prev = (regnode *)NULL;
5040 regnode *tail = scan;
5044 /* var tail is used because there may be a TAIL
5045 regop in the way. Ie, the exacts will point to the
5046 thing following the TAIL, but the last branch will
5047 point at the TAIL. So we advance tail. If we
5048 have nested (?:) we may have to move through several
5052 while ( OP( tail ) == TAIL ) {
5053 /* this is the TAIL generated by (?:) */
5054 tail = regnext( tail );
5058 DEBUG_TRIE_COMPILE_r({
5059 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
5060 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
5062 "Looking for TRIE'able sequences. Tail node is ",
5063 (UV) REGNODE_OFFSET(tail),
5064 SvPV_nolen_const( RExC_mysv )
5070 Step through the branches
5071 cur represents each branch,
5072 noper is the first thing to be matched as part
5074 noper_next is the regnext() of that node.
5076 We normally handle a case like this
5077 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
5078 support building with NOJUMPTRIE, which restricts
5079 the trie logic to structures like /FOO|BAR/.
5081 If noper is a trieable nodetype then the branch is
5082 a possible optimization target. If we are building
5083 under NOJUMPTRIE then we require that noper_next is
5084 the same as scan (our current position in the regex
5087 Once we have two or more consecutive such branches
5088 we can create a trie of the EXACT's contents and
5089 stitch it in place into the program.
5091 If the sequence represents all of the branches in
5092 the alternation we replace the entire thing with a
5095 Otherwise when it is a subsequence we need to
5096 stitch it in place and replace only the relevant
5097 branches. This means the first branch has to remain
5098 as it is used by the alternation logic, and its
5099 next pointer, and needs to be repointed at the item
5100 on the branch chain following the last branch we
5101 have optimized away.
5103 This could be either a BRANCH, in which case the
5104 subsequence is internal, or it could be the item
5105 following the branch sequence in which case the
5106 subsequence is at the end (which does not
5107 necessarily mean the first node is the start of the
5110 TRIE_TYPE(X) is a define which maps the optype to a
5114 ----------------+-----------
5119 EXACTFU_REQ8 | EXACTFU
5123 EXACTFLU8 | EXACTFLU8
5127 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
5129 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
5131 : ( EXACTFU == (X) \
5132 || EXACTFU_REQ8 == (X) \
5133 || EXACTFUP == (X) ) \
5135 : ( EXACTFAA == (X) ) \
5137 : ( EXACTL == (X) ) \
5139 : ( EXACTFLU8 == (X) ) \
5143 /* dont use tail as the end marker for this traverse */
5144 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
5145 regnode * const noper = REGNODE_AFTER( cur );
5146 U8 noper_type = OP( noper );
5147 U8 noper_trietype = TRIE_TYPE( noper_type );
5148 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
5149 regnode * const noper_next = regnext( noper );
5150 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5151 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
5154 DEBUG_TRIE_COMPILE_r({
5155 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5156 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
5158 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5160 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5161 Perl_re_printf( aTHX_ " -> %d:%s",
5162 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5165 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5166 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5167 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5169 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5170 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5171 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
5175 /* Is noper a trieable nodetype that can be merged
5176 * with the current trie (if there is one)? */
5180 ( noper_trietype == NOTHING )
5181 || ( trietype == NOTHING )
5182 || ( trietype == noper_trietype )
5185 && noper_next >= tail
5189 /* Handle mergable triable node Either we are
5190 * the first node in a new trieable sequence,
5191 * in which case we do some bookkeeping,
5192 * otherwise we update the end pointer. */
5195 if ( noper_trietype == NOTHING ) {
5196 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5197 regnode * const noper_next = regnext( noper );
5198 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5199 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5202 if ( noper_next_trietype ) {
5203 trietype = noper_next_trietype;
5204 } else if (noper_next_type) {
5205 /* a NOTHING regop is 1 regop wide.
5206 * We need at least two for a trie
5207 * so we can't merge this in */
5211 trietype = noper_trietype;
5214 if ( trietype == NOTHING )
5215 trietype = noper_trietype;
5220 } /* end handle mergable triable node */
5222 /* handle unmergable node -
5223 * noper may either be a triable node which can
5224 * not be tried together with the current trie,
5225 * or a non triable node */
5227 /* If last is set and trietype is not
5228 * NOTHING then we have found at least two
5229 * triable branch sequences in a row of a
5230 * similar trietype so we can turn them
5231 * into a trie. If/when we allow NOTHING to
5232 * start a trie sequence this condition
5233 * will be required, and it isn't expensive
5234 * so we leave it in for now. */
5235 if ( trietype && trietype != NOTHING )
5236 make_trie( pRExC_state,
5237 startbranch, first, cur, tail,
5238 count, trietype, depth+1 );
5239 prev = NULL; /* note: we clear/update
5240 first, trietype etc below,
5241 so we dont do it here */
5245 && noper_next >= tail
5248 /* noper is triable, so we can start a new
5252 trietype = noper_trietype;
5254 /* if we already saw a first but the
5255 * current node is not triable then we have
5256 * to reset the first information. */
5261 } /* end handle unmergable node */
5262 } /* loop over branches */
5263 DEBUG_TRIE_COMPILE_r({
5264 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5265 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5266 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5267 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5268 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5269 REGNODE_NAME(trietype)
5273 if ( prev && trietype ) {
5274 if ( trietype != NOTHING ) {
5275 /* the last branch of the sequence was part of
5276 * a trie, so we have to construct it here
5277 * outside of the loop */
5278 made= make_trie( pRExC_state, startbranch,
5279 first, scan, tail, count,
5280 trietype, depth+1 );
5281 #ifdef TRIE_STUDY_OPT
5282 if ( ((made == MADE_EXACT_TRIE &&
5283 startbranch == first)
5284 || ( first_non_open == first )) &&
5286 flags |= SCF_TRIE_RESTUDY;
5287 if ( startbranch == first
5290 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5295 /* at this point we know whatever we have is a
5296 * NOTHING sequence/branch AND if 'startbranch'
5297 * is 'first' then we can turn the whole thing
5300 if ( startbranch == first ) {
5302 /* the entire thing is a NOTHING sequence,
5303 * something like this: (?:|) So we can
5304 * turn it into a plain NOTHING op. */
5305 DEBUG_TRIE_COMPILE_r({
5306 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5307 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5309 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5312 OP(startbranch)= NOTHING;
5313 NEXT_OFF(startbranch)= tail - startbranch;
5314 for ( opt= startbranch + 1; opt < tail ; opt++ )
5318 } /* end if ( prev) */
5319 } /* TRIE_MAXBUF is non zero */
5321 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
5324 scan = REGNODE_AFTER_opcode(scan,code);
5326 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5328 regnode *start = NULL;
5329 regnode *end = NULL;
5330 U32 my_recursed_depth= recursed_depth;
5332 if (OP(scan) != SUSPEND) { /* GOSUB */
5333 /* Do setup, note this code has side effects beyond
5334 * the rest of this block. Specifically setting
5335 * RExC_recurse[] must happen at least once during
5338 RExC_recurse[ARG2L(scan)] = scan;
5339 start = REGNODE_p(RExC_open_parens[paren]);
5340 end = REGNODE_p(RExC_close_parens[paren]);
5342 /* NOTE we MUST always execute the above code, even
5343 * if we do nothing with a GOSUB */
5345 ( flags & SCF_IN_DEFINE )
5348 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5350 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5353 /* no need to do anything here if we are in a define. */
5354 /* or we are after some kind of infinite construct
5355 * so we can skip recursing into this item.
5356 * Since it is infinite we will not change the maxlen
5357 * or delta, and if we miss something that might raise
5358 * the minlen it will merely pessimise a little.
5360 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5361 * might result in a minlen of 1 and not of 4,
5362 * but this doesn't make us mismatch, just try a bit
5363 * harder than we should.
5365 * However we must assume this GOSUB is infinite, to
5366 * avoid wrongly applying other optimizations in the
5367 * enclosing scope - see GH 18096, for example.
5369 is_inf = is_inf_internal = 1;
5370 scan= regnext(scan);
5376 || !PAREN_TEST(recursed_depth - 1, paren)
5378 /* it is quite possible that there are more efficient ways
5379 * to do this. We maintain a bitmap per level of recursion
5380 * of which patterns we have entered so we can detect if a
5381 * pattern creates a possible infinite loop. When we
5382 * recurse down a level we copy the previous levels bitmap
5383 * down. When we are at recursion level 0 we zero the top
5384 * level bitmap. It would be nice to implement a different
5385 * more efficient way of doing this. In particular the top
5386 * level bitmap may be unnecessary.
5388 if (!recursed_depth) {
5389 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5391 Copy(PAREN_OFFSET(recursed_depth - 1),
5392 PAREN_OFFSET(recursed_depth),
5393 RExC_study_chunk_recursed_bytes, U8);
5395 /* we havent recursed into this paren yet, so recurse into it */
5396 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
5397 PAREN_SET(recursed_depth, paren);
5398 my_recursed_depth= recursed_depth + 1;
5400 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
5401 /* some form of infinite recursion, assume infinite length
5403 if (flags & SCF_DO_SUBSTR) {
5404 scan_commit(pRExC_state, data, minlenp, is_inf);
5405 data->cur_is_floating = 1;
5407 is_inf = is_inf_internal = 1;
5408 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5409 ssc_anything(data->start_class);
5410 flags &= ~SCF_DO_STCLASS;
5412 start= NULL; /* reset start so we dont recurse later on. */
5417 end = regnext(scan);
5420 scan_frame *newframe;
5422 if (!RExC_frame_last) {
5423 Newxz(newframe, 1, scan_frame);
5424 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5425 RExC_frame_head= newframe;
5427 } else if (!RExC_frame_last->next_frame) {
5428 Newxz(newframe, 1, scan_frame);
5429 RExC_frame_last->next_frame= newframe;
5430 newframe->prev_frame= RExC_frame_last;
5433 newframe= RExC_frame_last->next_frame;
5435 RExC_frame_last= newframe;
5437 newframe->next_regnode = regnext(scan);
5438 newframe->last_regnode = last;
5439 newframe->stopparen = stopparen;
5440 newframe->prev_recursed_depth = recursed_depth;
5441 newframe->this_prev_frame= frame;
5442 newframe->in_gosub = (
5443 (frame && frame->in_gosub) || OP(scan) == GOSUB
5446 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
5447 DEBUG_PEEP("fnew", scan, depth, flags);
5454 recursed_depth= my_recursed_depth;
5459 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
5460 SSize_t bytelen = STR_LEN(scan), charlen;
5464 const U8 * const s = (U8*)STRING(scan);
5465 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5466 charlen = utf8_length(s, s + bytelen);
5468 uc = *((U8*)STRING(scan));
5472 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5473 /* The code below prefers earlier match for fixed
5474 offset, later match for variable offset. */
5475 if (data->last_end == -1) { /* Update the start info. */
5476 data->last_start_min = data->pos_min;
5477 data->last_start_max =
5478 is_inf ? OPTIMIZE_INFTY
5479 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5480 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5482 sv_catpvn(data->last_found, STRING(scan), bytelen);
5484 SvUTF8_on(data->last_found);
5486 SV * const sv = data->last_found;
5487 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5488 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5489 if (mg && mg->mg_len >= 0)
5490 mg->mg_len += charlen;
5492 data->last_end = data->pos_min + charlen;
5493 data->pos_min += charlen; /* As in the first entry. */
5494 data->flags &= ~SF_BEFORE_EOL;
5497 /* ANDing the code point leaves at most it, and not in locale, and
5498 * can't match null string */
5499 if (flags & SCF_DO_STCLASS_AND) {
5500 ssc_cp_and(data->start_class, uc);
5501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5502 ssc_clear_locale(data->start_class);
5504 else if (flags & SCF_DO_STCLASS_OR) {
5505 ssc_add_cp(data->start_class, uc);
5506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5511 flags &= ~SCF_DO_STCLASS;
5512 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
5514 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
5515 /* But OP != EXACT!, so is EXACTFish */
5516 SSize_t bytelen = STR_LEN(scan), charlen;
5517 const U8 * s = (U8*)STRING(scan);
5519 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5520 * with the mask set to the complement of the bit that differs
5521 * between upper and lower case, and the lowest code point of the
5522 * pair (which the '&' forces) */
5525 && ( OP(scan) == EXACTFAA
5526 || ( OP(scan) == EXACTFU
5527 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5530 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5533 ARG_SET(scan, *s & mask);
5535 /* We're not EXACTFish any more, so restudy.
5536 * Search for "restudy" in this file to find
5537 * a comment with details. */
5541 /* Search for fixed substrings supports EXACT only. */
5542 if (flags & SCF_DO_SUBSTR) {
5544 scan_commit(pRExC_state, data, minlenp, is_inf);
5546 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5547 if (unfolded_multi_char) {
5548 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5550 min += charlen - min_subtract;
5552 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
5553 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
5555 delta += min_subtract;
5557 delta = OPTIMIZE_INFTY;
5559 if (flags & SCF_DO_SUBSTR) {
5560 data->pos_min += charlen - min_subtract;
5561 if (data->pos_min < 0) {
5564 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
5565 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
5567 data->pos_delta += min_subtract;
5569 data->pos_delta = OPTIMIZE_INFTY;
5572 data->cur_is_floating = 1; /* float */
5576 if (flags & SCF_DO_STCLASS) {
5577 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5579 assert(EXACTF_invlist);
5580 if (flags & SCF_DO_STCLASS_AND) {
5581 if (OP(scan) != EXACTFL)
5582 ssc_clear_locale(data->start_class);
5583 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5584 ANYOF_POSIXL_ZERO(data->start_class);
5585 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5587 else { /* SCF_DO_STCLASS_OR */
5588 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5589 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5591 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5592 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5594 flags &= ~SCF_DO_STCLASS;
5595 SvREFCNT_dec(EXACTF_invlist);
5597 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
5599 else if (REGNODE_VARIES(OP(scan))) {
5600 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5603 regnode * const oscan = scan;
5604 regnode_ssc this_class;
5605 regnode_ssc *oclass = NULL;
5606 I32 next_is_eval = 0;
5608 switch (REGNODE_TYPE(OP(scan))) {
5609 case WHILEM: /* End of (?:...)* . */
5610 scan = REGNODE_AFTER(scan);
5613 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5614 next = REGNODE_AFTER(scan);
5615 if ( ( REGNODE_TYPE(OP(next)) == EXACT
5616 && ! isEXACTFish(OP(next)))
5617 || (flags & SCF_DO_STCLASS))
5620 maxcount = REG_INFTY;
5621 next = regnext(scan);
5622 scan = REGNODE_AFTER(scan);
5626 if (flags & SCF_DO_SUBSTR)
5628 /* This will bypass the formal 'min += minnext * mincount'
5629 * calculation in the do_curly path, so assumes min width
5630 * of the PLUS payload is exactly one. */
5634 next = REGNODE_AFTER(scan);
5636 /* This temporary node can now be turned into EXACTFU, and
5637 * must, as regexec.c doesn't handle it */
5638 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5642 if ( STR_LEN(next) == 1
5643 && isALPHA_A(* STRING(next))
5644 && ( OP(next) == EXACTFAA
5645 || ( OP(next) == EXACTFU
5646 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5649 /* These differ in just one bit */
5650 U8 mask = ~ ('A' ^ 'a');
5652 assert(isALPHA_A(* STRING(next)));
5654 /* Then replace it by an ANYOFM node, with
5655 * the mask set to the complement of the
5656 * bit that differs between upper and lower
5657 * case, and the lowest code point of the
5658 * pair (which the '&' forces) */
5660 ARG_SET(next, *STRING(next) & mask);
5664 if (flags & SCF_DO_STCLASS) {
5666 maxcount = REG_INFTY;
5667 next = regnext(scan);
5668 scan = REGNODE_AFTER(scan);
5671 if (flags & SCF_DO_SUBSTR) {
5672 scan_commit(pRExC_state, data, minlenp, is_inf);
5673 /* Cannot extend fixed substrings */
5674 data->cur_is_floating = 1; /* float */
5676 is_inf = is_inf_internal = 1;
5677 scan = regnext(scan);
5678 goto optimize_curly_tail;
5680 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5681 && (scan->flags == stopparen))
5686 mincount = ARG1(scan);
5687 maxcount = ARG2(scan);
5689 next = regnext(scan);
5690 if (OP(scan) == CURLYX) {
5691 I32 lp = (data ? *(data->last_closep) : 0);
5692 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5694 scan = REGNODE_AFTER(scan);
5695 next_is_eval = (OP(scan) == EVAL);
5697 if (flags & SCF_DO_SUBSTR) {
5699 scan_commit(pRExC_state, data, minlenp, is_inf);
5700 /* Cannot extend fixed substrings */
5701 pos_before = data->pos_min;
5705 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5707 data->flags |= SF_IS_INF;
5709 if (flags & SCF_DO_STCLASS) {
5710 ssc_init(pRExC_state, &this_class);
5711 oclass = data->start_class;
5712 data->start_class = &this_class;
5713 f |= SCF_DO_STCLASS_AND;
5714 f &= ~SCF_DO_STCLASS_OR;
5716 /* Exclude from super-linear cache processing any {n,m}
5717 regops for which the combination of input pos and regex
5718 pos is not enough information to determine if a match
5721 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5722 regex pos at the \s*, the prospects for a match depend not
5723 only on the input position but also on how many (bar\s*)
5724 repeats into the {4,8} we are. */
5725 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5726 f &= ~SCF_WHILEM_VISITED_POS;
5728 /* This will finish on WHILEM, setting scan, or on NULL: */
5729 /* recurse study_chunk() on loop bodies */
5730 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5731 last, data, stopparen, recursed_depth, NULL,
5733 ? (f & ~SCF_DO_SUBSTR)
5735 , depth+1, mutate_ok);
5737 if (data && data->flags & SCF_SEEN_ACCEPT) {
5742 if (flags & SCF_DO_STCLASS)
5743 data->start_class = oclass;
5744 if (mincount == 0 || minnext == 0) {
5745 if (flags & SCF_DO_STCLASS_OR) {
5746 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5748 else if (flags & SCF_DO_STCLASS_AND) {
5749 /* Switch to OR mode: cache the old value of
5750 * data->start_class */
5752 StructCopy(data->start_class, and_withp, regnode_ssc);
5753 flags &= ~SCF_DO_STCLASS_AND;
5754 StructCopy(&this_class, data->start_class, regnode_ssc);
5755 flags |= SCF_DO_STCLASS_OR;
5756 ANYOF_FLAGS(data->start_class)
5757 |= SSC_MATCHES_EMPTY_STRING;
5759 } else { /* Non-zero len */
5760 if (flags & SCF_DO_STCLASS_OR) {
5761 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5762 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5764 else if (flags & SCF_DO_STCLASS_AND)
5765 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5766 flags &= ~SCF_DO_STCLASS;
5768 if (!scan) /* It was not CURLYX, but CURLY. */
5770 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5771 /* ? quantifier ok, except for (?{ ... }) */
5772 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5773 && (minnext == 0) && (deltanext == 0)
5774 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5775 && maxcount <= REG_INFTY/3) /* Complement check for big
5778 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5779 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5780 "Quantifier unexpected on zero-length expression "
5781 "in regex m/%" UTF8f "/",
5782 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5786 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5787 || min >= SSize_t_MAX - minnext * mincount )
5789 FAIL("Regexp out of space");
5792 min += minnext * mincount;
5793 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5794 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5795 is_inf |= is_inf_internal;
5797 delta = OPTIMIZE_INFTY;
5799 delta += (minnext + deltanext) * maxcount
5800 - minnext * mincount;
5803 if (data && data->flags & SCF_SEEN_ACCEPT) {
5804 if (flags & SCF_DO_SUBSTR) {
5805 scan_commit(pRExC_state, data, minlenp, is_inf);
5806 flags &= ~SCF_DO_SUBSTR;
5810 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
5812 /* Try powerful optimization CURLYX => CURLYN. */
5813 if ( OP(oscan) == CURLYX && data
5814 && data->flags & SF_IN_PAR
5815 && !(data->flags & SF_HAS_EVAL)
5816 && !deltanext && minnext == 1
5819 /* Try to optimize to CURLYN. */
5820 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
5821 regnode * const nxt1 = nxt;
5828 if (!REGNODE_SIMPLE(OP(nxt))
5829 && !(REGNODE_TYPE(OP(nxt)) == EXACT
5830 && STR_LEN(nxt) == 1))
5836 if (OP(nxt) != CLOSE)
5838 if (RExC_open_parens) {
5841 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5844 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5846 /* Now we know that nxt2 is the only contents: */
5847 oscan->flags = (U8)ARG(nxt);
5849 OP(nxt1) = NOTHING; /* was OPEN. */
5852 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5853 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5854 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5855 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5856 OP(nxt + 1) = OPTIMIZED; /* was count. */
5857 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5862 /* Try optimization CURLYX => CURLYM. */
5863 if ( OP(oscan) == CURLYX && data
5864 && !(data->flags & SF_HAS_PAR)
5865 && !(data->flags & SF_HAS_EVAL)
5866 && !deltanext /* atom is fixed width */
5867 && minnext != 0 /* CURLYM can't handle zero width */
5868 /* Nor characters whose fold at run-time may be
5869 * multi-character */
5870 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5873 /* XXXX How to optimize if data == 0? */
5874 /* Optimize to a simpler form. */
5875 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
5879 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5880 && (OP(nxt2) != WHILEM))
5882 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5883 /* Need to optimize away parenths. */
5884 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5885 /* Set the parenth number. */
5886 /* note that we have changed the type of oscan to CURLYM here */
5887 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
5889 oscan->flags = (U8)ARG(nxt);
5890 if (RExC_open_parens) {
5892 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5895 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5898 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5899 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5902 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5903 OP(nxt + 1) = OPTIMIZED; /* was count. */
5904 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5905 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5908 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5909 regnode *nnxt = regnext(nxt1);
5911 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
5912 ARG_SET(nxt1, nxt2 - nxt1);
5913 else if (nxt2 - nxt1 < U16_MAX)
5914 NEXT_OFF(nxt1) = nxt2 - nxt1;
5916 OP(nxt) = NOTHING; /* Cannot beautify */
5921 /* Optimize again: */
5922 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5924 NULL, stopparen, recursed_depth, NULL, 0,
5925 depth+1, mutate_ok);
5930 else if ((OP(oscan) == CURLYX)
5931 && (flags & SCF_WHILEM_VISITED_POS)
5932 /* See the comment on a similar expression above.
5933 However, this time it's not a subexpression
5934 we care about, but the expression itself. */
5935 && (maxcount == REG_INFTY)
5937 /* This stays as CURLYX, we can put the count/of pair. */
5938 /* Find WHILEM (as in regexec.c) */
5939 regnode *nxt = oscan + NEXT_OFF(oscan);
5941 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
5943 nxt = REGNODE_BEFORE(nxt);
5944 if (nxt->flags & 0xf) {
5945 /* we've already set whilem count on this node */
5946 } else if (++data->whilem_c < 16) {
5947 assert(data->whilem_c <= RExC_whilem_seen);
5948 nxt->flags = (U8)(data->whilem_c
5949 | (RExC_whilem_seen << 4)); /* On WHILEM */
5952 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5954 if (flags & SCF_DO_SUBSTR) {
5955 SV *last_str = NULL;
5956 STRLEN last_chrs = 0;
5957 int counted = mincount != 0;
5959 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5961 SSize_t b = pos_before >= data->last_start_min
5962 ? pos_before : data->last_start_min;
5964 const char * const s = SvPV_const(data->last_found, l);
5965 SSize_t old = b - data->last_start_min;
5969 old = utf8_hop_forward((U8*)s, old,
5970 (U8 *) SvEND(data->last_found))
5973 /* Get the added string: */
5974 last_str = newSVpvn_utf8(s + old, l, UTF);
5975 last_chrs = UTF ? utf8_length((U8*)(s + old),
5976 (U8*)(s + old + l)) : l;
5977 if (deltanext == 0 && pos_before == b) {
5978 /* What was added is a constant string */
5981 SvGROW(last_str, (mincount * l) + 1);
5982 repeatcpy(SvPVX(last_str) + l,
5983 SvPVX_const(last_str), l,
5985 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5986 /* Add additional parts. */
5987 SvCUR_set(data->last_found,
5988 SvCUR(data->last_found) - l);
5989 sv_catsv(data->last_found, last_str);
5991 SV * sv = data->last_found;
5993 SvUTF8(sv) && SvMAGICAL(sv) ?
5994 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5995 if (mg && mg->mg_len >= 0)
5996 mg->mg_len += last_chrs * (mincount-1);
5998 last_chrs *= mincount;
5999 data->last_end += l * (mincount - 1);
6002 /* start offset must point into the last copy */
6003 data->last_start_min += minnext * (mincount - 1);
6004 data->last_start_max =
6007 : data->last_start_max +
6008 (maxcount - 1) * (minnext + data->pos_delta);
6011 /* It is counted once already... */
6012 data->pos_min += minnext * (mincount - counted);
6014 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
6015 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
6016 " maxcount=%" UVuf " mincount=%" UVuf
6017 " data->pos_delta=%" UVuf "\n",
6018 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
6019 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
6020 if (deltanext != OPTIMIZE_INFTY)
6021 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
6022 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
6023 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
6025 if (deltanext == OPTIMIZE_INFTY
6026 || data->pos_delta == OPTIMIZE_INFTY
6027 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
6028 data->pos_delta = OPTIMIZE_INFTY;
6030 data->pos_delta += - counted * deltanext +
6031 (minnext + deltanext) * maxcount - minnext * mincount;
6032 if (mincount != maxcount) {
6033 /* Cannot extend fixed substrings found inside
6035 scan_commit(pRExC_state, data, minlenp, is_inf);
6036 if (mincount && last_str) {
6037 SV * const sv = data->last_found;
6038 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
6039 mg_find(sv, PERL_MAGIC_utf8) : NULL;
6043 sv_setsv(sv, last_str);
6044 data->last_end = data->pos_min;
6045 data->last_start_min = data->pos_min - last_chrs;
6046 data->last_start_max = is_inf
6048 : data->pos_min + data->pos_delta - last_chrs;
6050 data->cur_is_floating = 1; /* float */
6052 SvREFCNT_dec(last_str);
6054 if (data && (fl & SF_HAS_EVAL))
6055 data->flags |= SF_HAS_EVAL;
6056 optimize_curly_tail:
6057 rck_elide_nothing(oscan);
6061 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
6065 if (flags & SCF_DO_SUBSTR) {
6066 /* Cannot expect anything... */
6067 scan_commit(pRExC_state, data, minlenp, is_inf);
6068 data->cur_is_floating = 1; /* float */
6070 is_inf = is_inf_internal = 1;
6071 if (flags & SCF_DO_STCLASS_OR) {
6072 if (OP(scan) == CLUMP) {
6073 /* Actually is any start char, but very few code points
6074 * aren't start characters */
6075 ssc_match_all_cp(data->start_class);
6078 ssc_anything(data->start_class);
6081 flags &= ~SCF_DO_STCLASS;
6085 else if (OP(scan) == LNBREAK) {
6086 if (flags & SCF_DO_STCLASS) {
6087 if (flags & SCF_DO_STCLASS_AND) {
6088 ssc_intersection(data->start_class,
6089 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
6090 ssc_clear_locale(data->start_class);
6091 ANYOF_FLAGS(data->start_class)
6092 &= ~SSC_MATCHES_EMPTY_STRING;
6094 else if (flags & SCF_DO_STCLASS_OR) {
6095 ssc_union(data->start_class,
6096 PL_XPosix_ptrs[CC_VERTSPACE_],
6098 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6100 /* See commit msg for
6101 * 749e076fceedeb708a624933726e7989f2302f6a */
6102 ANYOF_FLAGS(data->start_class)
6103 &= ~SSC_MATCHES_EMPTY_STRING;
6105 flags &= ~SCF_DO_STCLASS;
6108 if (delta != OPTIMIZE_INFTY)
6109 delta++; /* Because of the 2 char string cr-lf */
6110 if (flags & SCF_DO_SUBSTR) {
6111 /* Cannot expect anything... */
6112 scan_commit(pRExC_state, data, minlenp, is_inf);
6114 if (data->pos_delta != OPTIMIZE_INFTY) {
6115 data->pos_delta += 1;
6117 data->cur_is_floating = 1; /* float */
6120 else if (REGNODE_SIMPLE(OP(scan))) {
6122 if (flags & SCF_DO_SUBSTR) {
6123 scan_commit(pRExC_state, data, minlenp, is_inf);
6127 if (flags & SCF_DO_STCLASS) {
6129 SV* my_invlist = NULL;
6132 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
6133 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
6135 /* Some of the logic below assumes that switching
6136 locale on will only add false positives. */
6141 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
6145 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6146 ssc_match_all_cp(data->start_class);
6151 SV* REG_ANY_invlist = _new_invlist(2);
6152 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
6154 if (flags & SCF_DO_STCLASS_OR) {
6155 ssc_union(data->start_class,
6157 TRUE /* TRUE => invert, hence all but \n
6161 else if (flags & SCF_DO_STCLASS_AND) {
6162 ssc_intersection(data->start_class,
6164 TRUE /* TRUE => invert */
6166 ssc_clear_locale(data->start_class);
6168 SvREFCNT_dec_NN(REG_ANY_invlist);
6180 if (flags & SCF_DO_STCLASS_AND)
6181 ssc_and(pRExC_state, data->start_class,
6182 (regnode_charclass *) scan);
6184 ssc_or(pRExC_state, data->start_class,
6185 (regnode_charclass *) scan);
6190 SV* cp_list = get_ANYOFHbbm_contents(scan);
6192 if (flags & SCF_DO_STCLASS_OR) {
6193 ssc_union(data->start_class, cp_list, invert);
6195 else if (flags & SCF_DO_STCLASS_AND) {
6196 ssc_intersection(data->start_class, cp_list, invert);
6199 SvREFCNT_dec_NN(cp_list);
6203 case NANYOFM: /* NANYOFM already contains the inversion of the
6204 input ANYOF data, so, unlike things like
6205 NPOSIXA, don't change 'invert' to TRUE */
6209 SV* cp_list = get_ANYOFM_contents(scan);
6211 if (flags & SCF_DO_STCLASS_OR) {
6212 ssc_union(data->start_class, cp_list, invert);
6214 else if (flags & SCF_DO_STCLASS_AND) {
6215 ssc_intersection(data->start_class, cp_list, invert);
6218 SvREFCNT_dec_NN(cp_list);
6227 cp_list = _add_range_to_invlist(cp_list,
6229 ANYOFRbase(scan) + ANYOFRdelta(scan));
6231 if (flags & SCF_DO_STCLASS_OR) {
6232 ssc_union(data->start_class, cp_list, invert);
6234 else if (flags & SCF_DO_STCLASS_AND) {
6235 ssc_intersection(data->start_class, cp_list, invert);
6238 SvREFCNT_dec_NN(cp_list);
6247 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6248 if (flags & SCF_DO_STCLASS_AND) {
6249 bool was_there = cBOOL(
6250 ANYOF_POSIXL_TEST(data->start_class,
6252 ANYOF_POSIXL_ZERO(data->start_class);
6253 if (was_there) { /* Do an AND */
6254 ANYOF_POSIXL_SET(data->start_class, namedclass);
6256 /* No individual code points can now match */
6257 data->start_class->invlist
6258 = sv_2mortal(_new_invlist(0));
6261 int complement = namedclass + ((invert) ? -1 : 1);
6263 assert(flags & SCF_DO_STCLASS_OR);
6265 /* If the complement of this class was already there,
6266 * the result is that they match all code points,
6267 * (\d + \D == everything). Remove the classes from
6268 * future consideration. Locale is not relevant in
6270 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6271 ssc_match_all_cp(data->start_class);
6272 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6273 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6275 else { /* The usual case; just add this class to the
6277 ANYOF_POSIXL_SET(data->start_class, namedclass);
6282 case NPOSIXA: /* For these, we always know the exact set of
6287 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6288 goto join_posix_and_ascii;
6296 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6298 /* NPOSIXD matches all upper Latin1 code points unless the
6299 * target string being matched is UTF-8, which is
6300 * unknowable until match time. Since we are going to
6301 * invert, we want to get rid of all of them so that the
6302 * inversion will match all */
6303 if (OP(scan) == NPOSIXD) {
6304 _invlist_subtract(my_invlist, PL_UpperLatin1,
6308 join_posix_and_ascii:
6310 if (flags & SCF_DO_STCLASS_AND) {
6311 ssc_intersection(data->start_class, my_invlist, invert);
6312 ssc_clear_locale(data->start_class);
6315 assert(flags & SCF_DO_STCLASS_OR);
6316 ssc_union(data->start_class, my_invlist, invert);
6318 SvREFCNT_dec(my_invlist);
6320 if (flags & SCF_DO_STCLASS_OR)
6321 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6322 flags &= ~SCF_DO_STCLASS;
6325 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
6326 data->flags |= (OP(scan) == MEOL
6329 scan_commit(pRExC_state, data, minlenp, is_inf);
6332 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
6333 /* Lookbehind, or need to calculate parens/evals/stclass: */
6334 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6335 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6337 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6338 || OP(scan) == UNLESSM )
6340 /* Negative Lookahead/lookbehind
6341 In this case we can't do fixed string optimisation.
6344 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
6345 SSize_t deltanext, minnext;
6346 SSize_t fake_last_close = 0;
6347 regnode *fake_last_close_op = NULL;
6348 regnode *cur_last_close_op;
6351 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6353 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6355 data_fake.whilem_c = data->whilem_c;
6356 data_fake.last_closep = data->last_closep;
6357 data_fake.last_close_opp = data->last_close_opp;
6360 data_fake.last_closep = &fake_last_close;
6361 data_fake.last_close_opp = &fake_last_close_op;
6364 /* remember the last_close_op we saw so we can see if
6365 * we are dealing with variable length lookbehind that
6366 * contains capturing buffers, which are considered
6368 cur_last_close_op= *(data_fake.last_close_opp);
6370 data_fake.pos_delta = delta;
6371 if ( flags & SCF_DO_STCLASS && !scan->flags
6372 && OP(scan) == IFMATCH ) { /* Lookahead */
6373 ssc_init(pRExC_state, &intrnl);
6374 data_fake.start_class = &intrnl;
6375 f |= SCF_DO_STCLASS_AND;
6377 if (flags & SCF_WHILEM_VISITED_POS)
6378 f |= SCF_WHILEM_VISITED_POS;
6379 next = regnext(scan);
6380 nscan = REGNODE_AFTER(scan);
6382 /* recurse study_chunk() for lookahead body */
6383 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6384 last, &data_fake, stopparen,
6385 recursed_depth, NULL, f, depth+1,
6390 || deltanext > (I32) U8_MAX
6391 || minnext > (I32)U8_MAX
6392 || minnext + deltanext > (I32)U8_MAX)
6394 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6398 /* The 'next_off' field has been repurposed to count the
6399 * additional starting positions to try beyond the initial
6400 * one. (This leaves it at 0 for non-variable length
6401 * matches to avoid breakage for those not using this
6404 scan->next_off = deltanext;
6406 /* See a CLOSE op inside this lookbehind? */
6407 cur_last_close_op != *(data_fake.last_close_opp)
6408 /* and not doing restudy. see: restudied */
6409 && !(flags & SCF_TRIE_DOING_RESTUDY)
6411 /* this is positive variable length lookbehind with
6412 * capture buffers inside of it */
6413 ckWARNexperimental_with_arg(RExC_parse,
6414 WARN_EXPERIMENTAL__VLB,
6415 "Variable length %s lookbehind with capturing is experimental",
6416 is_positive ? "positive" : "negative");
6419 scan->flags = (U8)minnext + deltanext;
6422 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6424 if (data_fake.flags & SF_HAS_EVAL)
6425 data->flags |= SF_HAS_EVAL;
6426 data->whilem_c = data_fake.whilem_c;
6428 if (f & SCF_DO_STCLASS_AND) {
6429 if (flags & SCF_DO_STCLASS_OR) {
6430 /* OR before, AND after: ideally we would recurse with
6431 * data_fake to get the AND applied by study of the
6432 * remainder of the pattern, and then derecurse;
6433 * *** HACK *** for now just treat as "no information".
6434 * See [perl #56690].
6436 ssc_init(pRExC_state, data->start_class);
6438 /* AND before and after: combine and continue. These
6439 * assertions are zero-length, so can match an EMPTY
6441 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6442 ANYOF_FLAGS(data->start_class)
6443 |= SSC_MATCHES_EMPTY_STRING;
6446 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
6448 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6450 /* Positive Lookahead/lookbehind
6451 In this case we can do fixed string optimisation,
6452 but we must be careful about it. Note in the case of
6453 lookbehind the positions will be offset by the minimum
6454 length of the pattern, something we won't know about
6455 until after the recurse.
6457 SSize_t deltanext, fake_last_close = 0;
6458 regnode *last_close_op = NULL;
6461 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6462 /* We use SAVEFREEPV so that when the full compile
6463 is finished perl will clean up the allocated
6464 minlens when it's all done. This way we don't
6465 have to worry about freeing them when we know
6466 they wont be used, which would be a pain.
6469 Newx( minnextp, 1, SSize_t );
6470 SAVEFREEPV(minnextp);
6473 StructCopy(data, &data_fake, scan_data_t);
6474 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6477 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6478 data_fake.last_found=newSVsv(data->last_found);
6482 data_fake.last_closep = &fake_last_close;
6483 data_fake.last_close_opp = &fake_last_close_opp;
6485 data_fake.flags = 0;
6486 data_fake.substrs[0].flags = 0;
6487 data_fake.substrs[1].flags = 0;
6488 data_fake.pos_delta = delta;
6490 data_fake.flags |= SF_IS_INF;
6491 if ( flags & SCF_DO_STCLASS && !scan->flags
6492 && OP(scan) == IFMATCH ) { /* Lookahead */
6493 ssc_init(pRExC_state, &intrnl);
6494 data_fake.start_class = &intrnl;
6495 f |= SCF_DO_STCLASS_AND;
6497 if (flags & SCF_WHILEM_VISITED_POS)
6498 f |= SCF_WHILEM_VISITED_POS;
6499 next = regnext(scan);
6500 nscan = REGNODE_AFTER(scan);
6502 /* positive lookahead study_chunk() recursion */
6503 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6504 &deltanext, last, &data_fake,
6505 stopparen, recursed_depth, NULL,
6506 f, depth+1, mutate_ok);
6508 assert(0); /* This code has never been tested since this
6509 is normally not compiled */
6511 || deltanext > (I32) U8_MAX
6512 || *minnextp > (I32)U8_MAX
6513 || *minnextp + deltanext > (I32)U8_MAX)
6515 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6520 scan->next_off = deltanext;
6522 scan->flags = (U8)*minnextp + deltanext;
6527 if (f & SCF_DO_STCLASS_AND) {
6528 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6529 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6532 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6534 if (data_fake.flags & SF_HAS_EVAL)
6535 data->flags |= SF_HAS_EVAL;
6536 data->whilem_c = data_fake.whilem_c;
6537 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6539 if (RExC_rx->minlen < *minnextp)
6540 RExC_rx->minlen = *minnextp;
6541 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6542 SvREFCNT_dec_NN(data_fake.last_found);
6544 for (i = 0; i < 2; i++) {
6545 if (data_fake.substrs[i].minlenp != minlenp) {
6546 data->substrs[i].min_offset =
6547 data_fake.substrs[i].min_offset;
6548 data->substrs[i].max_offset =
6549 data_fake.substrs[i].max_offset;
6550 data->substrs[i].minlenp =
6551 data_fake.substrs[i].minlenp;
6552 data->substrs[i].lookbehind += scan->flags;
6560 else if (OP(scan) == OPEN) {
6561 if (stopparen != (I32)ARG(scan))
6564 else if (OP(scan) == CLOSE) {
6565 if (stopparen == (I32)ARG(scan)) {
6568 if ((I32)ARG(scan) == is_par) {
6569 next = regnext(scan);
6571 if ( next && (OP(next) != WHILEM) && next < last)
6572 is_par = 0; /* Disable optimization */
6575 *(data->last_closep) = ARG(scan);
6576 *(data->last_close_opp) = scan;
6579 else if (OP(scan) == EVAL) {
6581 data->flags |= SF_HAS_EVAL;
6583 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
6584 if (flags & SCF_DO_SUBSTR) {
6585 scan_commit(pRExC_state, data, minlenp, is_inf);
6586 flags &= ~SCF_DO_SUBSTR;
6588 if (OP(scan)==ACCEPT) {
6589 /* m{(*ACCEPT)x} does not have to start with 'x' */
6590 flags &= ~SCF_DO_STCLASS;
6592 data->flags |= SCF_SEEN_ACCEPT;
6597 else if (OP(scan) == COMMIT) {
6598 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
6599 * must not end up with "abcxyz" as a fixed substring else we'll
6600 * skip straight to attempting to match at offset 4.
6602 if (flags & SCF_DO_SUBSTR) {
6603 scan_commit(pRExC_state, data, minlenp, is_inf);
6604 flags &= ~SCF_DO_SUBSTR;
6607 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6609 if (flags & SCF_DO_SUBSTR) {
6610 scan_commit(pRExC_state, data, minlenp, is_inf);
6611 data->cur_is_floating = 1; /* float */
6613 is_inf = is_inf_internal = 1;
6614 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6615 ssc_anything(data->start_class);
6616 flags &= ~SCF_DO_STCLASS;
6618 else if (OP(scan) == GPOS) {
6619 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6620 !(delta || is_inf || (data && data->pos_delta)))
6622 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6623 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6624 if (RExC_rx->gofs < (STRLEN)min)
6625 RExC_rx->gofs = min;
6627 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6631 #ifdef TRIE_STUDY_OPT
6632 #ifdef FULL_TRIE_STUDY
6633 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
6634 /* NOTE - There is similar code to this block above for handling
6635 BRANCH nodes on the initial study. If you change stuff here
6637 regnode *trie_node= scan;
6638 regnode *tail= regnext(scan);
6639 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6640 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6643 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6644 /* Cannot merge strings after this. */
6645 scan_commit(pRExC_state, data, minlenp, is_inf);
6647 if (flags & SCF_DO_STCLASS)
6648 ssc_init_zero(pRExC_state, &accum);
6654 const regnode *nextbranch= NULL;
6657 for ( word=1 ; word <= trie->wordcount ; word++)
6659 SSize_t deltanext = 0, minnext = 0;
6660 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
6661 SSize_t fake_last_close = 0;
6662 regnode *fake_last_close_op = NULL;
6663 regnode_ssc this_class;
6665 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6667 data_fake.whilem_c = data->whilem_c;
6668 data_fake.last_closep = data->last_closep;
6669 data_fake.last_close_opp = data->last_close_opp;
6672 data_fake.last_closep = &fake_last_close;
6673 data_fake.last_close_opp = &fake_last_close_op;
6675 data_fake.pos_delta = delta;
6676 if (flags & SCF_DO_STCLASS) {
6677 ssc_init(pRExC_state, &this_class);
6678 data_fake.start_class = &this_class;
6679 f |= SCF_DO_STCLASS_AND;
6681 if (flags & SCF_WHILEM_VISITED_POS)
6682 f |= SCF_WHILEM_VISITED_POS;
6684 if (trie->jump[word]) {
6686 nextbranch = trie_node + trie->jump[0];
6687 scan= trie_node + trie->jump[word];
6688 /* We go from the jump point to the branch that follows
6689 it. Note this means we need the vestigal unused
6690 branches even though they arent otherwise used. */
6691 /* optimise study_chunk() for TRIE */
6692 minnext = study_chunk(pRExC_state, &scan, minlenp,
6693 &deltanext, (regnode *)nextbranch, &data_fake,
6694 stopparen, recursed_depth, NULL, f, depth+1,
6697 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
6698 nextbranch= regnext((regnode*)nextbranch);
6700 if (min1 > (SSize_t)(minnext + trie->minlen))
6701 min1 = minnext + trie->minlen;
6702 if (deltanext == OPTIMIZE_INFTY) {
6703 is_inf = is_inf_internal = 1;
6704 max1 = OPTIMIZE_INFTY;
6705 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6706 max1 = minnext + deltanext + trie->maxlen;
6708 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6710 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6711 if ( stopmin > min + min1)
6712 stopmin = min + min1;
6713 flags &= ~SCF_DO_SUBSTR;
6715 data->flags |= SCF_SEEN_ACCEPT;
6718 if (data_fake.flags & SF_HAS_EVAL)
6719 data->flags |= SF_HAS_EVAL;
6720 data->whilem_c = data_fake.whilem_c;
6722 if (flags & SCF_DO_STCLASS)
6723 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6725 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
6727 if (flags & SCF_DO_SUBSTR) {
6728 data->pos_min += min1;
6729 data->pos_delta += max1 - min1;
6730 if (max1 != min1 || is_inf)
6731 data->cur_is_floating = 1; /* float */
6734 if (delta != OPTIMIZE_INFTY) {
6735 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6736 delta += max1 - min1;
6738 delta = OPTIMIZE_INFTY;
6740 if (flags & SCF_DO_STCLASS_OR) {
6741 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6743 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6744 flags &= ~SCF_DO_STCLASS;
6747 else if (flags & SCF_DO_STCLASS_AND) {
6749 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6750 flags &= ~SCF_DO_STCLASS;
6753 /* Switch to OR mode: cache the old value of
6754 * data->start_class */
6756 StructCopy(data->start_class, and_withp, regnode_ssc);
6757 flags &= ~SCF_DO_STCLASS_AND;
6758 StructCopy(&accum, data->start_class, regnode_ssc);
6759 flags |= SCF_DO_STCLASS_OR;
6763 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
6767 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
6768 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6771 min += trie->minlen;
6772 delta += (trie->maxlen - trie->minlen);
6773 flags &= ~SCF_DO_STCLASS; /* xxx */
6774 if (flags & SCF_DO_SUBSTR) {
6775 /* Cannot expect anything... */
6776 scan_commit(pRExC_state, data, minlenp, is_inf);
6777 data->pos_min += trie->minlen;
6778 data->pos_delta += (trie->maxlen - trie->minlen);
6779 if (trie->maxlen != trie->minlen)
6780 data->cur_is_floating = 1; /* float */
6782 if (trie->jump) /* no more substrings -- for now /grr*/
6783 flags &= ~SCF_DO_SUBSTR;
6786 #endif /* old or new */
6787 #endif /* TRIE_STUDY_OPT */
6789 else if (OP(scan) == REGEX_SET) {
6790 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6791 " before optimization", REGNODE_NAME(REGEX_SET));
6794 /* Else: zero-length, ignore. */
6795 scan = regnext(scan);
6800 /* we need to unwind recursion. */
6803 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
6804 DEBUG_PEEP("fend", scan, depth, flags);
6806 /* restore previous context */
6807 last = frame->last_regnode;
6808 scan = frame->next_regnode;
6809 stopparen = frame->stopparen;
6810 recursed_depth = frame->prev_recursed_depth;
6812 RExC_frame_last = frame->prev_frame;
6813 frame = frame->this_prev_frame;
6814 goto fake_study_recurse;
6818 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
6820 /* is this pattern infinite? Eg, consider /(a|b+)/ */
6821 if (is_inf_internal)
6822 delta = OPTIMIZE_INFTY;
6824 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
6825 if (min > stopmin) {
6827 At this point 'min' represents the minimum length string we can
6828 match while *ignoring* the implication of ACCEPT, and 'delta'
6829 represents the difference between the minimum length and maximum
6830 length, and if the pattern matches an infinitely long string
6831 (consider the + and * quantifiers) then we use the special delta
6832 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
6833 minimum length that can be matched *and* accepted.
6835 A pattern is accepted when matching was successful *and*
6836 complete, and thus there is no further matching needing to be
6837 done, no backtracking to occur, etc. Prior to the introduction
6838 of ACCEPT the only opcode that signaled acceptance was the END
6839 opcode, which is always the very last opcode in a regex program.
6840 ACCEPT is thus conceptually an early successful return out of
6841 the matching process. stopmin starts out as OPTIMIZE_INFTY to
6842 represent "the entire pattern", and is ratched down to the
6843 "current min" if necessary when an ACCEPT opcode is encountered.
6845 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
6846 and we now need to account for it in both min and delta.
6847 Consider that in a pattern /AB/ normally the min length it can
6848 match can be computed as min(A)+min(B). But (*ACCEPT) means
6849 that it might be something else, not even neccesarily min(A) at
6852 A = /(foo(*ACCEPT)|x+)/
6854 AB = /(foo(*ACCEPT)|x+)whop/
6856 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
6857 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
6858 "whop", and the delta of 0 as the pattern is of fixed length, the
6859 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
6860 When handling AB we expect to see a min of 5 for "xwhop", and a
6861 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
6862 for "foo". This should result in a final min of 3 for "foo", and
6863 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
6865 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
6866 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
6867 stop min would be 4 for "dude". This should result in a final
6868 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
6870 When min is smaller than stopmin then we can ignore it. In the
6871 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
6872 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
6873 the ACCEPT doesn't reduce the minimum length of the string that
6874 might be matched, nor affect the maximum length.
6876 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
6877 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
6878 "foo". We currently turn this into a min of 3 for "foo" and a
6879 delta of 3 for "foobar" even though technically "foobar" isn't
6880 possible. ACCEPT affects some aspects of the optimizer, like
6881 length computations and mandatory substring optimizations, but
6882 there are other optimzations this routine perfoms that are not
6883 affected and this compromise simplifies implementation.
6885 It might be helpful to consider that this C function is called
6886 recursively on the pattern in a bottom up fashion, and that the
6887 min returned by a nested call may be marked as coming from an
6888 ACCEPT, causing its callers to treat the returned min as a
6889 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
6890 multiple calls into this function in different ways.
6893 if (OPTIMIZE_INFTY - delta >= min - stopmin)
6894 delta += min - stopmin;
6896 delta = OPTIMIZE_INFTY;
6903 if (flags & SCF_DO_SUBSTR && is_inf)
6904 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6905 if (is_par > (I32)U8_MAX)
6907 if (is_par && pars==1 && data) {
6908 data->flags |= SF_IN_PAR;
6909 data->flags &= ~SF_HAS_PAR;
6911 else if (pars && data) {
6912 data->flags |= SF_HAS_PAR;
6913 data->flags &= ~SF_IN_PAR;
6915 if (flags & SCF_DO_STCLASS_OR)
6916 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6917 if (flags & SCF_TRIE_RESTUDY)
6918 data->flags |= SCF_TRIE_RESTUDY;
6921 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6922 if (min > OPTIMIZE_INFTY - delta)
6923 RExC_maxlen = OPTIMIZE_INFTY;
6924 else if (RExC_maxlen < min + delta)
6925 RExC_maxlen = min + delta;
6927 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
6931 /* add a data member to the struct reg_data attached to this regex, it should
6932 * always return a non-zero return. the 's' argument is the type of the items
6933 * being added and the n is the number of items. The length of 's' should match
6934 * the number of items. */
6936 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6938 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
6940 PERL_ARGS_ASSERT_ADD_DATA;
6942 /* in the below expression we have (count + n - 1), the minus one is there
6943 * because the struct that we allocate already contains a slot for 1 data
6944 * item, so we do not need to allocate it the first time. IOW, the
6945 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
6946 * to allocate. See struct reg_data in regcomp.h
6948 Renewc(RExC_rxi->data,
6949 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
6950 char, struct reg_data);
6951 /* however in the data->what expression we use (count + n) and do not
6952 * subtract one from the result because the data structure contains a
6953 * pointer to an array, and does not allocate the first element as part of
6954 * the data struct. */
6956 Renew(RExC_rxi->data->what, (count + n), U8);
6958 /* when count == 1 it means we have not initialized anything.
6959 * we always fill the 0 slot of the data array with a '%' entry, which
6960 * means "zero" (all the other types are letters) which exists purely
6961 * so the return from add_data is ALWAYS true, so we can tell it apart
6962 * from a "no value" idx=0 in places where we would return an index
6963 * into add_data. This is particularly important with the new "single
6964 * pass, usually, but not always" strategy that we use, where the code
6965 * will use a 0 to represent "not able to compute this yet".
6967 Newx(RExC_rxi->data->what, n+1, U8);
6968 /* fill in the placeholder slot of 0 with a what of '%', we use
6969 * this because it sorta looks like a zero (0/0) and it is not a letter
6970 * like any of the other "whats", this type should never be created
6971 * any other way but here. '%' happens to also not appear in this
6972 * file for any other reason (at the time of writing this comment)*/
6973 RExC_rxi->data->what[0]= '%';
6974 RExC_rxi->data->data[0]= NULL;
6976 RExC_rxi->data->count = count + n;
6977 Copy(s, RExC_rxi->data->what + count, n, U8);
6982 /*XXX: todo make this not included in a non debugging perl, but appears to be
6983 * used anyway there, in 'use re' */
6984 #ifndef PERL_IN_XSUB_RE
6986 Perl_reginitcolors(pTHX)
6988 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6990 char *t = savepv(s);
6994 t = strchr(t, '\t');
7000 PL_colors[i] = t = (char *)"";
7005 PL_colors[i++] = (char *)"";
7012 #ifdef TRIE_STUDY_OPT
7013 /* search for "restudy" in this file for a detailed explanation */
7014 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
7017 (data.flags & SCF_TRIE_RESTUDY) \
7025 #define CHECK_RESTUDY_GOTO_butfirst
7029 * pregcomp - compile a regular expression into internal code
7031 * Decides which engine's compiler to call based on the hint currently in
7035 #ifndef PERL_IN_XSUB_RE
7037 /* return the currently in-scope regex engine (or the default if none) */
7039 regexp_engine const *
7040 Perl_current_re_engine(pTHX)
7042 if (IN_PERL_COMPILETIME) {
7043 HV * const table = GvHV(PL_hintgv);
7046 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
7047 return &PL_core_reg_engine;
7048 ptr = hv_fetchs(table, "regcomp", FALSE);
7049 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
7050 return &PL_core_reg_engine;
7051 return INT2PTR(regexp_engine*, SvIV(*ptr));
7055 if (!PL_curcop->cop_hints_hash)
7056 return &PL_core_reg_engine;
7057 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
7058 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
7059 return &PL_core_reg_engine;
7060 return INT2PTR(regexp_engine*, SvIV(ptr));
7066 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
7068 regexp_engine const *eng = current_re_engine();
7069 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7071 PERL_ARGS_ASSERT_PREGCOMP;
7073 /* Dispatch a request to compile a regexp to correct regexp engine. */
7075 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
7078 return CALLREGCOMP_ENG(eng, pattern, flags);
7083 =for apidoc re_compile
7085 Compile the regular expression pattern C<pattern>, returning a pointer to the
7086 compiled object for later matching with the internal regex engine.
7088 This function is typically used by a custom regexp engine C<.comp()> function
7089 to hand off to the core regexp engine those patterns it doesn't want to handle
7090 itself (typically passing through the same flags it was called with). In
7091 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
7092 to compile using the currently active regexp engine.
7094 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
7095 pointer to the input. Otherwise the PV is extracted and treated like a string
7096 representing a pattern. See L<perlre>.
7098 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
7099 all begin with C<RXf_>.
7103 * public entry point for the perl core's own regex compiling code.
7104 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
7105 * pattern rather than a list of OPs, and uses the internal engine rather
7106 * than the current one */
7109 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
7111 SV *pat = pattern; /* defeat constness! */
7113 PERL_ARGS_ASSERT_RE_COMPILE;
7115 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
7116 #ifdef PERL_IN_XSUB_RE
7119 &PL_core_reg_engine,
7121 NULL, NULL, rx_flags, 0);
7125 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
7129 if (--cbs->refcnt > 0)
7131 for (n = 0; n < cbs->count; n++) {
7132 REGEXP *rx = cbs->cb[n].src_regex;
7134 cbs->cb[n].src_regex = NULL;
7135 SvREFCNT_dec_NN(rx);
7143 static struct reg_code_blocks *
7144 S_alloc_code_blocks(pTHX_ int ncode)
7146 struct reg_code_blocks *cbs;
7147 Newx(cbs, 1, struct reg_code_blocks);
7150 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
7152 Newx(cbs->cb, ncode, struct reg_code_block);
7159 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
7160 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
7161 * point to the realloced string and length.
7163 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
7167 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
7168 char **pat_p, STRLEN *plen_p, int num_code_blocks)
7170 U8 *const src = (U8*)*pat_p;
7175 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7177 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7178 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
7180 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
7181 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
7184 while (s < *plen_p) {
7185 append_utf8_from_native_byte(src[s], &d);
7187 if (n < num_code_blocks) {
7188 assert(pRExC_state->code_blocks);
7189 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
7190 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
7191 assert(*(d - 1) == '(');
7194 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
7195 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
7196 assert(*(d - 1) == ')');
7205 *pat_p = (char*) dst;
7207 RExC_orig_utf8 = RExC_utf8 = 1;
7212 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
7213 * while recording any code block indices, and handling overloading,
7214 * nested qr// objects etc. If pat is null, it will allocate a new
7215 * string, or just return the first arg, if there's only one.
7217 * Returns the malloced/updated pat.
7218 * patternp and pat_count is the array of SVs to be concatted;
7219 * oplist is the optional list of ops that generated the SVs;
7220 * recompile_p is a pointer to a boolean that will be set if
7221 * the regex will need to be recompiled.
7222 * delim, if non-null is an SV that will be inserted between each element
7226 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
7227 SV *pat, SV ** const patternp, int pat_count,
7228 OP *oplist, bool *recompile_p, SV *delim)
7232 bool use_delim = FALSE;
7233 bool alloced = FALSE;
7235 /* if we know we have at least two args, create an empty string,
7236 * then concatenate args to that. For no args, return an empty string */
7237 if (!pat && pat_count != 1) {
7243 for (svp = patternp; svp < patternp + pat_count; svp++) {
7246 STRLEN orig_patlen = 0;
7248 SV *msv = use_delim ? delim : *svp;
7249 if (!msv) msv = &PL_sv_undef;
7251 /* if we've got a delimiter, we go round the loop twice for each
7252 * svp slot (except the last), using the delimiter the second
7261 if (SvTYPE(msv) == SVt_PVAV) {
7262 /* we've encountered an interpolated array within
7263 * the pattern, e.g. /...@a..../. Expand the list of elements,
7264 * then recursively append elements.
7265 * The code in this block is based on S_pushav() */
7267 AV *const av = (AV*)msv;
7268 const SSize_t maxarg = AvFILL(av) + 1;
7272 assert(oplist->op_type == OP_PADAV
7273 || oplist->op_type == OP_RV2AV);
7274 oplist = OpSIBLING(oplist);
7277 if (SvRMAGICAL(av)) {
7280 Newx(array, maxarg, SV*);
7282 for (i=0; i < maxarg; i++) {
7283 SV ** const svp = av_fetch(av, i, FALSE);
7284 array[i] = svp ? *svp : &PL_sv_undef;
7288 array = AvARRAY(av);
7291 pat = S_concat_pat(aTHX_ pRExC_state, pat,
7292 array, maxarg, NULL, recompile_p,
7294 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
7297 pat = newSVpvs_flags("", SVs_TEMP);
7304 /* we make the assumption here that each op in the list of
7305 * op_siblings maps to one SV pushed onto the stack,
7306 * except for code blocks, with have both an OP_NULL and
7308 * This allows us to match up the list of SVs against the
7309 * list of OPs to find the next code block.
7311 * Note that PUSHMARK PADSV PADSV ..
7313 * PADRANGE PADSV PADSV ..
7314 * so the alignment still works. */
7317 if (oplist->op_type == OP_NULL
7318 && (oplist->op_flags & OPf_SPECIAL))
7320 assert(n < pRExC_state->code_blocks->count);
7321 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
7322 pRExC_state->code_blocks->cb[n].block = oplist;
7323 pRExC_state->code_blocks->cb[n].src_regex = NULL;
7326 oplist = OpSIBLING(oplist); /* skip CONST */
7329 oplist = OpSIBLING(oplist);;
7332 /* apply magic and QR overloading to arg */
7335 if (SvROK(msv) && SvAMAGIC(msv)) {
7336 SV *sv = AMG_CALLunary(msv, regexp_amg);
7340 if (SvTYPE(sv) != SVt_REGEXP)
7341 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
7346 /* try concatenation overload ... */
7347 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
7348 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
7351 /* overloading involved: all bets are off over literal
7352 * code. Pretend we haven't seen it */
7354 pRExC_state->code_blocks->count -= n;
7358 /* ... or failing that, try "" overload */
7359 while (SvAMAGIC(msv)
7360 && (sv = AMG_CALLunary(msv, string_amg))
7364 && SvRV(msv) == SvRV(sv))
7369 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
7373 /* this is a partially unrolled
7374 * sv_catsv_nomg(pat, msv);
7375 * that allows us to adjust code block indices if
7378 char *dst = SvPV_force_nomg(pat, dlen);
7380 if (SvUTF8(msv) && !SvUTF8(pat)) {
7381 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
7382 sv_setpvn(pat, dst, dlen);
7385 sv_catsv_nomg(pat, msv);
7389 /* We have only one SV to process, but we need to verify
7390 * it is properly null terminated or we will fail asserts
7391 * later. In theory we probably shouldn't get such SV's,
7392 * but if we do we should handle it gracefully. */
7393 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7394 /* not a string, or a string with a trailing null */
7397 /* a string with no trailing null, we need to copy it
7398 * so it has a trailing null */
7399 pat = sv_2mortal(newSVsv(msv));
7404 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7407 /* extract any code blocks within any embedded qr//'s */
7408 if (rx && SvTYPE(rx) == SVt_REGEXP
7409 && RX_ENGINE((REGEXP*)rx)->op_comp)
7412 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7413 if (ri->code_blocks && ri->code_blocks->count) {
7415 /* the presence of an embedded qr// with code means
7416 * we should always recompile: the text of the
7417 * qr// may not have changed, but it may be a
7418 * different closure than last time */
7420 if (pRExC_state->code_blocks) {
7421 int new_count = pRExC_state->code_blocks->count
7422 + ri->code_blocks->count;
7423 Renew(pRExC_state->code_blocks->cb,
7424 new_count, struct reg_code_block);
7425 pRExC_state->code_blocks->count = new_count;
7428 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7429 ri->code_blocks->count);
7431 for (i=0; i < ri->code_blocks->count; i++) {
7432 struct reg_code_block *src, *dst;
7433 STRLEN offset = orig_patlen
7434 + ReANY((REGEXP *)rx)->pre_prefix;
7435 assert(n < pRExC_state->code_blocks->count);
7436 src = &ri->code_blocks->cb[i];
7437 dst = &pRExC_state->code_blocks->cb[n];
7438 dst->start = src->start + offset;
7439 dst->end = src->end + offset;
7440 dst->block = src->block;
7441 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7450 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7459 /* see if there are any run-time code blocks in the pattern.
7460 * False positives are allowed */
7463 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7464 char *pat, STRLEN plen)
7469 PERL_UNUSED_CONTEXT;
7471 for (s = 0; s < plen; s++) {
7472 if ( pRExC_state->code_blocks
7473 && n < pRExC_state->code_blocks->count
7474 && s == pRExC_state->code_blocks->cb[n].start)
7476 s = pRExC_state->code_blocks->cb[n].end;
7480 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7482 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7484 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7491 /* Handle run-time code blocks. We will already have compiled any direct
7492 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7493 * copy of it, but with any literal code blocks blanked out and
7494 * appropriate chars escaped; then feed it into
7496 * eval "qr'modified_pattern'"
7500 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7504 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7506 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7507 * and merge them with any code blocks of the original regexp.
7509 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7510 * instead, just save the qr and return FALSE; this tells our caller that
7511 * the original pattern needs upgrading to utf8.
7515 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7516 char *pat, STRLEN plen)
7520 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7522 if (pRExC_state->runtime_code_qr) {
7523 /* this is the second time we've been called; this should
7524 * only happen if the main pattern got upgraded to utf8
7525 * during compilation; re-use the qr we compiled first time
7526 * round (which should be utf8 too)
7528 qr = pRExC_state->runtime_code_qr;
7529 pRExC_state->runtime_code_qr = NULL;
7530 assert(RExC_utf8 && SvUTF8(qr));
7536 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7540 /* determine how many extra chars we need for ' and \ escaping */
7541 for (s = 0; s < plen; s++) {
7542 if (pat[s] == '\'' || pat[s] == '\\')
7546 Newx(newpat, newlen, char);
7548 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7550 for (s = 0; s < plen; s++) {
7551 if ( pRExC_state->code_blocks
7552 && n < pRExC_state->code_blocks->count
7553 && s == pRExC_state->code_blocks->cb[n].start)
7555 /* blank out literal code block so that they aren't
7556 * recompiled: eg change from/to:
7566 assert(pat[s] == '(');
7567 assert(pat[s+1] == '?');
7571 while (s < pRExC_state->code_blocks->cb[n].end) {
7579 if (pat[s] == '\'' || pat[s] == '\\')
7584 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7586 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7592 Perl_re_printf( aTHX_
7593 "%sre-parsing pattern for runtime code:%s %s\n",
7594 PL_colors[4], PL_colors[5], newpat);
7597 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7603 PUSHSTACKi(PERLSI_REQUIRE);
7604 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7605 * parsing qr''; normally only q'' does this. It also alters
7607 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7608 SvREFCNT_dec_NN(sv);
7613 SV * const errsv = ERRSV;
7614 if (SvTRUE_NN(errsv))
7615 /* use croak_sv ? */
7616 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7618 assert(SvROK(qr_ref));
7620 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7621 /* the leaving below frees the tmp qr_ref.
7622 * Give qr a life of its own */
7630 if (!RExC_utf8 && SvUTF8(qr)) {
7631 /* first time through; the pattern got upgraded; save the
7632 * qr for the next time through */
7633 assert(!pRExC_state->runtime_code_qr);
7634 pRExC_state->runtime_code_qr = qr;
7639 /* extract any code blocks within the returned qr// */
7642 /* merge the main (r1) and run-time (r2) code blocks into one */
7644 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7645 struct reg_code_block *new_block, *dst;
7646 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7650 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7652 SvREFCNT_dec_NN(qr);
7656 if (!r1->code_blocks)
7657 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7659 r1c = r1->code_blocks->count;
7660 r2c = r2->code_blocks->count;
7662 Newx(new_block, r1c + r2c, struct reg_code_block);
7666 while (i1 < r1c || i2 < r2c) {
7667 struct reg_code_block *src;
7671 src = &r2->code_blocks->cb[i2++];
7675 src = &r1->code_blocks->cb[i1++];
7676 else if ( r1->code_blocks->cb[i1].start
7677 < r2->code_blocks->cb[i2].start)
7679 src = &r1->code_blocks->cb[i1++];
7680 assert(src->end < r2->code_blocks->cb[i2].start);
7683 assert( r1->code_blocks->cb[i1].start
7684 > r2->code_blocks->cb[i2].start);
7685 src = &r2->code_blocks->cb[i2++];
7687 assert(src->end < r1->code_blocks->cb[i1].start);
7690 assert(pat[src->start] == '(');
7691 assert(pat[src->end] == ')');
7692 dst->start = src->start;
7693 dst->end = src->end;
7694 dst->block = src->block;
7695 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7699 r1->code_blocks->count += r2c;
7700 Safefree(r1->code_blocks->cb);
7701 r1->code_blocks->cb = new_block;
7704 SvREFCNT_dec_NN(qr);
7710 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7711 struct reg_substr_datum *rsd,
7712 struct scan_data_substrs *sub,
7713 STRLEN longest_length)
7715 /* This is the common code for setting up the floating and fixed length
7716 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7717 * as to whether succeeded or not */
7721 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7722 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7724 if (! (longest_length
7725 || (eol /* Can't have SEOL and MULTI */
7726 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7728 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7729 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7734 /* copy the information about the longest from the reg_scan_data
7735 over to the program. */
7736 if (SvUTF8(sub->str)) {
7738 rsd->utf8_substr = sub->str;
7740 rsd->substr = sub->str;
7741 rsd->utf8_substr = NULL;
7743 /* end_shift is how many chars that must be matched that
7744 follow this item. We calculate it ahead of time as once the
7745 lookbehind offset is added in we lose the ability to correctly
7747 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7748 rsd->end_shift = ml - sub->min_offset
7750 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7752 + (SvTAIL(sub->str) != 0)
7756 t = (eol/* Can't have SEOL and MULTI */
7757 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7758 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7764 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7766 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7767 * properly wrapped with the right modifiers */
7769 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7770 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7771 != REGEX_DEPENDS_CHARSET);
7773 /* The caret is output if there are any defaults: if not all the STD
7774 * flags are set, or if no character set specifier is needed */
7776 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7778 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7779 == REG_RUN_ON_COMMENT_SEEN);
7780 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7781 >> RXf_PMf_STD_PMMOD_SHIFT);
7782 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7784 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7786 /* We output all the necessary flags; we never output a minus, as all
7787 * those are defaults, so are
7788 * covered by the caret */
7789 const STRLEN wraplen = pat_len + has_p + has_runon
7790 + has_default /* If needs a caret */
7791 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7793 /* If needs a character set specifier */
7794 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7795 + (sizeof("(?:)") - 1);
7797 PERL_ARGS_ASSERT_SET_REGEX_PV;
7799 /* make sure PL_bitcount bounds not exceeded */
7800 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7802 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7805 SvFLAGS(Rx) |= SVf_UTF8;
7808 /* If a default, cover it using the caret */
7810 *p++= DEFAULT_PAT_MOD;
7816 name = get_regex_charset_name(RExC_rx->extflags, &len);
7817 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7819 name = UNICODE_PAT_MODS;
7820 len = sizeof(UNICODE_PAT_MODS) - 1;
7822 Copy(name, p, len, char);
7826 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7829 while((ch = *fptr++)) {
7837 Copy(RExC_precomp, p, pat_len, char);
7838 assert ((RX_WRAPPED(Rx) - p) < 16);
7839 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7842 /* Adding a trailing \n causes this to compile properly:
7843 my $R = qr / A B C # D E/x; /($R)/
7844 Otherwise the parens are considered part of the comment */
7849 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7853 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7854 * regular expression into internal code.
7855 * The pattern may be passed either as:
7856 * a list of SVs (patternp plus pat_count)
7857 * a list of OPs (expr)
7858 * If both are passed, the SV list is used, but the OP list indicates
7859 * which SVs are actually pre-compiled code blocks
7861 * The SVs in the list have magic and qr overloading applied to them (and
7862 * the list may be modified in-place with replacement SVs in the latter
7865 * If the pattern hasn't changed from old_re, then old_re will be
7868 * eng is the current engine. If that engine has an op_comp method, then
7869 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7870 * do the initial concatenation of arguments and pass on to the external
7873 * If is_bare_re is not null, set it to a boolean indicating whether the
7874 * arg list reduced (after overloading) to a single bare regex which has
7875 * been returned (i.e. /$qr/).
7877 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7879 * pm_flags contains the PMf_* flags, typically based on those from the
7880 * pm_flags field of the related PMOP. Currently we're only interested in
7881 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7883 * For many years this code had an initial sizing pass that calculated
7884 * (sometimes incorrectly, leading to security holes) the size needed for the
7885 * compiled pattern. That was changed by commit
7886 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7887 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7888 * references to this sizing pass.
7890 * Now, an initial crude guess as to the size needed is made, based on the
7891 * length of the pattern. Patches welcome to improve that guess. That amount
7892 * of space is malloc'd and then immediately freed, and then clawed back node
7893 * by node. This design is to minimze, to the extent possible, memory churn
7894 * when doing the reallocs.
7896 * A separate parentheses counting pass may be needed in some cases.
7897 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7900 * The existence of a sizing pass necessitated design decisions that are no
7901 * longer needed. There are potential areas of simplification.
7903 * Beware that the optimization-preparation code in here knows about some
7904 * of the structure of the compiled regexp. [I'll say.]
7908 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7909 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7910 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7912 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7920 SV** new_patternp = patternp;
7922 /* these are all flags - maybe they should be turned
7923 * into a single int with different bit masks */
7924 I32 sawlookahead = 0;
7929 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7931 bool runtime_code = 0;
7933 RExC_state_t RExC_state;
7934 RExC_state_t * const pRExC_state = &RExC_state;
7935 #ifdef TRIE_STUDY_OPT
7936 /* search for "restudy" in this file for a detailed explanation */
7938 RExC_state_t copyRExC_state;
7940 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7942 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7944 DEBUG_r(if (!PL_colorset) reginitcolors());
7947 pRExC_state->warn_text = NULL;
7948 pRExC_state->unlexed_names = NULL;
7949 pRExC_state->code_blocks = NULL;
7952 *is_bare_re = FALSE;
7954 if (expr && (expr->op_type == OP_LIST ||
7955 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7956 /* allocate code_blocks if needed */
7960 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7961 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7962 ncode++; /* count of DO blocks */
7965 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7969 /* compile-time pattern with just OP_CONSTs and DO blocks */
7974 /* find how many CONSTs there are */
7977 if (expr->op_type == OP_CONST)
7980 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7981 if (o->op_type == OP_CONST)
7985 /* fake up an SV array */
7987 assert(!new_patternp);
7988 Newx(new_patternp, n, SV*);
7989 SAVEFREEPV(new_patternp);
7993 if (expr->op_type == OP_CONST)
7994 new_patternp[n] = cSVOPx_sv(expr);
7996 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7997 if (o->op_type == OP_CONST)
7998 new_patternp[n++] = cSVOPo_sv;
8003 DEBUG_PARSE_r(Perl_re_printf( aTHX_
8004 "Assembling pattern from %d elements%s\n", pat_count,
8005 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
8007 /* set expr to the first arg op */
8009 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
8010 && expr->op_type != OP_CONST)
8012 expr = cLISTOPx(expr)->op_first;
8013 assert( expr->op_type == OP_PUSHMARK
8014 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
8015 || expr->op_type == OP_PADRANGE);
8016 expr = OpSIBLING(expr);
8019 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
8020 expr, &recompile, NULL);
8022 /* handle bare (possibly after overloading) regex: foo =~ $re */
8027 if (SvTYPE(re) == SVt_REGEXP) {
8031 DEBUG_PARSE_r(Perl_re_printf( aTHX_
8032 "Precompiled pattern%s\n",
8033 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
8039 exp = SvPV_nomg(pat, plen);
8041 if (!eng->op_comp) {
8042 if ((SvUTF8(pat) && IN_BYTES)
8043 || SvGMAGICAL(pat) || SvAMAGIC(pat))
8045 /* make a temporary copy; either to convert to bytes,
8046 * or to avoid repeating get-magic / overloaded stringify */
8047 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
8048 (IN_BYTES ? 0 : SvUTF8(pat)));
8050 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
8053 /* ignore the utf8ness if the pattern is 0 length */
8054 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
8055 RExC_uni_semantics = 0;
8056 RExC_contains_locale = 0;
8057 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
8058 RExC_in_script_run = 0;
8059 RExC_study_started = 0;
8060 pRExC_state->runtime_code_qr = NULL;
8061 RExC_frame_head= NULL;
8062 RExC_frame_last= NULL;
8063 RExC_frame_count= 0;
8064 RExC_latest_warn_offset = 0;
8065 RExC_use_BRANCHJ = 0;
8066 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
8067 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
8068 RExC_total_parens = 0;
8069 RExC_open_parens = NULL;
8070 RExC_close_parens = NULL;
8071 RExC_paren_names = NULL;
8073 RExC_seen_d_op = FALSE;
8075 RExC_paren_name_list = NULL;
8079 RExC_mysv1= sv_newmortal();
8080 RExC_mysv2= sv_newmortal();
8084 SV *dsv= sv_newmortal();
8085 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
8086 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
8087 PL_colors[4], PL_colors[5], s);
8090 /* we jump here if we have to recompile, e.g., from upgrading the pattern
8093 if ((pm_flags & PMf_USE_RE_EVAL)
8094 /* this second condition covers the non-regex literal case,
8095 * i.e. $foo =~ '(?{})'. */
8096 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
8098 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
8101 /* return old regex if pattern hasn't changed */
8102 /* XXX: note in the below we have to check the flags as well as the
8105 * Things get a touch tricky as we have to compare the utf8 flag
8106 * independently from the compile flags. */
8110 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
8111 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
8112 && RX_PRECOMP(old_re)
8113 && RX_PRELEN(old_re) == plen
8114 && memEQ(RX_PRECOMP(old_re), exp, plen)
8115 && !runtime_code /* with runtime code, always recompile */ )
8118 SV *dsv= sv_newmortal();
8119 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
8120 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
8121 PL_colors[4], PL_colors[5], s);
8126 /* Allocate the pattern's SV */
8127 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
8128 RExC_rx = ReANY(Rx);
8129 if ( RExC_rx == NULL )
8130 FAIL("Regexp out of space");
8132 rx_flags = orig_rx_flags;
8134 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
8135 && initial_charset == REGEX_DEPENDS_CHARSET)
8138 /* Set to use unicode semantics if the pattern is in utf8 and has the
8139 * 'depends' charset specified, as it means unicode when utf8 */
8140 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
8141 RExC_uni_semantics = 1;
8144 RExC_pm_flags = pm_flags;
8147 assert(TAINTING_get || !TAINT_get);
8149 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
8151 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
8152 /* whoops, we have a non-utf8 pattern, whilst run-time code
8153 * got compiled as utf8. Try again with a utf8 pattern */
8154 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
8155 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
8159 assert(!pRExC_state->runtime_code_qr);
8165 RExC_in_lookaround = 0;
8166 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
8167 RExC_recode_x_to_native = 0;
8168 RExC_in_multi_char_class = 0;
8170 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
8171 RExC_precomp_end = RExC_end = exp + plen;
8173 RExC_whilem_seen = 0;
8175 RExC_recurse = NULL;
8176 RExC_study_chunk_recursed = NULL;
8177 RExC_study_chunk_recursed_bytes= 0;
8178 RExC_recurse_count = 0;
8179 RExC_sets_depth = 0;
8180 pRExC_state->code_index = 0;
8182 /* Initialize the string in the compiled pattern. This is so that there is
8183 * something to output if necessary */
8184 set_regex_pv(pRExC_state, Rx);
8187 Perl_re_printf( aTHX_
8188 "Starting parse and generation\n");
8190 RExC_lastparse=NULL;
8193 /* Allocate space and zero-initialize. Note, the two step process
8194 of zeroing when in debug mode, thus anything assigned has to
8195 happen after that */
8198 /* On the first pass of the parse, we guess how big this will be. Then
8199 * we grow in one operation to that amount and then give it back. As
8200 * we go along, we re-allocate what we need.
8202 * XXX Currently the guess is essentially that the pattern will be an
8203 * EXACT node with one byte input, one byte output. This is crude, and
8204 * better heuristics are welcome.
8206 * On any subsequent passes, we guess what we actually computed in the
8207 * latest earlier pass. Such a pass probably didn't complete so is
8208 * missing stuff. We could improve those guesses by knowing where the
8209 * parse stopped, and use the length so far plus apply the above
8210 * assumption to what's left. */
8211 RExC_size = STR_SZ(RExC_end - RExC_start);
8214 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
8215 if ( RExC_rxi == NULL )
8216 FAIL("Regexp out of space");
8218 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
8219 RXi_SET( RExC_rx, RExC_rxi );
8221 /* We start from 0 (over from 0 in the case this is a reparse. The first
8222 * node parsed will give back any excess memory we have allocated so far).
8226 /* non-zero initialization begins here */
8227 RExC_rx->engine= eng;
8228 RExC_rx->extflags = rx_flags;
8229 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
8231 if (pm_flags & PMf_IS_QR) {
8232 RExC_rxi->code_blocks = pRExC_state->code_blocks;
8233 if (RExC_rxi->code_blocks) {
8234 RExC_rxi->code_blocks->refcnt++;
8238 RExC_rx->intflags = 0;
8240 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
8241 RExC_parse_set(exp);
8243 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
8244 * code makes sure the final byte is an uncounted NUL. But should this
8245 * ever not be the case, lots of things could read beyond the end of the
8246 * buffer: loops like
8247 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
8248 * strchr(RExC_parse, "foo");
8249 * etc. So it is worth noting. */
8250 assert(*RExC_end == '\0');
8254 RExC_parens_buf_size = 0;
8255 RExC_emit_start = RExC_rxi->program;
8256 pRExC_state->code_index = 0;
8258 *((char*) RExC_emit_start) = (char) REG_MAGIC;
8259 RExC_emit = NODE_STEP_REGNODE;
8262 if (reg(pRExC_state, 0, &flags, 1)) {
8264 /* Success!, But we may need to redo the parse knowing how many parens
8265 * there actually are */
8266 if (IN_PARENS_PASS) {
8267 flags |= RESTART_PARSE;
8270 /* We have that number in RExC_npar */
8271 RExC_total_parens = RExC_npar;
8273 else if (! MUST_RESTART(flags)) {
8275 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
8278 /* Here, we either have success, or we have to redo the parse for some reason */
8279 if (MUST_RESTART(flags)) {
8281 /* It's possible to write a regexp in ascii that represents Unicode
8282 codepoints outside of the byte range, such as via \x{100}. If we
8283 detect such a sequence we have to convert the entire pattern to utf8
8284 and then recompile, as our sizing calculation will have been based
8285 on 1 byte == 1 character, but we will need to use utf8 to encode
8286 at least some part of the pattern, and therefore must convert the whole
8289 if (flags & NEED_UTF8) {
8291 /* We have stored the offset of the final warning output so far.
8292 * That must be adjusted. Any variant characters between the start
8293 * of the pattern and this warning count for 2 bytes in the final,
8294 * so just add them again */
8295 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
8296 RExC_latest_warn_offset +=
8297 variant_under_utf8_count((U8 *) exp, (U8 *) exp
8298 + RExC_latest_warn_offset);
8300 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
8301 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
8302 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
8305 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
8308 if (ALL_PARENS_COUNTED) {
8309 /* Make enough room for all the known parens, and zero it */
8310 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
8311 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
8312 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
8314 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
8315 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
8317 else { /* Parse did not complete. Reinitialize the parentheses
8319 RExC_total_parens = 0;
8320 if (RExC_open_parens) {
8321 Safefree(RExC_open_parens);
8322 RExC_open_parens = NULL;
8324 if (RExC_close_parens) {
8325 Safefree(RExC_close_parens);
8326 RExC_close_parens = NULL;
8330 /* Clean up what we did in this parse */
8331 SvREFCNT_dec_NN(RExC_rx_sv);
8336 /* Here, we have successfully parsed and generated the pattern's program
8337 * for the regex engine. We are ready to finish things up and look for
8340 /* Update the string to compile, with correct modifiers, etc */
8341 set_regex_pv(pRExC_state, Rx);
8343 RExC_rx->nparens = RExC_total_parens - 1;
8345 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
8346 if (RExC_whilem_seen > 15)
8347 RExC_whilem_seen = 15;
8350 Perl_re_printf( aTHX_
8351 "Required size %" IVdf " nodes\n", (IV)RExC_size);
8353 RExC_lastparse=NULL;
8356 SetProgLen(RExC_rxi,RExC_size);
8358 DEBUG_DUMP_PRE_OPTIMIZE_r({
8359 SV * const sv = sv_newmortal();
8360 RXi_GET_DECL(RExC_rx, ri);
8362 Perl_re_printf( aTHX_ "Program before optimization:\n");
8364 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8369 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
8372 /* XXXX To minimize changes to RE engine we always allocate
8373 3-units-long substrs field. */
8374 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8375 if (RExC_recurse_count) {
8376 Newx(RExC_recurse, RExC_recurse_count, regnode *);
8377 SAVEFREEPV(RExC_recurse);
8380 if (RExC_seen & REG_RECURSE_SEEN) {
8381 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8382 * So its 1 if there are no parens. */
8383 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8384 ((RExC_total_parens & 0x07) != 0);
8385 Newx(RExC_study_chunk_recursed,
8386 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8387 SAVEFREEPV(RExC_study_chunk_recursed);
8391 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8393 RExC_study_chunk_recursed_count= 0;
8395 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8396 if (RExC_study_chunk_recursed) {
8397 Zero(RExC_study_chunk_recursed,
8398 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8402 #ifdef TRIE_STUDY_OPT
8403 /* search for "restudy" in this file for a detailed explanation */
8405 StructCopy(&zero_scan_data, &data, scan_data_t);
8406 copyRExC_state = RExC_state;
8409 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8411 RExC_state = copyRExC_state;
8412 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8413 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8415 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8416 StructCopy(&zero_scan_data, &data, scan_data_t);
8419 StructCopy(&zero_scan_data, &data, scan_data_t);
8422 /* Dig out information for optimizations. */
8423 RExC_rx->extflags = RExC_flags; /* was pm_op */
8424 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8427 SvUTF8_on(Rx); /* Unicode in it? */
8428 RExC_rxi->regstclass = NULL;
8429 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8430 RExC_rx->intflags |= PREGf_NAUGHTY;
8431 scan = RExC_rxi->program + 1; /* First BRANCH. */
8433 /* testing for BRANCH here tells us whether there is "must appear"
8434 data in the pattern. If there is then we can use it for optimisations */
8435 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8437 SSize_t fake_deltap;
8438 STRLEN longest_length[2];
8439 regnode_ssc ch_class; /* pointed to by data */
8441 SSize_t last_close = 0; /* pointed to by data */
8442 regnode *first= scan;
8443 regnode *first_next= regnext(first);
8444 regnode *last_close_op= NULL;
8448 * Skip introductions and multiplicators >= 1
8449 * so that we can extract the 'meat' of the pattern that must
8450 * match in the large if() sequence following.
8451 * NOTE that EXACT is NOT covered here, as it is normally
8452 * picked up by the optimiser separately.
8454 * This is unfortunate as the optimiser isnt handling lookahead
8455 * properly currently.
8458 while ((OP(first) == OPEN && (sawopen = 1)) ||
8459 /* An OR of *one* alternative - should not happen now. */
8460 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8461 /* for now we can't handle lookbehind IFMATCH*/
8462 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8463 (OP(first) == PLUS) ||
8464 (OP(first) == MINMOD) ||
8465 /* An {n,m} with n>0 */
8466 (REGNODE_TYPE(OP(first)) == CURLY && ARG1(first) > 0) ||
8467 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END ))
8470 * the only op that could be a regnode is PLUS, all the rest
8471 * will be regnode_1 or regnode_2.
8473 * (yves doesn't think this is true)
8475 if (OP(first) == PLUS)
8478 if (OP(first) == MINMOD)
8481 first = REGNODE_AFTER(first);
8482 first_next= regnext(first);
8485 /* Starting-point info. */
8487 DEBUG_PEEP("first:", first, 0, 0);
8488 /* Ignore EXACT as we deal with it later. */
8489 if (REGNODE_TYPE(OP(first)) == EXACT) {
8490 if (! isEXACTFish(OP(first))) {
8491 NOOP; /* Empty, get anchored substr later. */
8494 RExC_rxi->regstclass = first;
8497 else if (REGNODE_TYPE(OP(first)) == TRIE &&
8498 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8500 /* this can happen only on restudy
8501 * Search for "restudy" in this file to find
8502 * a comment with details. */
8503 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8506 else if (REGNODE_SIMPLE(OP(first)))
8507 RExC_rxi->regstclass = first;
8508 else if (REGNODE_TYPE(OP(first)) == BOUND ||
8509 REGNODE_TYPE(OP(first)) == NBOUND)
8510 RExC_rxi->regstclass = first;
8511 else if (REGNODE_TYPE(OP(first)) == BOL) {
8512 RExC_rx->intflags |= (OP(first) == MBOL
8515 first = REGNODE_AFTER(first);
8518 else if (OP(first) == GPOS) {
8519 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8520 first = REGNODE_AFTER_type(first,tregnode_GPOS);
8523 else if ((!sawopen || !RExC_sawback) &&
8525 (OP(first) == STAR &&
8526 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
8527 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8529 /* turn .* into ^.* with an implied $*=1 */
8531 (OP(REGNODE_AFTER(first)) == REG_ANY)
8534 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8535 first = REGNODE_AFTER(first);
8538 if (sawplus && !sawminmod && !sawlookahead
8539 && (!sawopen || !RExC_sawback)
8540 && !pRExC_state->code_blocks) /* May examine pos and $& */
8541 /* x+ must match at the 1st pos of run of x's */
8542 RExC_rx->intflags |= PREGf_SKIP;
8544 /* Scan is after the zeroth branch, first is atomic matcher. */
8545 #ifdef TRIE_STUDY_OPT
8546 /* search for "restudy" in this file for a detailed explanation */
8549 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8550 (IV)(first - scan + 1))
8554 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8555 (IV)(first - scan + 1))
8561 * If there's something expensive in the r.e., find the
8562 * longest literal string that must appear and make it the
8563 * regmust. Resolve ties in favor of later strings, since
8564 * the regstart check works with the beginning of the r.e.
8565 * and avoiding duplication strengthens checking. Not a
8566 * strong reason, but sufficient in the absence of others.
8567 * [Now we resolve ties in favor of the earlier string if
8568 * it happens that c_offset_min has been invalidated, since the
8569 * earlier string may buy us something the later one won't.]
8572 data.substrs[0].str = newSVpvs("");
8573 data.substrs[1].str = newSVpvs("");
8574 data.last_found = newSVpvs("");
8575 data.cur_is_floating = 0; /* initially any found substring is fixed */
8576 ENTER_with_name("study_chunk");
8577 SAVEFREESV(data.substrs[0].str);
8578 SAVEFREESV(data.substrs[1].str);
8579 SAVEFREESV(data.last_found);
8581 if (!RExC_rxi->regstclass) {
8582 ssc_init(pRExC_state, &ch_class);
8583 data.start_class = &ch_class;
8584 stclass_flag = SCF_DO_STCLASS_AND;
8585 } else /* XXXX Check for BOUND? */
8587 data.last_closep = &last_close;
8588 data.last_close_opp = &last_close_op;
8592 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8593 * (NO top level branches)
8595 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
8596 scan + RExC_size, /* Up to end */
8598 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8599 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8601 /* search for "restudy" in this file for a detailed explanation
8602 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
8605 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8608 if ( RExC_total_parens == 1 && !data.cur_is_floating
8609 && data.last_start_min == 0 && data.last_end > 0
8610 && !RExC_seen_zerolen
8611 && !(RExC_seen & REG_VERBARG_SEEN)
8612 && !(RExC_seen & REG_GPOS_SEEN)
8614 RExC_rx->extflags |= RXf_CHECK_ALL;
8616 scan_commit(pRExC_state, &data,&minlen, 0);
8619 /* XXX this is done in reverse order because that's the way the
8620 * code was before it was parameterised. Don't know whether it
8621 * actually needs doing in reverse order. DAPM */
8622 for (i = 1; i >= 0; i--) {
8623 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8626 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8627 && data.substrs[0].min_offset
8628 == data.substrs[1].min_offset
8629 && SvCUR(data.substrs[0].str)
8630 == SvCUR(data.substrs[1].str)
8632 && S_setup_longest (aTHX_ pRExC_state,
8633 &(RExC_rx->substrs->data[i]),
8637 RExC_rx->substrs->data[i].min_offset =
8638 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8640 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8641 /* Don't offset infinity */
8642 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8643 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8644 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8647 RExC_rx->substrs->data[i].substr = NULL;
8648 RExC_rx->substrs->data[i].utf8_substr = NULL;
8649 longest_length[i] = 0;
8653 LEAVE_with_name("study_chunk");
8655 if (RExC_rxi->regstclass
8656 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8657 RExC_rxi->regstclass = NULL;
8659 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8660 || RExC_rx->substrs->data[0].min_offset)
8662 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8663 && is_ssc_worth_it(pRExC_state, data.start_class))
8665 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8667 ssc_finalize(pRExC_state, data.start_class);
8669 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8670 StructCopy(data.start_class,
8671 (regnode_ssc*)RExC_rxi->data->data[n],
8673 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8674 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8675 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8676 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8677 Perl_re_printf( aTHX_
8678 "synthetic stclass \"%s\".\n",
8679 SvPVX_const(sv));});
8680 data.start_class = NULL;
8683 /* A temporary algorithm prefers floated substr to fixed one of
8684 * same length to dig more info. */
8685 i = (longest_length[0] <= longest_length[1]);
8686 RExC_rx->substrs->check_ix = i;
8687 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8688 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8689 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8690 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8691 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8692 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8693 RExC_rx->intflags |= PREGf_NOSCAN;
8695 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8696 RExC_rx->extflags |= RXf_USE_INTUIT;
8697 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8698 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8701 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8702 if ( (STRLEN)minlen < longest_length[1] )
8703 minlen= longest_length[1];
8704 if ( (STRLEN)minlen < longest_length[0] )
8705 minlen= longest_length[0];
8709 /* Several toplevels. Best we can is to set minlen. */
8710 SSize_t fake_deltap;
8711 regnode_ssc ch_class;
8712 SSize_t last_close = 0;
8713 regnode *last_close_op = NULL;
8715 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8717 scan = RExC_rxi->program + 1;
8718 ssc_init(pRExC_state, &ch_class);
8719 data.start_class = &ch_class;
8720 data.last_closep = &last_close;
8721 data.last_close_opp = &last_close_op;
8725 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8726 * (patterns WITH top level branches)
8728 minlen = study_chunk(pRExC_state,
8729 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
8730 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8731 ? SCF_TRIE_DOING_RESTUDY
8734 /* search for "restudy" in this file for a detailed explanation
8735 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
8737 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8739 RExC_rx->check_substr = NULL;
8740 RExC_rx->check_utf8 = NULL;
8741 RExC_rx->substrs->data[0].substr = NULL;
8742 RExC_rx->substrs->data[0].utf8_substr = NULL;
8743 RExC_rx->substrs->data[1].substr = NULL;
8744 RExC_rx->substrs->data[1].utf8_substr = NULL;
8746 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8747 && is_ssc_worth_it(pRExC_state, data.start_class))
8749 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8751 ssc_finalize(pRExC_state, data.start_class);
8753 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8754 StructCopy(data.start_class,
8755 (regnode_ssc*)RExC_rxi->data->data[n],
8757 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8758 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8759 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8760 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8761 Perl_re_printf( aTHX_
8762 "synthetic stclass \"%s\".\n",
8763 SvPVX_const(sv));});
8764 data.start_class = NULL;
8768 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8769 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8770 RExC_rx->maxlen = REG_INFTY;
8773 RExC_rx->maxlen = RExC_maxlen;
8776 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8777 the "real" pattern. */
8779 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8780 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8782 RExC_rx->minlenret = minlen;
8783 if (RExC_rx->minlen < minlen)
8784 RExC_rx->minlen = minlen;
8786 if (RExC_seen & REG_RECURSE_SEEN ) {
8787 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8788 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8790 if (RExC_seen & REG_GPOS_SEEN)
8791 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8792 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8793 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8795 if (pRExC_state->code_blocks)
8796 RExC_rx->extflags |= RXf_EVAL_SEEN;
8797 if (RExC_seen & REG_VERBARG_SEEN)
8799 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8800 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8802 if (RExC_seen & REG_CUTGROUP_SEEN)
8803 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8804 if (pm_flags & PMf_USE_RE_EVAL)
8805 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8806 if (RExC_paren_names)
8807 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8809 RXp_PAREN_NAMES(RExC_rx) = NULL;
8811 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8812 * so it can be used in pp.c */
8813 if (RExC_rx->intflags & PREGf_ANCH)
8814 RExC_rx->extflags |= RXf_IS_ANCHORED;
8818 /* this is used to identify "special" patterns that might result
8819 * in Perl NOT calling the regex engine and instead doing the match "itself",
8820 * particularly special cases in split//. By having the regex compiler
8821 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8822 * we avoid weird issues with equivalent patterns resulting in different behavior,
8823 * AND we allow non Perl engines to get the same optimizations by the setting the
8824 * flags appropriately - Yves */
8825 regnode *first = RExC_rxi->program + 1;
8827 regnode *next = NULL;
8829 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
8830 next = REGNODE_AFTER(first);
8833 /* It's safe to read through *next only if OP(first) is a regop of
8834 * the right type (not EXACT, for example).
8836 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
8837 RExC_rx->extflags |= RXf_NULL;
8838 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8839 /* when fop is SBOL first->flags will be true only when it was
8840 * produced by parsing /\A/, and not when parsing /^/. This is
8841 * very important for the split code as there we want to
8842 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8843 * See rt #122761 for more details. -- Yves */
8844 RExC_rx->extflags |= RXf_START_ONLY;
8845 else if (fop == PLUS
8846 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
8847 && OP(regnext(first)) == END)
8848 RExC_rx->extflags |= RXf_WHITE;
8849 else if ( RExC_rx->extflags & RXf_SPLIT
8850 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
8851 && STR_LEN(first) == 1
8852 && *(STRING(first)) == ' '
8853 && OP(regnext(first)) == END )
8854 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8858 if (RExC_contains_locale) {
8859 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8863 if (RExC_paren_names) {
8864 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8865 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8866 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8869 RExC_rxi->name_list_idx = 0;
8871 while ( RExC_recurse_count > 0 ) {
8872 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8874 * This data structure is set up in study_chunk() and is used
8875 * to calculate the distance between a GOSUB regopcode and
8876 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8879 * If for some reason someone writes code that optimises
8880 * away a GOSUB opcode then the assert should be changed to
8881 * an if(scan) to guard the ARG2L_SET() - Yves
8884 assert(scan && OP(scan) == GOSUB);
8885 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8888 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8889 /* assume we don't need to swap parens around before we match */
8891 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8892 (unsigned long)RExC_study_chunk_recursed_count);
8896 Perl_re_printf( aTHX_ "Final program:\n");
8900 if (RExC_open_parens) {
8901 Safefree(RExC_open_parens);
8902 RExC_open_parens = NULL;
8904 if (RExC_close_parens) {
8905 Safefree(RExC_close_parens);
8906 RExC_close_parens = NULL;
8910 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8911 * by setting the regexp SV to readonly-only instead. If the
8912 * pattern's been recompiled, the USEDness should remain. */
8913 if (old_re && SvREADONLY(old_re))
8921 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8924 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8926 PERL_UNUSED_ARG(value);
8928 if (flags & RXapif_FETCH) {
8929 return reg_named_buff_fetch(rx, key, flags);
8930 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8931 Perl_croak_no_modify();
8933 } else if (flags & RXapif_EXISTS) {
8934 return reg_named_buff_exists(rx, key, flags)
8937 } else if (flags & RXapif_REGNAMES) {
8938 return reg_named_buff_all(rx, flags);
8939 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8940 return reg_named_buff_scalar(rx, flags);
8942 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8948 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8951 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8952 PERL_UNUSED_ARG(lastkey);
8954 if (flags & RXapif_FIRSTKEY)
8955 return reg_named_buff_firstkey(rx, flags);
8956 else if (flags & RXapif_NEXTKEY)
8957 return reg_named_buff_nextkey(rx, flags);
8959 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8966 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8970 struct regexp *const rx = ReANY(r);
8972 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8974 if (rx && RXp_PAREN_NAMES(rx)) {
8975 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8978 SV* sv_dat=HeVAL(he_str);
8979 I32 *nums=(I32*)SvPVX(sv_dat);
8980 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL;
8981 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8982 if ((I32)(rx->nparens) >= nums[i]
8983 && rx->offs[nums[i]].start != -1
8984 && rx->offs[nums[i]].end != -1)
8987 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8992 ret = newSV_type(SVt_NULL);
8995 av_push_simple(retarray, ret);
8998 return newRV_noinc(MUTABLE_SV(retarray));
9005 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
9008 struct regexp *const rx = ReANY(r);
9010 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
9012 if (rx && RXp_PAREN_NAMES(rx)) {
9013 if (flags & RXapif_ALL) {
9014 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
9016 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
9018 SvREFCNT_dec_NN(sv);
9030 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
9032 struct regexp *const rx = ReANY(r);
9034 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
9036 if ( rx && RXp_PAREN_NAMES(rx) ) {
9037 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
9039 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
9046 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
9048 struct regexp *const rx = ReANY(r);
9049 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9051 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
9053 if (rx && RXp_PAREN_NAMES(rx)) {
9054 HV *hv = RXp_PAREN_NAMES(rx);
9056 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
9059 SV* sv_dat = HeVAL(temphe);
9060 I32 *nums = (I32*)SvPVX(sv_dat);
9061 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
9062 if ((I32)(rx->lastparen) >= nums[i] &&
9063 rx->offs[nums[i]].start != -1 &&
9064 rx->offs[nums[i]].end != -1)
9070 if (parno || flags & RXapif_ALL) {
9071 return newSVhek(HeKEY_hek(temphe));
9079 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
9084 struct regexp *const rx = ReANY(r);
9086 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
9088 if (rx && RXp_PAREN_NAMES(rx)) {
9089 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
9090 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
9091 } else if (flags & RXapif_ONE) {
9092 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
9093 av = MUTABLE_AV(SvRV(ret));
9094 length = av_count(av);
9095 SvREFCNT_dec_NN(ret);
9096 return newSViv(length);
9098 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
9103 return &PL_sv_undef;
9107 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
9109 struct regexp *const rx = ReANY(r);
9112 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
9114 if (rx && RXp_PAREN_NAMES(rx)) {
9115 HV *hv= RXp_PAREN_NAMES(rx);
9117 (void)hv_iterinit(hv);
9118 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
9121 SV* sv_dat = HeVAL(temphe);
9122 I32 *nums = (I32*)SvPVX(sv_dat);
9123 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
9124 if ((I32)(rx->lastparen) >= nums[i] &&
9125 rx->offs[nums[i]].start != -1 &&
9126 rx->offs[nums[i]].end != -1)
9132 if (parno || flags & RXapif_ALL) {
9133 av_push(av, newSVhek(HeKEY_hek(temphe)));
9138 return newRV_noinc(MUTABLE_SV(av));
9142 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
9145 struct regexp *const rx = ReANY(r);
9151 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
9153 if ( n == RX_BUFF_IDX_CARET_PREMATCH
9154 || n == RX_BUFF_IDX_CARET_FULLMATCH
9155 || n == RX_BUFF_IDX_CARET_POSTMATCH
9158 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
9160 /* on something like
9163 * the KEEPCOPY is set on the PMOP rather than the regex */
9164 if (PL_curpm && r == PM_GETRE(PL_curpm))
9165 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
9174 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
9175 /* no need to distinguish between them any more */
9176 n = RX_BUFF_IDX_FULLMATCH;
9178 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
9179 && rx->offs[0].start != -1)
9181 /* $`, ${^PREMATCH} */
9182 i = rx->offs[0].start;
9186 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
9187 && rx->offs[0].end != -1)
9189 /* $', ${^POSTMATCH} */
9190 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
9191 i = rx->sublen + rx->suboffset - rx->offs[0].end;
9194 if (inRANGE(n, 0, (I32)rx->nparens) &&
9195 (s1 = rx->offs[n].start) != -1 &&
9196 (t1 = rx->offs[n].end) != -1)
9198 /* $&, ${^MATCH}, $1 ... */
9200 s = rx->subbeg + s1 - rx->suboffset;
9205 assert(s >= rx->subbeg);
9206 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
9208 #ifdef NO_TAINT_SUPPORT
9209 sv_setpvn(sv, s, i);
9211 const int oldtainted = TAINT_get;
9213 sv_setpvn(sv, s, i);
9214 TAINT_set(oldtainted);
9216 if (RXp_MATCH_UTF8(rx))
9221 if (RXp_MATCH_TAINTED(rx)) {
9222 if (SvTYPE(sv) >= SVt_PVMG) {
9223 MAGIC* const mg = SvMAGIC(sv);
9226 SvMAGIC_set(sv, mg->mg_moremagic);
9228 if ((mgt = SvMAGIC(sv))) {
9229 mg->mg_moremagic = mgt;
9230 SvMAGIC_set(sv, mg);
9247 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
9248 SV const * const value)
9250 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
9252 PERL_UNUSED_ARG(rx);
9253 PERL_UNUSED_ARG(paren);
9254 PERL_UNUSED_ARG(value);
9257 Perl_croak_no_modify();
9261 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
9264 struct regexp *const rx = ReANY(r);
9268 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
9270 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
9271 || paren == RX_BUFF_IDX_CARET_FULLMATCH
9272 || paren == RX_BUFF_IDX_CARET_POSTMATCH
9275 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
9277 /* on something like
9280 * the KEEPCOPY is set on the PMOP rather than the regex */
9281 if (PL_curpm && r == PM_GETRE(PL_curpm))
9282 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
9288 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
9290 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
9291 case RX_BUFF_IDX_PREMATCH: /* $` */
9292 if (rx->offs[0].start != -1) {
9293 i = rx->offs[0].start;
9302 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
9303 case RX_BUFF_IDX_POSTMATCH: /* $' */
9304 if (rx->offs[0].end != -1) {
9305 i = rx->sublen - rx->offs[0].end;
9307 s1 = rx->offs[0].end;
9314 default: /* $& / ${^MATCH}, $1, $2, ... */
9315 if (paren <= (I32)rx->nparens &&
9316 (s1 = rx->offs[paren].start) != -1 &&
9317 (t1 = rx->offs[paren].end) != -1)
9323 if (ckWARN(WARN_UNINITIALIZED))
9324 report_uninit((const SV *)sv);
9329 if (i > 0 && RXp_MATCH_UTF8(rx)) {
9330 const char * const s = rx->subbeg - rx->suboffset + s1;
9335 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
9342 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
9344 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
9345 PERL_UNUSED_ARG(rx);
9349 return newSVpvs("Regexp");
9352 /* Scans the name of a named buffer from the pattern.
9353 * If flags is REG_RSN_RETURN_NULL returns null.
9354 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
9355 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
9356 * to the parsed name as looked up in the RExC_paren_names hash.
9357 * If there is an error throws a vFAIL().. type exception.
9360 #define REG_RSN_RETURN_NULL 0
9361 #define REG_RSN_RETURN_NAME 1
9362 #define REG_RSN_RETURN_DATA 2
9365 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
9367 char *name_start = RExC_parse;
9370 PERL_ARGS_ASSERT_REG_SCAN_NAME;
9372 assert (RExC_parse <= RExC_end);
9373 if (RExC_parse == RExC_end) NOOP;
9374 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
9375 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
9376 * using do...while */
9379 RExC_parse_inc_utf8();
9380 } while ( RExC_parse < RExC_end
9381 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9384 RExC_parse_inc_by(1);
9385 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9387 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
9389 vFAIL("Group name must start with a non-digit word character");
9391 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9392 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9393 if ( flags == REG_RSN_RETURN_NAME)
9395 else if (flags==REG_RSN_RETURN_DATA) {
9398 if ( ! sv_name ) /* should not happen*/
9399 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9400 if (RExC_paren_names)
9401 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9403 sv_dat = HeVAL(he_str);
9404 if ( ! sv_dat ) { /* Didn't find group */
9406 /* It might be a forward reference; we can't fail until we
9407 * know, by completing the parse to get all the groups, and
9409 if (ALL_PARENS_COUNTED) {
9410 vFAIL("Reference to nonexistent named group");
9413 REQUIRE_PARENS_PASS;
9419 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9420 (unsigned long) flags);
9423 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9424 if (RExC_lastparse!=RExC_parse) { \
9425 Perl_re_printf( aTHX_ "%s", \
9426 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9427 RExC_end - RExC_parse, 16, \
9429 PERL_PV_ESCAPE_UNI_DETECT | \
9430 PERL_PV_PRETTY_ELLIPSES | \
9431 PERL_PV_PRETTY_LTGT | \
9432 PERL_PV_ESCAPE_RE | \
9433 PERL_PV_PRETTY_EXACTSIZE \
9437 Perl_re_printf( aTHX_ "%16s",""); \
9439 if (RExC_lastnum!=RExC_emit) \
9440 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9442 Perl_re_printf( aTHX_ "|%4s",""); \
9443 Perl_re_printf( aTHX_ "|%*s%-4s", \
9444 (int)((depth*2)), "", \
9447 RExC_lastnum=RExC_emit; \
9448 RExC_lastparse=RExC_parse; \
9453 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9454 DEBUG_PARSE_MSG((funcname)); \
9455 Perl_re_printf( aTHX_ "%4s","\n"); \
9457 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9458 DEBUG_PARSE_MSG((funcname)); \
9459 Perl_re_printf( aTHX_ fmt "\n",args); \
9462 /* This section of code defines the inversion list object and its methods. The
9463 * interfaces are highly subject to change, so as much as possible is static to
9464 * this file. An inversion list is here implemented as a malloc'd C UV array
9465 * as an SVt_INVLIST scalar.
9467 * An inversion list for Unicode is an array of code points, sorted by ordinal
9468 * number. Each element gives the code point that begins a range that extends
9469 * up-to but not including the code point given by the next element. The final
9470 * element gives the first code point of a range that extends to the platform's
9471 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9472 * ...) give ranges whose code points are all in the inversion list. We say
9473 * that those ranges are in the set. The odd-numbered elements give ranges
9474 * whose code points are not in the inversion list, and hence not in the set.
9475 * Thus, element [0] is the first code point in the list. Element [1]
9476 * is the first code point beyond that not in the list; and element [2] is the
9477 * first code point beyond that that is in the list. In other words, the first
9478 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9479 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9480 * all code points in that range are not in the inversion list. The third
9481 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9482 * list, and so forth. Thus every element whose index is divisible by two
9483 * gives the beginning of a range that is in the list, and every element whose
9484 * index is not divisible by two gives the beginning of a range not in the
9485 * list. If the final element's index is divisible by two, the inversion list
9486 * extends to the platform's infinity; otherwise the highest code point in the
9487 * inversion list is the contents of that element minus 1.
9489 * A range that contains just a single code point N will look like
9491 * invlist[i+1] == N+1
9493 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9494 * impossible to represent, so element [i+1] is omitted. The single element
9496 * invlist[0] == UV_MAX
9497 * contains just UV_MAX, but is interpreted as matching to infinity.
9499 * Taking the complement (inverting) an inversion list is quite simple, if the
9500 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9501 * This implementation reserves an element at the beginning of each inversion
9502 * list to always contain 0; there is an additional flag in the header which
9503 * indicates if the list begins at the 0, or is offset to begin at the next
9504 * element. This means that the inversion list can be inverted without any
9505 * copying; just flip the flag.
9507 * More about inversion lists can be found in "Unicode Demystified"
9508 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9510 * The inversion list data structure is currently implemented as an SV pointing
9511 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9512 * array of UV whose memory management is automatically handled by the existing
9513 * facilities for SV's.
9515 * Some of the methods should always be private to the implementation, and some
9516 * should eventually be made public */
9518 /* The header definitions are in F<invlist_inline.h> */
9520 #ifndef PERL_IN_XSUB_RE
9522 PERL_STATIC_INLINE UV*
9523 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9525 /* Returns a pointer to the first element in the inversion list's array.
9526 * This is called upon initialization of an inversion list. Where the
9527 * array begins depends on whether the list has the code point U+0000 in it
9528 * or not. The other parameter tells it whether the code that follows this
9529 * call is about to put a 0 in the inversion list or not. The first
9530 * element is either the element reserved for 0, if TRUE, or the element
9531 * after it, if FALSE */
9533 bool* offset = get_invlist_offset_addr(invlist);
9534 UV* zero_addr = (UV *) SvPVX(invlist);
9536 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9539 assert(! _invlist_len(invlist));
9543 /* 1^1 = 0; 1^0 = 1 */
9544 *offset = 1 ^ will_have_0;
9545 return zero_addr + *offset;
9549 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9551 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9552 * steals the list from 'src', so 'src' is made to have a NULL list. This
9553 * is similar to what SvSetMagicSV() would do, if it were implemented on
9554 * inversion lists, though this routine avoids a copy */
9556 const UV src_len = _invlist_len(src);
9557 const bool src_offset = *get_invlist_offset_addr(src);
9558 const STRLEN src_byte_len = SvLEN(src);
9559 char * array = SvPVX(src);
9561 #ifndef NO_TAINT_SUPPORT
9562 const int oldtainted = TAINT_get;
9565 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9567 assert(is_invlist(src));
9568 assert(is_invlist(dest));
9569 assert(! invlist_is_iterating(src));
9570 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9572 /* Make sure it ends in the right place with a NUL, as our inversion list
9573 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9575 array[src_byte_len - 1] = '\0';
9577 TAINT_NOT; /* Otherwise it breaks */
9578 sv_usepvn_flags(dest,
9582 /* This flag is documented to cause a copy to be avoided */
9583 SV_HAS_TRAILING_NUL);
9584 TAINT_set(oldtainted);
9589 /* Finish up copying over the other fields in an inversion list */
9590 *get_invlist_offset_addr(dest) = src_offset;
9591 invlist_set_len(dest, src_len, src_offset);
9592 *get_invlist_previous_index_addr(dest) = 0;
9593 invlist_iterfinish(dest);
9596 PERL_STATIC_INLINE IV*
9597 S_get_invlist_previous_index_addr(SV* invlist)
9599 /* Return the address of the IV that is reserved to hold the cached index
9601 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9603 assert(is_invlist(invlist));
9605 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9608 PERL_STATIC_INLINE IV
9609 S_invlist_previous_index(SV* const invlist)
9611 /* Returns cached index of previous search */
9613 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9615 return *get_invlist_previous_index_addr(invlist);
9618 PERL_STATIC_INLINE void
9619 S_invlist_set_previous_index(SV* const invlist, const IV index)
9621 /* Caches <index> for later retrieval */
9623 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9625 assert(index == 0 || index < (int) _invlist_len(invlist));
9627 *get_invlist_previous_index_addr(invlist) = index;
9630 PERL_STATIC_INLINE void
9631 S_invlist_trim(SV* invlist)
9633 /* Free the not currently-being-used space in an inversion list */
9635 /* But don't free up the space needed for the 0 UV that is always at the
9636 * beginning of the list, nor the trailing NUL */
9637 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9639 PERL_ARGS_ASSERT_INVLIST_TRIM;
9641 assert(is_invlist(invlist));
9643 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9646 PERL_STATIC_INLINE void
9647 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9649 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9651 assert(is_invlist(invlist));
9653 invlist_set_len(invlist, 0, 0);
9654 invlist_trim(invlist);
9657 #endif /* ifndef PERL_IN_XSUB_RE */
9659 PERL_STATIC_INLINE bool
9660 S_invlist_is_iterating(const SV* const invlist)
9662 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9664 /* get_invlist_iter_addr()'s sv is non-const only because it returns a
9665 * value that can be used to modify the invlist, it doesn't modify the
9667 return *(get_invlist_iter_addr((SV*)invlist)) < (STRLEN) UV_MAX;
9670 #ifndef PERL_IN_XSUB_RE
9672 PERL_STATIC_INLINE UV
9673 S_invlist_max(const SV* const invlist)
9675 /* Returns the maximum number of elements storable in the inversion list's
9676 * array, without having to realloc() */
9678 PERL_ARGS_ASSERT_INVLIST_MAX;
9680 assert(is_invlist(invlist));
9682 /* Assumes worst case, in which the 0 element is not counted in the
9683 * inversion list, so subtracts 1 for that */
9684 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9685 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9686 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9690 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9692 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9694 /* First 1 is in case the zero element isn't in the list; second 1 is for
9696 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9697 invlist_set_len(invlist, 0, 0);
9699 /* Force iterinit() to be used to get iteration to work */
9700 invlist_iterfinish(invlist);
9702 *get_invlist_previous_index_addr(invlist) = 0;
9703 SvPOK_on(invlist); /* This allows B to extract the PV */
9707 Perl__new_invlist(pTHX_ IV initial_size)
9710 /* Return a pointer to a newly constructed inversion list, with enough
9711 * space to store 'initial_size' elements. If that number is negative, a
9712 * system default is used instead */
9716 if (initial_size < 0) {
9720 new_list = newSV_type(SVt_INVLIST);
9721 initialize_invlist_guts(new_list, initial_size);
9727 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9729 /* Return a pointer to a newly constructed inversion list, initialized to
9730 * point to <list>, which has to be in the exact correct inversion list
9731 * form, including internal fields. Thus this is a dangerous routine that
9732 * should not be used in the wrong hands. The passed in 'list' contains
9733 * several header fields at the beginning that are not part of the
9734 * inversion list body proper */
9736 const STRLEN length = (STRLEN) list[0];
9737 const UV version_id = list[1];
9738 const bool offset = cBOOL(list[2]);
9739 #define HEADER_LENGTH 3
9740 /* If any of the above changes in any way, you must change HEADER_LENGTH
9741 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9742 * perl -E 'say int(rand 2**31-1)'
9744 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9745 data structure type, so that one being
9746 passed in can be validated to be an
9747 inversion list of the correct vintage.
9750 SV* invlist = newSV_type(SVt_INVLIST);
9752 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9754 if (version_id != INVLIST_VERSION_ID) {
9755 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9758 /* The generated array passed in includes header elements that aren't part
9759 * of the list proper, so start it just after them */
9760 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9762 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9763 shouldn't touch it */
9765 *(get_invlist_offset_addr(invlist)) = offset;
9767 /* The 'length' passed to us is the physical number of elements in the
9768 * inversion list. But if there is an offset the logical number is one
9770 invlist_set_len(invlist, length - offset, offset);
9772 invlist_set_previous_index(invlist, 0);
9774 /* Initialize the iteration pointer. */
9775 invlist_iterfinish(invlist);
9777 SvREADONLY_on(invlist);
9784 S__append_range_to_invlist(pTHX_ SV* const invlist,
9785 const UV start, const UV end)
9787 /* Subject to change or removal. Append the range from 'start' to 'end' at
9788 * the end of the inversion list. The range must be above any existing
9792 UV max = invlist_max(invlist);
9793 UV len = _invlist_len(invlist);
9796 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9798 if (len == 0) { /* Empty lists must be initialized */
9799 offset = start != 0;
9800 array = _invlist_array_init(invlist, ! offset);
9803 /* Here, the existing list is non-empty. The current max entry in the
9804 * list is generally the first value not in the set, except when the
9805 * set extends to the end of permissible values, in which case it is
9806 * the first entry in that final set, and so this call is an attempt to
9807 * append out-of-order */
9809 UV final_element = len - 1;
9810 array = invlist_array(invlist);
9811 if ( array[final_element] > start
9812 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9814 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9815 array[final_element], start,
9816 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9819 /* Here, it is a legal append. If the new range begins 1 above the end
9820 * of the range below it, it is extending the range below it, so the
9821 * new first value not in the set is one greater than the newly
9822 * extended range. */
9823 offset = *get_invlist_offset_addr(invlist);
9824 if (array[final_element] == start) {
9825 if (end != UV_MAX) {
9826 array[final_element] = end + 1;
9829 /* But if the end is the maximum representable on the machine,
9830 * assume that infinity was actually what was meant. Just let
9831 * the range that this would extend to have no end */
9832 invlist_set_len(invlist, len - 1, offset);
9838 /* Here the new range doesn't extend any existing set. Add it */
9840 len += 2; /* Includes an element each for the start and end of range */
9842 /* If wll overflow the existing space, extend, which may cause the array to
9845 invlist_extend(invlist, len);
9847 /* Have to set len here to avoid assert failure in invlist_array() */
9848 invlist_set_len(invlist, len, offset);
9850 array = invlist_array(invlist);
9853 invlist_set_len(invlist, len, offset);
9856 /* The next item on the list starts the range, the one after that is
9857 * one past the new range. */
9858 array[len - 2] = start;
9859 if (end != UV_MAX) {
9860 array[len - 1] = end + 1;
9863 /* But if the end is the maximum representable on the machine, just let
9864 * the range have no end */
9865 invlist_set_len(invlist, len - 1, offset);
9870 Perl__invlist_search(SV* const invlist, const UV cp)
9872 /* Searches the inversion list for the entry that contains the input code
9873 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9874 * return value is the index into the list's array of the range that
9875 * contains <cp>, that is, 'i' such that
9876 * array[i] <= cp < array[i+1]
9881 IV high = _invlist_len(invlist);
9882 const IV highest_element = high - 1;
9885 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9887 /* If list is empty, return failure. */
9888 if (UNLIKELY(high == 0)) {
9892 /* (We can't get the array unless we know the list is non-empty) */
9893 array = invlist_array(invlist);
9895 mid = invlist_previous_index(invlist);
9897 if (UNLIKELY(mid > highest_element)) {
9898 mid = highest_element;
9901 /* <mid> contains the cache of the result of the previous call to this
9902 * function (0 the first time). See if this call is for the same result,
9903 * or if it is for mid-1. This is under the theory that calls to this
9904 * function will often be for related code points that are near each other.
9905 * And benchmarks show that caching gives better results. We also test
9906 * here if the code point is within the bounds of the list. These tests
9907 * replace others that would have had to be made anyway to make sure that
9908 * the array bounds were not exceeded, and these give us extra information
9909 * at the same time */
9910 if (cp >= array[mid]) {
9911 if (cp >= array[highest_element]) {
9912 return highest_element;
9915 /* Here, array[mid] <= cp < array[highest_element]. This means that
9916 * the final element is not the answer, so can exclude it; it also
9917 * means that <mid> is not the final element, so can refer to 'mid + 1'
9919 if (cp < array[mid + 1]) {
9925 else { /* cp < aray[mid] */
9926 if (cp < array[0]) { /* Fail if outside the array */
9930 if (cp >= array[mid - 1]) {
9935 /* Binary search. What we are looking for is <i> such that
9936 * array[i] <= cp < array[i+1]
9937 * The loop below converges on the i+1. Note that there may not be an
9938 * (i+1)th element in the array, and things work nonetheless */
9939 while (low < high) {
9940 mid = (low + high) / 2;
9941 assert(mid <= highest_element);
9942 if (array[mid] <= cp) { /* cp >= array[mid] */
9945 /* We could do this extra test to exit the loop early.
9946 if (cp < array[low]) {
9951 else { /* cp < array[mid] */
9958 invlist_set_previous_index(invlist, high);
9963 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9964 const bool complement_b, SV** output)
9966 /* Take the union of two inversion lists and point '*output' to it. On
9967 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9968 * even 'a' or 'b'). If to an inversion list, the contents of the original
9969 * list will be replaced by the union. The first list, 'a', may be
9970 * NULL, in which case a copy of the second list is placed in '*output'.
9971 * If 'complement_b' is TRUE, the union is taken of the complement
9972 * (inversion) of 'b' instead of b itself.
9974 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9975 * Richard Gillam, published by Addison-Wesley, and explained at some
9976 * length there. The preface says to incorporate its examples into your
9977 * code at your own risk.
9979 * The algorithm is like a merge sort. */
9981 const UV* array_a; /* a's array */
9983 UV len_a; /* length of a's array */
9986 SV* u; /* the resulting union */
9990 UV i_a = 0; /* current index into a's array */
9994 /* running count, as explained in the algorithm source book; items are
9995 * stopped accumulating and are output when the count changes to/from 0.
9996 * The count is incremented when we start a range that's in an input's set,
9997 * and decremented when we start a range that's not in a set. So this
9998 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9999 * and hence nothing goes into the union; 1, just one of the inputs is in
10000 * its set (and its current range gets added to the union); and 2 when both
10001 * inputs are in their sets. */
10004 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
10006 assert(*output == NULL || is_invlist(*output));
10008 len_b = _invlist_len(b);
10011 /* Here, 'b' is empty, hence it's complement is all possible code
10012 * points. So if the union includes the complement of 'b', it includes
10013 * everything, and we need not even look at 'a'. It's easiest to
10014 * create a new inversion list that matches everything. */
10015 if (complement_b) {
10016 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
10018 if (*output == NULL) { /* If the output didn't exist, just point it
10020 *output = everything;
10022 else { /* Otherwise, replace its contents with the new list */
10023 invlist_replace_list_destroys_src(*output, everything);
10024 SvREFCNT_dec_NN(everything);
10030 /* Here, we don't want the complement of 'b', and since 'b' is empty,
10031 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
10032 * output will be empty */
10034 if (a == NULL || _invlist_len(a) == 0) {
10035 if (*output == NULL) {
10036 *output = _new_invlist(0);
10039 invlist_clear(*output);
10044 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
10045 * union. We can just return a copy of 'a' if '*output' doesn't point
10046 * to an existing list */
10047 if (*output == NULL) {
10048 *output = invlist_clone(a, NULL);
10052 /* If the output is to overwrite 'a', we have a no-op, as it's
10053 * already in 'a' */
10054 if (*output == a) {
10058 /* Here, '*output' is to be overwritten by 'a' */
10059 u = invlist_clone(a, NULL);
10060 invlist_replace_list_destroys_src(*output, u);
10061 SvREFCNT_dec_NN(u);
10066 /* Here 'b' is not empty. See about 'a' */
10068 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
10070 /* Here, 'a' is empty (and b is not). That means the union will come
10071 * entirely from 'b'. If '*output' is NULL, we can directly return a
10072 * clone of 'b'. Otherwise, we replace the contents of '*output' with
10075 SV ** dest = (*output == NULL) ? output : &u;
10076 *dest = invlist_clone(b, NULL);
10077 if (complement_b) {
10078 _invlist_invert(*dest);
10082 invlist_replace_list_destroys_src(*output, u);
10083 SvREFCNT_dec_NN(u);
10089 /* Here both lists exist and are non-empty */
10090 array_a = invlist_array(a);
10091 array_b = invlist_array(b);
10093 /* If are to take the union of 'a' with the complement of b, set it
10094 * up so are looking at b's complement. */
10095 if (complement_b) {
10097 /* To complement, we invert: if the first element is 0, remove it. To
10098 * do this, we just pretend the array starts one later */
10099 if (array_b[0] == 0) {
10105 /* But if the first element is not zero, we pretend the list starts
10106 * at the 0 that is always stored immediately before the array. */
10112 /* Size the union for the worst case: that the sets are completely
10114 u = _new_invlist(len_a + len_b);
10116 /* Will contain U+0000 if either component does */
10117 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
10118 || (len_b > 0 && array_b[0] == 0));
10120 /* Go through each input list item by item, stopping when have exhausted
10122 while (i_a < len_a && i_b < len_b) {
10123 UV cp; /* The element to potentially add to the union's array */
10124 bool cp_in_set; /* is it in the input list's set or not */
10126 /* We need to take one or the other of the two inputs for the union.
10127 * Since we are merging two sorted lists, we take the smaller of the
10128 * next items. In case of a tie, we take first the one that is in its
10129 * set. If we first took the one not in its set, it would decrement
10130 * the count, possibly to 0 which would cause it to be output as ending
10131 * the range, and the next time through we would take the same number,
10132 * and output it again as beginning the next range. By doing it the
10133 * opposite way, there is no possibility that the count will be
10134 * momentarily decremented to 0, and thus the two adjoining ranges will
10135 * be seamlessly merged. (In a tie and both are in the set or both not
10136 * in the set, it doesn't matter which we take first.) */
10137 if ( array_a[i_a] < array_b[i_b]
10138 || ( array_a[i_a] == array_b[i_b]
10139 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
10141 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
10142 cp = array_a[i_a++];
10145 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10146 cp = array_b[i_b++];
10149 /* Here, have chosen which of the two inputs to look at. Only output
10150 * if the running count changes to/from 0, which marks the
10151 * beginning/end of a range that's in the set */
10154 array_u[i_u++] = cp;
10161 array_u[i_u++] = cp;
10167 /* The loop above increments the index into exactly one of the input lists
10168 * each iteration, and ends when either index gets to its list end. That
10169 * means the other index is lower than its end, and so something is
10170 * remaining in that one. We decrement 'count', as explained below, if
10171 * that list is in its set. (i_a and i_b each currently index the element
10172 * beyond the one we care about.) */
10173 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10174 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10179 /* Above we decremented 'count' if the list that had unexamined elements in
10180 * it was in its set. This has made it so that 'count' being non-zero
10181 * means there isn't anything left to output; and 'count' equal to 0 means
10182 * that what is left to output is precisely that which is left in the
10183 * non-exhausted input list.
10185 * To see why, note first that the exhausted input obviously has nothing
10186 * left to add to the union. If it was in its set at its end, that means
10187 * the set extends from here to the platform's infinity, and hence so does
10188 * the union and the non-exhausted set is irrelevant. The exhausted set
10189 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
10190 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
10191 * 'count' remains at 1. This is consistent with the decremented 'count'
10192 * != 0 meaning there's nothing left to add to the union.
10194 * But if the exhausted input wasn't in its set, it contributed 0 to
10195 * 'count', and the rest of the union will be whatever the other input is.
10196 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
10197 * otherwise it gets decremented to 0. This is consistent with 'count'
10198 * == 0 meaning the remainder of the union is whatever is left in the
10199 * non-exhausted list. */
10204 IV copy_count = len_a - i_a;
10205 if (copy_count > 0) { /* The non-exhausted input is 'a' */
10206 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
10208 else { /* The non-exhausted input is b */
10209 copy_count = len_b - i_b;
10210 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
10212 len_u = i_u + copy_count;
10215 /* Set the result to the final length, which can change the pointer to
10216 * array_u, so re-find it. (Note that it is unlikely that this will
10217 * change, as we are shrinking the space, not enlarging it) */
10218 if (len_u != _invlist_len(u)) {
10219 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
10221 array_u = invlist_array(u);
10224 if (*output == NULL) { /* Simply return the new inversion list */
10228 /* Otherwise, overwrite the inversion list that was in '*output'. We
10229 * could instead free '*output', and then set it to 'u', but experience
10230 * has shown [perl #127392] that if the input is a mortal, we can get a
10231 * huge build-up of these during regex compilation before they get
10233 invlist_replace_list_destroys_src(*output, u);
10234 SvREFCNT_dec_NN(u);
10241 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
10242 const bool complement_b, SV** i)
10244 /* Take the intersection of two inversion lists and point '*i' to it. On
10245 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
10246 * even 'a' or 'b'). If to an inversion list, the contents of the original
10247 * list will be replaced by the intersection. The first list, 'a', may be
10248 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
10249 * TRUE, the result will be the intersection of 'a' and the complement (or
10250 * inversion) of 'b' instead of 'b' directly.
10252 * The basis for this comes from "Unicode Demystified" Chapter 13 by
10253 * Richard Gillam, published by Addison-Wesley, and explained at some
10254 * length there. The preface says to incorporate its examples into your
10255 * code at your own risk. In fact, it had bugs
10257 * The algorithm is like a merge sort, and is essentially the same as the
10261 const UV* array_a; /* a's array */
10263 UV len_a; /* length of a's array */
10266 SV* r; /* the resulting intersection */
10270 UV i_a = 0; /* current index into a's array */
10274 /* running count of how many of the two inputs are postitioned at ranges
10275 * that are in their sets. As explained in the algorithm source book,
10276 * items are stopped accumulating and are output when the count changes
10277 * to/from 2. The count is incremented when we start a range that's in an
10278 * input's set, and decremented when we start a range that's not in a set.
10279 * Only when it is 2 are we in the intersection. */
10282 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
10284 assert(*i == NULL || is_invlist(*i));
10286 /* Special case if either one is empty */
10287 len_a = (a == NULL) ? 0 : _invlist_len(a);
10288 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
10289 if (len_a != 0 && complement_b) {
10291 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
10292 * must be empty. Here, also we are using 'b's complement, which
10293 * hence must be every possible code point. Thus the intersection
10294 * is simply 'a'. */
10296 if (*i == a) { /* No-op */
10301 *i = invlist_clone(a, NULL);
10305 r = invlist_clone(a, NULL);
10306 invlist_replace_list_destroys_src(*i, r);
10307 SvREFCNT_dec_NN(r);
10311 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
10312 * intersection must be empty */
10314 *i = _new_invlist(0);
10322 /* Here both lists exist and are non-empty */
10323 array_a = invlist_array(a);
10324 array_b = invlist_array(b);
10326 /* If are to take the intersection of 'a' with the complement of b, set it
10327 * up so are looking at b's complement. */
10328 if (complement_b) {
10330 /* To complement, we invert: if the first element is 0, remove it. To
10331 * do this, we just pretend the array starts one later */
10332 if (array_b[0] == 0) {
10338 /* But if the first element is not zero, we pretend the list starts
10339 * at the 0 that is always stored immediately before the array. */
10345 /* Size the intersection for the worst case: that the intersection ends up
10346 * fragmenting everything to be completely disjoint */
10347 r= _new_invlist(len_a + len_b);
10349 /* Will contain U+0000 iff both components do */
10350 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
10351 && len_b > 0 && array_b[0] == 0);
10353 /* Go through each list item by item, stopping when have exhausted one of
10355 while (i_a < len_a && i_b < len_b) {
10356 UV cp; /* The element to potentially add to the intersection's
10358 bool cp_in_set; /* Is it in the input list's set or not */
10360 /* We need to take one or the other of the two inputs for the
10361 * intersection. Since we are merging two sorted lists, we take the
10362 * smaller of the next items. In case of a tie, we take first the one
10363 * that is not in its set (a difference from the union algorithm). If
10364 * we first took the one in its set, it would increment the count,
10365 * possibly to 2 which would cause it to be output as starting a range
10366 * in the intersection, and the next time through we would take that
10367 * same number, and output it again as ending the set. By doing the
10368 * opposite of this, there is no possibility that the count will be
10369 * momentarily incremented to 2. (In a tie and both are in the set or
10370 * both not in the set, it doesn't matter which we take first.) */
10371 if ( array_a[i_a] < array_b[i_b]
10372 || ( array_a[i_a] == array_b[i_b]
10373 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
10375 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
10376 cp = array_a[i_a++];
10379 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
10380 cp= array_b[i_b++];
10383 /* Here, have chosen which of the two inputs to look at. Only output
10384 * if the running count changes to/from 2, which marks the
10385 * beginning/end of a range that's in the intersection */
10389 array_r[i_r++] = cp;
10394 array_r[i_r++] = cp;
10401 /* The loop above increments the index into exactly one of the input lists
10402 * each iteration, and ends when either index gets to its list end. That
10403 * means the other index is lower than its end, and so something is
10404 * remaining in that one. We increment 'count', as explained below, if the
10405 * exhausted list was in its set. (i_a and i_b each currently index the
10406 * element beyond the one we care about.) */
10407 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10408 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10413 /* Above we incremented 'count' if the exhausted list was in its set. This
10414 * has made it so that 'count' being below 2 means there is nothing left to
10415 * output; otheriwse what's left to add to the intersection is precisely
10416 * that which is left in the non-exhausted input list.
10418 * To see why, note first that the exhausted input obviously has nothing
10419 * left to affect the intersection. If it was in its set at its end, that
10420 * means the set extends from here to the platform's infinity, and hence
10421 * anything in the non-exhausted's list will be in the intersection, and
10422 * anything not in it won't be. Hence, the rest of the intersection is
10423 * precisely what's in the non-exhausted list The exhausted set also
10424 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10425 * it means 'count' is now at least 2. This is consistent with the
10426 * incremented 'count' being >= 2 means to add the non-exhausted list to
10427 * the intersection.
10429 * But if the exhausted input wasn't in its set, it contributed 0 to
10430 * 'count', and the intersection can't include anything further; the
10431 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10432 * incremented. This is consistent with 'count' being < 2 meaning nothing
10433 * further to add to the intersection. */
10434 if (count < 2) { /* Nothing left to put in the intersection. */
10437 else { /* copy the non-exhausted list, unchanged. */
10438 IV copy_count = len_a - i_a;
10439 if (copy_count > 0) { /* a is the one with stuff left */
10440 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10442 else { /* b is the one with stuff left */
10443 copy_count = len_b - i_b;
10444 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10446 len_r = i_r + copy_count;
10449 /* Set the result to the final length, which can change the pointer to
10450 * array_r, so re-find it. (Note that it is unlikely that this will
10451 * change, as we are shrinking the space, not enlarging it) */
10452 if (len_r != _invlist_len(r)) {
10453 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10455 array_r = invlist_array(r);
10458 if (*i == NULL) { /* Simply return the calculated intersection */
10461 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10462 instead free '*i', and then set it to 'r', but experience has
10463 shown [perl #127392] that if the input is a mortal, we can get a
10464 huge build-up of these during regex compilation before they get
10467 invlist_replace_list_destroys_src(*i, r);
10472 SvREFCNT_dec_NN(r);
10479 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10481 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10482 * set. A pointer to the inversion list is returned. This may actually be
10483 * a new list, in which case the passed in one has been destroyed. The
10484 * passed-in inversion list can be NULL, in which case a new one is created
10485 * with just the one range in it. The new list is not necessarily
10486 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10487 * result of this function. The gain would not be large, and in many
10488 * cases, this is called multiple times on a single inversion list, so
10489 * anything freed may almost immediately be needed again.
10491 * This used to mostly call the 'union' routine, but that is much more
10492 * heavyweight than really needed for a single range addition */
10494 UV* array; /* The array implementing the inversion list */
10495 UV len; /* How many elements in 'array' */
10496 SSize_t i_s; /* index into the invlist array where 'start'
10498 SSize_t i_e = 0; /* And the index where 'end' should go */
10499 UV cur_highest; /* The highest code point in the inversion list
10500 upon entry to this function */
10502 /* This range becomes the whole inversion list if none already existed */
10503 if (invlist == NULL) {
10504 invlist = _new_invlist(2);
10505 _append_range_to_invlist(invlist, start, end);
10509 /* Likewise, if the inversion list is currently empty */
10510 len = _invlist_len(invlist);
10512 _append_range_to_invlist(invlist, start, end);
10516 /* Starting here, we have to know the internals of the list */
10517 array = invlist_array(invlist);
10519 /* If the new range ends higher than the current highest ... */
10520 cur_highest = invlist_highest(invlist);
10521 if (end > cur_highest) {
10523 /* If the whole range is higher, we can just append it */
10524 if (start > cur_highest) {
10525 _append_range_to_invlist(invlist, start, end);
10529 /* Otherwise, add the portion that is higher ... */
10530 _append_range_to_invlist(invlist, cur_highest + 1, end);
10532 /* ... and continue on below to handle the rest. As a result of the
10533 * above append, we know that the index of the end of the range is the
10534 * final even numbered one of the array. Recall that the final element
10535 * always starts a range that extends to infinity. If that range is in
10536 * the set (meaning the set goes from here to infinity), it will be an
10537 * even index, but if it isn't in the set, it's odd, and the final
10538 * range in the set is one less, which is even. */
10539 if (end == UV_MAX) {
10547 /* We have dealt with appending, now see about prepending. If the new
10548 * range starts lower than the current lowest ... */
10549 if (start < array[0]) {
10551 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10552 * Let the union code handle it, rather than having to know the
10553 * trickiness in two code places. */
10554 if (UNLIKELY(start == 0)) {
10557 range_invlist = _new_invlist(2);
10558 _append_range_to_invlist(range_invlist, start, end);
10560 _invlist_union(invlist, range_invlist, &invlist);
10562 SvREFCNT_dec_NN(range_invlist);
10567 /* If the whole new range comes before the first entry, and doesn't
10568 * extend it, we have to insert it as an additional range */
10569 if (end < array[0] - 1) {
10571 goto splice_in_new_range;
10574 /* Here the new range adjoins the existing first range, extending it
10578 /* And continue on below to handle the rest. We know that the index of
10579 * the beginning of the range is the first one of the array */
10582 else { /* Not prepending any part of the new range to the existing list.
10583 * Find where in the list it should go. This finds i_s, such that:
10584 * invlist[i_s] <= start < array[i_s+1]
10586 i_s = _invlist_search(invlist, start);
10589 /* At this point, any extending before the beginning of the inversion list
10590 * and/or after the end has been done. This has made it so that, in the
10591 * code below, each endpoint of the new range is either in a range that is
10592 * in the set, or is in a gap between two ranges that are. This means we
10593 * don't have to worry about exceeding the array bounds.
10595 * Find where in the list the new range ends (but we can skip this if we
10596 * have already determined what it is, or if it will be the same as i_s,
10597 * which we already have computed) */
10599 i_e = (start == end)
10601 : _invlist_search(invlist, end);
10604 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10605 * is a range that goes to infinity there is no element at invlist[i_e+1],
10606 * so only the first relation holds. */
10608 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10610 /* Here, the ranges on either side of the beginning of the new range
10611 * are in the set, and this range starts in the gap between them.
10613 * The new range extends the range above it downwards if the new range
10614 * ends at or above that range's start */
10615 const bool extends_the_range_above = ( end == UV_MAX
10616 || end + 1 >= array[i_s+1]);
10618 /* The new range extends the range below it upwards if it begins just
10619 * after where that range ends */
10620 if (start == array[i_s]) {
10622 /* If the new range fills the entire gap between the other ranges,
10623 * they will get merged together. Other ranges may also get
10624 * merged, depending on how many of them the new range spans. In
10625 * the general case, we do the merge later, just once, after we
10626 * figure out how many to merge. But in the case where the new
10627 * range exactly spans just this one gap (possibly extending into
10628 * the one above), we do the merge here, and an early exit. This
10629 * is done here to avoid having to special case later. */
10630 if (i_e - i_s <= 1) {
10632 /* If i_e - i_s == 1, it means that the new range terminates
10633 * within the range above, and hence 'extends_the_range_above'
10634 * must be true. (If the range above it extends to infinity,
10635 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10636 * will be 0, so no harm done.) */
10637 if (extends_the_range_above) {
10638 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10639 invlist_set_len(invlist,
10641 *(get_invlist_offset_addr(invlist)));
10645 /* Here, i_e must == i_s. We keep them in sync, as they apply
10646 * to the same range, and below we are about to decrement i_s
10651 /* Here, the new range is adjacent to the one below. (It may also
10652 * span beyond the range above, but that will get resolved later.)
10653 * Extend the range below to include this one. */
10654 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10656 start = array[i_s];
10658 else if (extends_the_range_above) {
10660 /* Here the new range only extends the range above it, but not the
10661 * one below. It merges with the one above. Again, we keep i_e
10662 * and i_s in sync if they point to the same range */
10667 array[i_s] = start;
10671 /* Here, we've dealt with the new range start extending any adjoining
10674 * If the new range extends to infinity, it is now the final one,
10675 * regardless of what was there before */
10676 if (UNLIKELY(end == UV_MAX)) {
10677 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10681 /* If i_e started as == i_s, it has also been dealt with,
10682 * and been updated to the new i_s, which will fail the following if */
10683 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10685 /* Here, the ranges on either side of the end of the new range are in
10686 * the set, and this range ends in the gap between them.
10688 * If this range is adjacent to (hence extends) the range above it, it
10689 * becomes part of that range; likewise if it extends the range below,
10690 * it becomes part of that range */
10691 if (end + 1 == array[i_e+1]) {
10693 array[i_e] = start;
10695 else if (start <= array[i_e]) {
10696 array[i_e] = end + 1;
10703 /* If the range fits entirely in an existing range (as possibly already
10704 * extended above), it doesn't add anything new */
10705 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10709 /* Here, no part of the range is in the list. Must add it. It will
10710 * occupy 2 more slots */
10711 splice_in_new_range:
10713 invlist_extend(invlist, len + 2);
10714 array = invlist_array(invlist);
10715 /* Move the rest of the array down two slots. Don't include any
10717 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10719 /* Do the actual splice */
10720 array[i_e+1] = start;
10721 array[i_e+2] = end + 1;
10722 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10726 /* Here the new range crossed the boundaries of a pre-existing range. The
10727 * code above has adjusted things so that both ends are in ranges that are
10728 * in the set. This means everything in between must also be in the set.
10729 * Just squash things together */
10730 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10731 invlist_set_len(invlist,
10733 *(get_invlist_offset_addr(invlist)));
10739 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10740 UV** other_elements_ptr)
10742 /* Create and return an inversion list whose contents are to be populated
10743 * by the caller. The caller gives the number of elements (in 'size') and
10744 * the very first element ('element0'). This function will set
10745 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10746 * are to be placed.
10748 * Obviously there is some trust involved that the caller will properly
10749 * fill in the other elements of the array.
10751 * (The first element needs to be passed in, as the underlying code does
10752 * things differently depending on whether it is zero or non-zero) */
10754 SV* invlist = _new_invlist(size);
10757 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10759 invlist = add_cp_to_invlist(invlist, element0);
10760 offset = *get_invlist_offset_addr(invlist);
10762 invlist_set_len(invlist, size, offset);
10763 *other_elements_ptr = invlist_array(invlist) + 1;
10769 #ifndef PERL_IN_XSUB_RE
10771 Perl__invlist_invert(pTHX_ SV* const invlist)
10773 /* Complement the input inversion list. This adds a 0 if the list didn't
10774 * have a zero; removes it otherwise. As described above, the data
10775 * structure is set up so that this is very efficient */
10777 PERL_ARGS_ASSERT__INVLIST_INVERT;
10779 assert(! invlist_is_iterating(invlist));
10781 /* The inverse of matching nothing is matching everything */
10782 if (_invlist_len(invlist) == 0) {
10783 _append_range_to_invlist(invlist, 0, UV_MAX);
10787 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10791 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10793 /* Return a new inversion list that is a copy of the input one, which is
10794 * unchanged. The new list will not be mortal even if the old one was. */
10796 const STRLEN nominal_length = _invlist_len(invlist);
10797 const STRLEN physical_length = SvCUR(invlist);
10798 const bool offset = *(get_invlist_offset_addr(invlist));
10800 PERL_ARGS_ASSERT_INVLIST_CLONE;
10802 if (new_invlist == NULL) {
10803 new_invlist = _new_invlist(nominal_length);
10806 sv_upgrade(new_invlist, SVt_INVLIST);
10807 initialize_invlist_guts(new_invlist, nominal_length);
10810 *(get_invlist_offset_addr(new_invlist)) = offset;
10811 invlist_set_len(new_invlist, nominal_length, offset);
10812 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10814 return new_invlist;
10819 PERL_STATIC_INLINE UV
10820 S_invlist_lowest(SV* const invlist)
10822 /* Returns the lowest code point that matches an inversion list. This API
10823 * has an ambiguity, as it returns 0 under either the lowest is actually
10824 * 0, or if the list is empty. If this distinction matters to you, check
10825 * for emptiness before calling this function */
10827 UV len = _invlist_len(invlist);
10830 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10836 array = invlist_array(invlist);
10842 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10844 /* Get the contents of an inversion list into a string SV so that they can
10845 * be printed out. If 'traditional_style' is TRUE, it uses the format
10846 * traditionally done for debug tracing; otherwise it uses a format
10847 * suitable for just copying to the output, with blanks between ranges and
10848 * a dash between range components */
10852 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10853 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10855 if (traditional_style) {
10856 output = newSVpvs("\n");
10859 output = newSVpvs("");
10862 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10864 assert(! invlist_is_iterating(invlist));
10866 invlist_iterinit(invlist);
10867 while (invlist_iternext(invlist, &start, &end)) {
10868 if (end == UV_MAX) {
10869 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10870 start, intra_range_delimiter,
10871 inter_range_delimiter);
10873 else if (end != start) {
10874 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10876 intra_range_delimiter,
10877 end, inter_range_delimiter);
10880 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10881 start, inter_range_delimiter);
10885 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10886 SvCUR_set(output, SvCUR(output) - 1);
10892 #ifndef PERL_IN_XSUB_RE
10894 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10895 const char * const indent, SV* const invlist)
10897 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10898 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10899 * the string 'indent'. The output looks like this:
10900 [0] 0x000A .. 0x000D
10902 [4] 0x2028 .. 0x2029
10903 [6] 0x3104 .. INFTY
10904 * This means that the first range of code points matched by the list are
10905 * 0xA through 0xD; the second range contains only the single code point
10906 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10907 * are used to define each range (except if the final range extends to
10908 * infinity, only a single element is needed). The array index of the
10909 * first element for the corresponding range is given in brackets. */
10914 PERL_ARGS_ASSERT__INVLIST_DUMP;
10916 if (invlist_is_iterating(invlist)) {
10917 Perl_dump_indent(aTHX_ level, file,
10918 "%sCan't dump inversion list because is in middle of iterating\n",
10923 invlist_iterinit(invlist);
10924 while (invlist_iternext(invlist, &start, &end)) {
10925 if (end == UV_MAX) {
10926 Perl_dump_indent(aTHX_ level, file,
10927 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10928 indent, (UV)count, start);
10930 else if (end != start) {
10931 Perl_dump_indent(aTHX_ level, file,
10932 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10933 indent, (UV)count, start, end);
10936 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10937 indent, (UV)count, start);
10945 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10947 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10949 /* Return a boolean as to if the two passed in inversion lists are
10950 * identical. The final argument, if TRUE, says to take the complement of
10951 * the second inversion list before doing the comparison */
10953 const UV len_a = _invlist_len(a);
10954 UV len_b = _invlist_len(b);
10956 const UV* array_a = NULL;
10957 const UV* array_b = NULL;
10959 PERL_ARGS_ASSERT__INVLISTEQ;
10961 /* This code avoids accessing the arrays unless it knows the length is
10966 return ! complement_b;
10970 array_a = invlist_array(a);
10974 array_b = invlist_array(b);
10977 /* If are to compare 'a' with the complement of b, set it
10978 * up so are looking at b's complement. */
10979 if (complement_b) {
10981 /* The complement of nothing is everything, so <a> would have to have
10982 * just one element, starting at zero (ending at infinity) */
10984 return (len_a == 1 && array_a[0] == 0);
10986 if (array_b[0] == 0) {
10988 /* Otherwise, to complement, we invert. Here, the first element is
10989 * 0, just remove it. To do this, we just pretend the array starts
10997 /* But if the first element is not zero, we pretend the list starts
10998 * at the 0 that is always stored immediately before the array. */
11004 return len_a == len_b
11005 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
11011 * As best we can, determine the characters that can match the start of
11012 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
11013 * can be false positive matches
11015 * Returns the invlist as a new SV*; it is the caller's responsibility to
11016 * call SvREFCNT_dec() when done with it.
11019 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
11021 const U8 * s = (U8*)STRING(node);
11022 SSize_t bytelen = STR_LEN(node);
11024 /* Start out big enough for 2 separate code points */
11025 SV* invlist = _new_invlist(4);
11027 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
11032 /* We punt and assume can match anything if the node begins
11033 * with a multi-character fold. Things are complicated. For
11034 * example, /ffi/i could match any of:
11035 * "\N{LATIN SMALL LIGATURE FFI}"
11036 * "\N{LATIN SMALL LIGATURE FF}I"
11037 * "F\N{LATIN SMALL LIGATURE FI}"
11038 * plus several other things; and making sure we have all the
11039 * possibilities is hard. */
11040 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
11041 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
11044 /* Any Latin1 range character can potentially match any
11045 * other depending on the locale, and in Turkic locales, 'I' and
11046 * 'i' can match U+130 and U+131 */
11047 if (OP(node) == EXACTFL) {
11048 _invlist_union(invlist, PL_Latin1, &invlist);
11049 if (isALPHA_FOLD_EQ(uc, 'I')) {
11050 invlist = add_cp_to_invlist(invlist,
11051 LATIN_SMALL_LETTER_DOTLESS_I);
11052 invlist = add_cp_to_invlist(invlist,
11053 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
11057 /* But otherwise, it matches at least itself. We can
11058 * quickly tell if it has a distinct fold, and if so,
11059 * it matches that as well */
11060 invlist = add_cp_to_invlist(invlist, uc);
11061 if (IS_IN_SOME_FOLD_L1(uc))
11062 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
11065 /* Some characters match above-Latin1 ones under /i. This
11066 * is true of EXACTFL ones when the locale is UTF-8 */
11067 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
11068 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
11069 EXACTFAA_NO_TRIE)))
11071 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
11075 else { /* Pattern is UTF-8 */
11076 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
11077 const U8* e = s + bytelen;
11080 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
11082 /* The only code points that aren't folded in a UTF EXACTFish
11083 * node are the problematic ones in EXACTFL nodes */
11084 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
11085 /* We need to check for the possibility that this EXACTFL
11086 * node begins with a multi-char fold. Therefore we fold
11087 * the first few characters of it so that we can make that
11093 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
11095 *(d++) = (U8) toFOLD(*s);
11096 if (fc < 0) { /* Save the first fold */
11103 UV fold = toFOLD_utf8_safe(s, e, d, &len);
11104 if (fc < 0) { /* Save the first fold */
11112 /* And set up so the code below that looks in this folded
11113 * buffer instead of the node's string */
11118 /* When we reach here 's' points to the fold of the first
11119 * character(s) of the node; and 'e' points to far enough along
11120 * the folded string to be just past any possible multi-char
11123 * Like the non-UTF case above, we punt if the node begins with a
11124 * multi-char fold */
11126 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
11127 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
11129 else { /* Single char fold */
11132 const U32 * remaining_folds;
11133 Size_t folds_count;
11135 /* It matches itself */
11136 invlist = add_cp_to_invlist(invlist, fc);
11138 /* ... plus all the things that fold to it, which are found in
11139 * PL_utf8_foldclosures */
11140 folds_count = _inverse_folds(fc, &first_fold,
11142 for (k = 0; k < folds_count; k++) {
11143 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
11145 /* /aa doesn't allow folds between ASCII and non- */
11146 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
11147 && isASCII(c) != isASCII(fc))
11152 invlist = add_cp_to_invlist(invlist, c);
11155 if (OP(node) == EXACTFL) {
11157 /* If either [iI] are present in an EXACTFL node the above code
11158 * should have added its normal case pair, but under a Turkish
11159 * locale they could match instead the case pairs from it. Add
11160 * those as potential matches as well */
11161 if (isALPHA_FOLD_EQ(fc, 'I')) {
11162 invlist = add_cp_to_invlist(invlist,
11163 LATIN_SMALL_LETTER_DOTLESS_I);
11164 invlist = add_cp_to_invlist(invlist,
11165 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
11167 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
11168 invlist = add_cp_to_invlist(invlist, 'I');
11170 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
11171 invlist = add_cp_to_invlist(invlist, 'i');
11180 #undef HEADER_LENGTH
11181 #undef TO_INTERNAL_SIZE
11182 #undef FROM_INTERNAL_SIZE
11183 #undef INVLIST_VERSION_ID
11185 /* End of inversion list object */
11188 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
11190 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
11191 * constructs, and updates RExC_flags with them. On input, RExC_parse
11192 * should point to the first flag; it is updated on output to point to the
11193 * final ')' or ':'. There needs to be at least one flag, or this will
11196 /* for (?g), (?gc), and (?o) warnings; warning
11197 about (?c) will warn about (?g) -- japhy */
11199 #define WASTED_O 0x01
11200 #define WASTED_G 0x02
11201 #define WASTED_C 0x04
11202 #define WASTED_GC (WASTED_G|WASTED_C)
11203 I32 wastedflags = 0x00;
11204 U32 posflags = 0, negflags = 0;
11205 U32 *flagsp = &posflags;
11206 char has_charset_modifier = '\0';
11208 bool has_use_defaults = FALSE;
11209 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
11210 int x_mod_count = 0;
11212 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
11214 /* '^' as an initial flag sets certain defaults */
11215 if (UCHARAT(RExC_parse) == '^') {
11216 RExC_parse_inc_by(1);
11217 has_use_defaults = TRUE;
11218 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
11219 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
11220 ? REGEX_UNICODE_CHARSET
11221 : REGEX_DEPENDS_CHARSET;
11222 set_regex_charset(&RExC_flags, cs);
11225 cs = get_regex_charset(RExC_flags);
11226 if ( cs == REGEX_DEPENDS_CHARSET
11227 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
11229 cs = REGEX_UNICODE_CHARSET;
11233 while (RExC_parse < RExC_end) {
11234 /* && memCHRs("iogcmsx", *RExC_parse) */
11235 /* (?g), (?gc) and (?o) are useless here
11236 and must be globally applied -- japhy */
11237 if ((RExC_pm_flags & PMf_WILDCARD)) {
11238 if (flagsp == & negflags) {
11239 if (*RExC_parse == 'm') {
11240 RExC_parse_inc_by(1);
11241 /* diag_listed_as: Use of %s is not allowed in Unicode
11242 property wildcard subpatterns in regex; marked by <--
11244 vFAIL("Use of modifier '-m' is not allowed in Unicode"
11245 " property wildcard subpatterns");
11249 if (*RExC_parse == 's') {
11250 goto modifier_illegal_in_wildcard;
11255 switch (*RExC_parse) {
11257 /* Code for the imsxn flags */
11258 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
11260 case LOCALE_PAT_MOD:
11261 if (has_charset_modifier) {
11262 goto excess_modifier;
11264 else if (flagsp == &negflags) {
11267 cs = REGEX_LOCALE_CHARSET;
11268 has_charset_modifier = LOCALE_PAT_MOD;
11270 case UNICODE_PAT_MOD:
11271 if (has_charset_modifier) {
11272 goto excess_modifier;
11274 else if (flagsp == &negflags) {
11277 cs = REGEX_UNICODE_CHARSET;
11278 has_charset_modifier = UNICODE_PAT_MOD;
11280 case ASCII_RESTRICT_PAT_MOD:
11281 if (flagsp == &negflags) {
11284 if (has_charset_modifier) {
11285 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
11286 goto excess_modifier;
11288 /* Doubled modifier implies more restricted */
11289 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
11292 cs = REGEX_ASCII_RESTRICTED_CHARSET;
11294 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
11296 case DEPENDS_PAT_MOD:
11297 if (has_use_defaults) {
11298 goto fail_modifiers;
11300 else if (flagsp == &negflags) {
11303 else if (has_charset_modifier) {
11304 goto excess_modifier;
11307 /* The dual charset means unicode semantics if the
11308 * pattern (or target, not known until runtime) are
11309 * utf8, or something in the pattern indicates unicode
11311 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
11312 ? REGEX_UNICODE_CHARSET
11313 : REGEX_DEPENDS_CHARSET;
11314 has_charset_modifier = DEPENDS_PAT_MOD;
11317 RExC_parse_inc_by(1);
11318 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
11319 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
11321 else if (has_charset_modifier == *(RExC_parse - 1)) {
11322 vFAIL2("Regexp modifier \"%c\" may not appear twice",
11323 *(RExC_parse - 1));
11326 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
11328 NOT_REACHED; /*NOTREACHED*/
11330 RExC_parse_inc_by(1);
11331 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
11332 *(RExC_parse - 1));
11333 NOT_REACHED; /*NOTREACHED*/
11334 case GLOBAL_PAT_MOD: /* 'g' */
11335 if (RExC_pm_flags & PMf_WILDCARD) {
11336 goto modifier_illegal_in_wildcard;
11339 case ONCE_PAT_MOD: /* 'o' */
11340 if (ckWARN(WARN_REGEXP)) {
11341 const I32 wflagbit = *RExC_parse == 'o'
11344 if (! (wastedflags & wflagbit) ) {
11345 wastedflags |= wflagbit;
11346 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
11349 "Useless (%s%c) - %suse /%c modifier",
11350 flagsp == &negflags ? "?-" : "?",
11352 flagsp == &negflags ? "don't " : "",
11359 case CONTINUE_PAT_MOD: /* 'c' */
11360 if (RExC_pm_flags & PMf_WILDCARD) {
11361 goto modifier_illegal_in_wildcard;
11363 if (ckWARN(WARN_REGEXP)) {
11364 if (! (wastedflags & WASTED_C) ) {
11365 wastedflags |= WASTED_GC;
11366 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
11369 "Useless (%sc) - %suse /gc modifier",
11370 flagsp == &negflags ? "?-" : "?",
11371 flagsp == &negflags ? "don't " : ""
11376 case KEEPCOPY_PAT_MOD: /* 'p' */
11377 if (RExC_pm_flags & PMf_WILDCARD) {
11378 goto modifier_illegal_in_wildcard;
11380 if (flagsp == &negflags) {
11381 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
11383 *flagsp |= RXf_PMf_KEEPCOPY;
11387 /* A flag is a default iff it is following a minus, so
11388 * if there is a minus, it means will be trying to
11389 * re-specify a default which is an error */
11390 if (has_use_defaults || flagsp == &negflags) {
11391 goto fail_modifiers;
11393 flagsp = &negflags;
11394 wastedflags = 0; /* reset so (?g-c) warns twice */
11400 if ( (RExC_pm_flags & PMf_WILDCARD)
11401 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11403 RExC_parse_inc_by(1);
11404 /* diag_listed_as: Use of %s is not allowed in Unicode
11405 property wildcard subpatterns in regex; marked by <--
11407 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11408 " property wildcard subpatterns",
11409 has_charset_modifier);
11412 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11413 negflags |= RXf_PMf_EXTENDED_MORE;
11415 RExC_flags |= posflags;
11417 if (negflags & RXf_PMf_EXTENDED) {
11418 negflags |= RXf_PMf_EXTENDED_MORE;
11420 RExC_flags &= ~negflags;
11421 set_regex_charset(&RExC_flags, cs);
11426 RExC_parse_inc_if_char();
11427 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11428 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11429 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11430 NOT_REACHED; /*NOTREACHED*/
11436 vFAIL("Sequence (?... not terminated");
11438 modifier_illegal_in_wildcard:
11439 RExC_parse_inc_by(1);
11440 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11441 subpatterns in regex; marked by <-- HERE in m/%s/ */
11442 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11443 " subpatterns", *(RExC_parse - 1));
11447 - reg - regular expression, i.e. main body or parenthesized thing
11449 * Caller must absorb opening parenthesis.
11451 * Combining parenthesis handling with the base level of regular expression
11452 * is a trifle forced, but the need to tie the tails of the branches to what
11453 * follows makes it hard to avoid.
11455 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11457 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11459 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11462 STATIC regnode_offset
11463 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11465 char * backref_parse_start,
11469 regnode_offset ret;
11470 char* name_start = RExC_parse;
11472 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11473 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11475 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11477 if (RExC_parse != name_start && ch == '}') {
11478 while (isBLANK(*RExC_parse)) {
11479 RExC_parse_inc_by(1);
11482 if (RExC_parse == name_start || *RExC_parse != ch) {
11483 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11484 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
11488 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11489 RExC_rxi->data->data[num]=(void*)sv_dat;
11490 SvREFCNT_inc_simple_void_NN(sv_dat);
11493 ret = reganode(pRExC_state,
11496 : (ASCII_FOLD_RESTRICTED)
11498 : (AT_LEAST_UNI_SEMANTICS)
11504 *flagp |= HASWIDTH;
11506 nextchar(pRExC_state);
11510 /* reg_la_NOTHING()
11512 * Maybe parse a parenthezised lookaround construct that is equivalent to a
11513 * NOTHING regop when the construct is empty.
11515 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
11517 * Checks for unterminated constructs and throws a "not terminated" error
11518 * with the appropriate type if necessary
11520 * Assuming it does not throw an exception increments RExC_seen_zerolen.
11522 * If the construct is empty generates a NOTHING op and returns its
11523 * regnode_offset, which the caller would then return to its caller.
11525 * If the construct is not empty increments RExC_in_lookaround, and turns
11526 * on any flags provided in RExC_seen, and then returns 0 to signify
11527 * that parsing should continue.
11529 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
11530 * any use of it would have had to be broken onto multiple lines, hence
11531 * the abbreviation.
11533 STATIC regnode_offset
11534 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
11538 PERL_ARGS_ASSERT_REG_LA_NOTHING;
11540 /* false below so we do not force /x */
11541 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
11543 if (RExC_parse >= RExC_end)
11544 vFAIL2("Sequence (%s... not terminated", type);
11546 /* Always increment as NOTHING regops are zerolen */
11547 RExC_seen_zerolen++;
11549 if (*RExC_parse == ')') {
11550 regnode_offset ret= reg_node(pRExC_state, NOTHING);
11551 nextchar(pRExC_state);
11555 RExC_seen |= flags;
11556 RExC_in_lookaround++;
11557 return 0; /* keep parsing! */
11562 * Maybe parse a parenthezised lookaround construct that is equivalent to a
11563 * OPFAIL regop when the construct is empty.
11565 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
11567 * Checks for unterminated constructs and throws a "not terminated" error
11570 * If the construct is empty generates an OPFAIL op and returns its
11571 * regnode_offset which the caller should then return to its caller.
11573 * If the construct is not empty increments RExC_in_lookaround, and also
11574 * increments RExC_seen_zerolen, and turns on the flags provided in
11575 * RExC_seen, and then returns 0 to signify that parsing should continue.
11577 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
11578 * any use of it would have had to be broken onto multiple lines, hence
11579 * the abbreviation.
11582 STATIC regnode_offset
11583 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
11587 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
11589 /* FALSE so we don't force to /x below */;
11590 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
11592 if (RExC_parse >= RExC_end)
11593 vFAIL2("Sequence (%s... not terminated", type);
11595 if (*RExC_parse == ')') {
11596 regnode_offset ret= reganode(pRExC_state, OPFAIL, 0);
11597 nextchar(pRExC_state);
11598 return ret; /* return produced regop */
11601 /* only increment zerolen *after* we check if we produce an OPFAIL
11602 * as an OPFAIL does not match a zero length construct, as it
11603 * does not match ever. */
11604 RExC_seen_zerolen++;
11605 RExC_seen |= flags;
11606 RExC_in_lookaround++;
11607 return 0; /* keep parsing! */
11610 /* Below are the main parsing routines.
11612 * S_reg() parses a whole pattern or subpattern. It itself handles things
11613 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
11614 * alternation '|' in the '...' pattern.
11615 * S_regbranch() effectively implements the concatenation operator, handling
11616 * one alternative of '|', repeatedly calling S_regpiece on each
11617 * segment of the input.
11618 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
11619 * and then adds any quantifier for that chunk.
11620 * S_regatom() parses the next chunk of the input, returning when it
11621 * determines it has found a complete atomic chunk. The chunk may
11622 * be a nested subpattern, in which case S_reg is called
11625 * The functions generate regnodes as they go along, appending each to the
11626 * pattern data structure so far. They return the offset of the current final
11627 * node into that structure, or 0 on failure.
11629 * There are three parameters common to all of them:
11630 * pRExC_state is a structure with much information about the current
11631 * state of the parse. It's easy to add new elements to
11632 * convey new information, but beware that an error return may
11633 * require clearing the element.
11634 * flagp is a pointer to bit flags set in a lower level to pass up
11635 * to higher levels information, such as the cause of a
11636 * failure, or some characteristic about the generated node
11637 * depth is roughly the recursion depth, mostly unused except for
11638 * pretty printing debugging info.
11640 * There are ancillary functions that these may farm work out to, using the
11643 * The protocol for handling flags is that each function will, before
11644 * returning, add into *flagp the flags it needs to pass up. Each function has
11645 * a second flags variable, typically named 'flags', which it sets and clears
11646 * at will. Flag bits in it are used in that function, and it calls the next
11647 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
11648 * 'flags' will contain whatever it had before the call, plus whatever that
11649 * function passed up. If it wants to pass any of these up to its caller, it
11650 * has to add them to its *flagp. This means that it takes extra steps to keep
11651 * passing a flag upwards, and otherwise the flag bit is cleared for higher
11655 /* On success, returns the offset at which any next node should be placed into
11656 * the regex engine program being compiled.
11658 * Returns 0 otherwise, with *flagp set to indicate why:
11659 * TRYAGAIN at the end of (?) that only sets flags.
11660 * RESTART_PARSE if the parse needs to be restarted, or'd with
11661 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11662 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11664 STATIC regnode_offset
11665 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11666 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11667 * 2 is like 1, but indicates that nextchar() has been called to advance
11668 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11669 * this flag alerts us to the need to check for that */
11671 regnode_offset ret = 0; /* Will be the head of the group. */
11673 regnode_offset lastbr;
11674 regnode_offset ender = 0;
11677 U32 oregflags = RExC_flags;
11678 bool have_branch = 0;
11680 I32 freeze_paren = 0;
11681 I32 after_freeze = 0;
11682 I32 num; /* numeric backreferences */
11683 SV * max_open; /* Max number of unclosed parens */
11684 I32 was_in_lookaround = RExC_in_lookaround;
11686 /* The difference between the following variables can be seen with *
11687 * the broken pattern /(?:foo/ where segment_parse_start will point *
11688 * at the 'f', and reg_parse_start will point at the '(' */
11690 /* the following is used for unmatched '(' errors */
11691 char * const reg_parse_start = RExC_parse;
11693 /* the following is used to track where various segments of
11694 * the pattern that we parse out started. */
11695 char * segment_parse_start = RExC_parse;
11697 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11699 PERL_ARGS_ASSERT_REG;
11700 DEBUG_PARSE("reg ");
11702 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11704 if (!SvIOK(max_open)) {
11705 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11707 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11709 vFAIL("Too many nested open parens");
11712 *flagp = 0; /* Initialize. */
11714 /* Having this true makes it feasible to have a lot fewer tests for the
11715 * parse pointer being in scope. For example, we can write
11716 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
11718 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
11720 assert(*RExC_end == '\0');
11722 /* Make an OPEN node, if parenthesized. */
11725 /* Under /x, space and comments can be gobbled up between the '(' and
11726 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11727 * intervening space, as the sequence is a token, and a token should be
11729 bool has_intervening_patws = (paren == 2)
11730 && *(RExC_parse - 1) != '(';
11732 if (RExC_parse >= RExC_end) {
11733 vFAIL("Unmatched (");
11736 if (paren == 'r') { /* Atomic script run */
11740 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11741 char *start_verb = RExC_parse + 1;
11743 char *start_arg = NULL;
11744 unsigned char op = 0;
11745 int arg_required = 0;
11746 int internal_argval = -1; /* if > -1 no argument allowed */
11747 bool has_upper = FALSE;
11748 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
11750 if (has_intervening_patws) {
11751 RExC_parse_inc_by(1); /* past the '*' */
11753 /* For strict backwards compatibility, don't change the message
11754 * now that we also have lowercase operands */
11755 if (isUPPER(*RExC_parse)) {
11756 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11759 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11762 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11763 if ( *RExC_parse == ':' ) {
11764 start_arg = RExC_parse + 1;
11768 if (isUPPER(*RExC_parse)) {
11771 RExC_parse_inc_by(1);
11774 RExC_parse_inc_utf8();
11777 verb_len = RExC_parse - start_verb;
11779 if (RExC_parse >= RExC_end) {
11780 goto unterminated_verb_pattern;
11784 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11787 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11788 unterminated_verb_pattern:
11790 vFAIL("Unterminated verb pattern argument");
11793 vFAIL("Unterminated '(*...' argument");
11797 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11799 vFAIL("Unterminated verb pattern");
11802 vFAIL("Unterminated '(*...' construct");
11807 /* Here, we know that RExC_parse < RExC_end */
11809 switch ( *start_verb ) {
11810 case 'A': /* (*ACCEPT) */
11811 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11813 internal_argval = RExC_nestroot;
11816 case 'C': /* (*COMMIT) */
11817 if ( memEQs(start_verb, verb_len,"COMMIT") )
11820 case 'F': /* (*FAIL) */
11821 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11825 case ':': /* (*:NAME) */
11826 case 'M': /* (*MARK:NAME) */
11827 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11832 case 'P': /* (*PRUNE) */
11833 if ( memEQs(start_verb, verb_len,"PRUNE") )
11836 case 'S': /* (*SKIP) */
11837 if ( memEQs(start_verb, verb_len,"SKIP") )
11840 case 'T': /* (*THEN) */
11841 /* [19:06] <TimToady> :: is then */
11842 if ( memEQs(start_verb, verb_len,"THEN") ) {
11844 RExC_seen |= REG_CUTGROUP_SEEN;
11848 if ( memEQs(start_verb, verb_len, "asr")
11849 || memEQs(start_verb, verb_len, "atomic_script_run"))
11851 paren = 'r'; /* Mnemonic: recursed run */
11854 else if (memEQs(start_verb, verb_len, "atomic")) {
11855 paren = 't'; /* AtOMIC */
11856 goto alpha_assertions;
11860 if ( memEQs(start_verb, verb_len, "plb")
11861 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11864 goto lookbehind_alpha_assertions;
11866 else if ( memEQs(start_verb, verb_len, "pla")
11867 || memEQs(start_verb, verb_len, "positive_lookahead"))
11870 goto alpha_assertions;
11874 if ( memEQs(start_verb, verb_len, "nlb")
11875 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11878 goto lookbehind_alpha_assertions;
11880 else if ( memEQs(start_verb, verb_len, "nla")
11881 || memEQs(start_verb, verb_len, "negative_lookahead"))
11884 goto alpha_assertions;
11888 if ( memEQs(start_verb, verb_len, "sr")
11889 || memEQs(start_verb, verb_len, "script_run"))
11891 regnode_offset atomic;
11897 /* This indicates Unicode rules. */
11898 REQUIRE_UNI_RULES(flagp, 0);
11904 RExC_parse_set(start_arg);
11906 if (RExC_in_script_run) {
11908 /* Nested script runs are treated as no-ops, because
11909 * if the nested one fails, the outer one must as
11910 * well. It could fail sooner, and avoid (??{} with
11911 * side effects, but that is explicitly documented as
11912 * undefined behavior. */
11916 if (paren == 's') {
11921 /* But, the atomic part of a nested atomic script run
11922 * isn't a no-op, but can be treated just like a '(?>'
11928 if (paren == 's') {
11929 /* Here, we're starting a new regular script run */
11930 ret = reg_node(pRExC_state, SROPEN);
11931 RExC_in_script_run = 1;
11936 /* Here, we are starting an atomic script run. This is
11937 * handled by recursing to deal with the atomic portion
11938 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11940 ret = reg_node(pRExC_state, SROPEN);
11942 RExC_in_script_run = 1;
11944 atomic = reg(pRExC_state, 'r', &flags, depth);
11945 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11946 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11950 if (! REGTAIL(pRExC_state, ret, atomic)) {
11951 REQUIRE_BRANCHJ(flagp, 0);
11954 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11957 REQUIRE_BRANCHJ(flagp, 0);
11960 RExC_in_script_run = 0;
11966 lookbehind_alpha_assertions:
11967 seen_flag_set = REG_LOOKBEHIND_SEEN;
11972 if ( !start_arg ) {
11976 if ( RExC_parse == start_arg ) {
11977 if ( paren == 'A' || paren == 'B' ) {
11978 /* An empty negative lookaround assertion is failure.
11979 * See also: S_reg_la_OPFAIL() */
11981 /* Note: OPFAIL is *not* zerolen. */
11982 ret = reganode(pRExC_state, OPFAIL, 0);
11983 nextchar(pRExC_state);
11987 if ( paren == 'a' || paren == 'b' ) {
11988 /* An empty positive lookaround assertion is success.
11989 * See also: S_reg_la_NOTHING() */
11991 /* Note: NOTHING is zerolen, so increment here */
11992 RExC_seen_zerolen++;
11993 ret = reg_node(pRExC_state, NOTHING);
11994 nextchar(pRExC_state);
11999 RExC_seen_zerolen++;
12000 RExC_in_lookaround++;
12001 RExC_seen |= seen_flag_set;
12003 RExC_parse_set(start_arg);
12007 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
12008 UTF8fARG(UTF, verb_len, start_verb));
12009 NOT_REACHED; /*NOTREACHED*/
12011 } /* End of switch */
12013 RExC_parse_inc_safe();
12014 if (has_upper || verb_len == 0) {
12015 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
12016 UTF8fARG(UTF, verb_len, start_verb));
12019 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
12020 UTF8fARG(UTF, verb_len, start_verb));
12023 if ( RExC_parse == start_arg ) {
12026 if ( arg_required && !start_arg ) {
12027 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
12028 (int) verb_len, start_verb);
12030 if (internal_argval == -1) {
12031 ret = reganode(pRExC_state, op, 0);
12033 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
12035 RExC_seen |= REG_VERBARG_SEEN;
12037 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
12038 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
12039 STR_WITH_LEN("S"));
12040 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
12041 FLAGS(REGNODE_p(ret)) = 1;
12043 FLAGS(REGNODE_p(ret)) = 0;
12045 if ( internal_argval != -1 )
12046 ARG2L_SET(REGNODE_p(ret), internal_argval);
12047 nextchar(pRExC_state);
12050 else if (*RExC_parse == '?') { /* (?...) */
12051 bool is_logical = 0;
12052 const char * const seqstart = RExC_parse;
12053 const char * endptr;
12054 const char non_existent_group_msg[]
12055 = "Reference to nonexistent group";
12056 const char impossible_group[] = "Invalid reference to group";
12058 if (has_intervening_patws) {
12059 RExC_parse_inc_by(1);
12060 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
12063 RExC_parse_inc_by(1); /* past the '?' */
12064 paren = *RExC_parse; /* might be a trailing NUL, if not
12067 if (RExC_parse > RExC_end) {
12070 ret = 0; /* For look-ahead/behind. */
12073 case 'P': /* (?P...) variants for those used to PCRE/Python */
12074 paren = *RExC_parse;
12075 if ( paren == '<') { /* (?P<...>) named capture */
12076 RExC_parse_inc_by(1);
12077 if (RExC_parse >= RExC_end) {
12078 vFAIL("Sequence (?P<... not terminated");
12080 goto named_capture;
12082 else if (paren == '>') { /* (?P>name) named recursion */
12083 RExC_parse_inc_by(1);
12084 if (RExC_parse >= RExC_end) {
12085 vFAIL("Sequence (?P>... not terminated");
12087 goto named_recursion;
12089 else if (paren == '=') { /* (?P=...) named backref */
12090 RExC_parse_inc_by(1);
12091 return handle_named_backref(pRExC_state, flagp,
12092 segment_parse_start, ')');
12094 RExC_parse_inc_if_char();
12095 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
12096 vFAIL3("Sequence (%.*s...) not recognized",
12097 (int) (RExC_parse - seqstart), seqstart);
12098 NOT_REACHED; /*NOTREACHED*/
12099 case '<': /* (?<...) */
12100 /* If you want to support (?<*...), first reconcile with GH #17363 */
12101 if (*RExC_parse == '!') {
12102 paren = ','; /* negative lookbehind (?<! ... ) */
12103 RExC_parse_inc_by(1);
12104 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
12109 if (*RExC_parse == '=') {
12110 /* paren = '<' - negative lookahead (?<= ... ) */
12111 RExC_parse_inc_by(1);
12112 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
12123 case '\'': /* (?'...') */
12124 name_start = RExC_parse;
12125 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
12126 if ( RExC_parse == name_start
12127 || RExC_parse >= RExC_end
12128 || *RExC_parse != paren)
12130 vFAIL2("Sequence (?%c... not terminated",
12131 paren=='>' ? '<' : (char) paren);
12136 if (!svname) /* shouldn't happen */
12138 "panic: reg_scan_name returned NULL");
12139 if (!RExC_paren_names) {
12140 RExC_paren_names= newHV();
12141 sv_2mortal(MUTABLE_SV(RExC_paren_names));
12143 RExC_paren_name_list= newAV();
12144 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
12147 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
12149 sv_dat = HeVAL(he_str);
12151 /* croak baby croak */
12153 "panic: paren_name hash element allocation failed");
12154 } else if ( SvPOK(sv_dat) ) {
12155 /* (?|...) can mean we have dupes so scan to check
12156 its already been stored. Maybe a flag indicating
12157 we are inside such a construct would be useful,
12158 but the arrays are likely to be quite small, so
12159 for now we punt -- dmq */
12160 IV count = SvIV(sv_dat);
12161 I32 *pv = (I32*)SvPVX(sv_dat);
12163 for ( i = 0 ; i < count ; i++ ) {
12164 if ( pv[i] == RExC_npar ) {
12170 pv = (I32*)SvGROW(sv_dat,
12171 SvCUR(sv_dat) + sizeof(I32)+1);
12172 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
12173 pv[count] = RExC_npar;
12174 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
12177 (void)SvUPGRADE(sv_dat, SVt_PVNV);
12178 sv_setpvn(sv_dat, (char *)&(RExC_npar),
12181 SvIV_set(sv_dat, 1);
12184 /* Yes this does cause a memory leak in debugging Perls
12186 if (!av_store(RExC_paren_name_list,
12187 RExC_npar, SvREFCNT_inc_NN(svname)))
12188 SvREFCNT_dec_NN(svname);
12191 /*sv_dump(sv_dat);*/
12193 nextchar(pRExC_state);
12195 goto capturing_parens;
12197 NOT_REACHED; /*NOTREACHED*/
12198 case '=': /* (?=...) */
12199 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
12202 case '!': /* (?!...) */
12203 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
12206 case '|': /* (?|...) */
12207 /* branch reset, behave like a (?:...) except that
12208 buffers in alternations share the same numbers */
12210 after_freeze = freeze_paren = RExC_npar;
12212 /* XXX This construct currently requires an extra pass.
12213 * Investigation would be required to see if that could be
12215 REQUIRE_PARENS_PASS;
12217 case ':': /* (?:...) */
12218 case '>': /* (?>...) */
12220 case '$': /* (?$...) */
12221 case '@': /* (?@...) */
12222 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
12224 case '0' : /* (?0) */
12225 case 'R' : /* (?R) */
12226 if (RExC_parse == RExC_end || *RExC_parse != ')')
12227 FAIL("Sequence (?R) not terminated");
12229 RExC_seen |= REG_RECURSE_SEEN;
12231 /* XXX These constructs currently require an extra pass.
12232 * It probably could be changed */
12233 REQUIRE_PARENS_PASS;
12235 *flagp |= POSTPONED;
12236 goto gen_recurse_regop;
12238 /* named and numeric backreferences */
12239 case '&': /* (?&NAME) */
12240 segment_parse_start = RExC_parse - 1;
12243 SV *sv_dat = reg_scan_name(pRExC_state,
12244 REG_RSN_RETURN_DATA);
12245 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
12247 if (RExC_parse >= RExC_end || *RExC_parse != ')')
12248 vFAIL("Sequence (?&... not terminated");
12249 goto gen_recurse_regop;
12252 if (! inRANGE(RExC_parse[0], '1', '9')) {
12253 RExC_parse_inc_by(1);
12254 vFAIL("Illegal pattern");
12256 goto parse_recursion;
12258 case '-': /* (?-1) */
12259 if (! inRANGE(RExC_parse[0], '1', '9')) {
12260 RExC_parse--; /* rewind to let it be handled later */
12264 case '1': case '2': case '3': case '4': /* (?1) */
12265 case '5': case '6': case '7': case '8': case '9':
12266 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
12269 bool is_neg = FALSE;
12271 segment_parse_start = RExC_parse - 1;
12272 if (*RExC_parse == '-') {
12273 RExC_parse_inc_by(1);
12277 if (grok_atoUV(RExC_parse, &unum, &endptr)
12281 RExC_parse_set((char*)endptr);
12283 else { /* Overflow, or something like that. Position
12284 beyond all digits for the message */
12285 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
12286 RExC_parse_inc_by(1);
12288 vFAIL(impossible_group);
12291 /* -num is always representable on 1 and 2's complement
12296 if (*RExC_parse!=')')
12297 vFAIL("Expecting close bracket");
12300 if (paren == '-' || paren == '+') {
12302 /* Don't overflow */
12303 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
12304 RExC_parse_inc_by(1);
12305 vFAIL(impossible_group);
12309 Diagram of capture buffer numbering.
12310 Top line is the normal capture buffer numbers
12311 Bottom line is the negative indexing as from
12315 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
12316 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
12317 - 5 4 3 2 1 X Y x x
12319 Resolve to absolute group. Recall that RExC_npar is +1 of
12320 the actual parenthesis group number. For lookahead, we
12321 have to compensate for that. Using the above example, when
12322 we get to Y in the parse, num is 2 and RExC_npar is 6. We
12323 want 7 for +2, and 4 for -2.
12325 if ( paren == '+' ) {
12331 if (paren == '-' && num < 1) {
12332 RExC_parse_inc_by(1);
12333 vFAIL(non_existent_group_msg);
12337 if (num >= RExC_npar) {
12339 /* It might be a forward reference; we can't fail until we
12340 * know, by completing the parse to get all the groups, and
12341 * then reparsing */
12342 if (ALL_PARENS_COUNTED) {
12343 if (num >= RExC_total_parens) {
12344 RExC_parse_inc_by(1);
12345 vFAIL(non_existent_group_msg);
12349 REQUIRE_PARENS_PASS;
12353 /* We keep track how many GOSUB items we have produced.
12354 To start off the ARG2L() of the GOSUB holds its "id",
12355 which is used later in conjunction with RExC_recurse
12356 to calculate the offset we need to jump for the GOSUB,
12357 which it will store in the final representation.
12358 We have to defer the actual calculation until much later
12359 as the regop may move.
12361 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
12362 RExC_recurse_count++;
12363 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12364 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
12365 22, "| |", (int)(depth * 2 + 1), "",
12366 (UV)ARG(REGNODE_p(ret)),
12367 (IV)ARG2L(REGNODE_p(ret))));
12368 RExC_seen |= REG_RECURSE_SEEN;
12370 *flagp |= POSTPONED;
12371 assert(*RExC_parse == ')');
12372 nextchar(pRExC_state);
12377 case '?': /* (??...) */
12379 if (*RExC_parse != '{') {
12380 RExC_parse_inc_if_char();
12381 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
12383 "Sequence (%" UTF8f "...) not recognized",
12384 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
12385 NOT_REACHED; /*NOTREACHED*/
12387 *flagp |= POSTPONED;
12389 RExC_parse_inc_by(1);
12391 case '{': /* (?{...}) */
12394 struct reg_code_block *cb;
12397 RExC_seen_zerolen++;
12399 if ( !pRExC_state->code_blocks
12400 || pRExC_state->code_index
12401 >= pRExC_state->code_blocks->count
12402 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
12403 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
12406 if (RExC_pm_flags & PMf_USE_RE_EVAL)
12407 FAIL("panic: Sequence (?{...}): no code block found\n");
12408 FAIL("Eval-group not allowed at runtime, use re 'eval'");
12410 /* this is a pre-compiled code block (?{...}) */
12411 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
12412 RExC_parse_set(RExC_start + cb->end);
12414 if (cb->src_regex) {
12415 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
12416 RExC_rxi->data->data[n] =
12417 (void*)SvREFCNT_inc((SV*)cb->src_regex);
12418 RExC_rxi->data->data[n+1] = (void*)o;
12421 n = add_data(pRExC_state,
12422 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
12423 RExC_rxi->data->data[n] = (void*)o;
12425 pRExC_state->code_index++;
12426 nextchar(pRExC_state);
12429 regnode_offset eval;
12430 ret = reg_node(pRExC_state, LOGICAL);
12432 eval = reg2Lanode(pRExC_state, EVAL,
12435 /* for later propagation into (??{})
12437 RExC_flags & RXf_PMf_COMPILETIME
12439 FLAGS(REGNODE_p(ret)) = 2;
12440 if (! REGTAIL(pRExC_state, ret, eval)) {
12441 REQUIRE_BRANCHJ(flagp, 0);
12445 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
12448 case '(': /* (?(?{...})...) and (?(?=...)...) */
12451 const int DEFINE_len = sizeof("DEFINE") - 1;
12452 if ( RExC_parse < RExC_end - 1
12453 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
12454 && ( RExC_parse[1] == '='
12455 || RExC_parse[1] == '!'
12456 || RExC_parse[1] == '<'
12457 || RExC_parse[1] == '{'))
12458 || ( RExC_parse[0] == '*' /* (?(*...)) */
12459 && ( memBEGINs(RExC_parse + 1,
12460 (Size_t) (RExC_end - (RExC_parse + 1)),
12462 || memBEGINs(RExC_parse + 1,
12463 (Size_t) (RExC_end - (RExC_parse + 1)),
12465 || memBEGINs(RExC_parse + 1,
12466 (Size_t) (RExC_end - (RExC_parse + 1)),
12468 || memBEGINs(RExC_parse + 1,
12469 (Size_t) (RExC_end - (RExC_parse + 1)),
12471 || memBEGINs(RExC_parse + 1,
12472 (Size_t) (RExC_end - (RExC_parse + 1)),
12473 "positive_lookahead:")
12474 || memBEGINs(RExC_parse + 1,
12475 (Size_t) (RExC_end - (RExC_parse + 1)),
12476 "positive_lookbehind:")
12477 || memBEGINs(RExC_parse + 1,
12478 (Size_t) (RExC_end - (RExC_parse + 1)),
12479 "negative_lookahead:")
12480 || memBEGINs(RExC_parse + 1,
12481 (Size_t) (RExC_end - (RExC_parse + 1)),
12482 "negative_lookbehind:"))))
12483 ) { /* Lookahead or eval. */
12485 regnode_offset tail;
12487 ret = reg_node(pRExC_state, LOGICAL);
12488 FLAGS(REGNODE_p(ret)) = 1;
12490 tail = reg(pRExC_state, 1, &flag, depth+1);
12491 RETURN_FAIL_ON_RESTART(flag, flagp);
12492 if (! REGTAIL(pRExC_state, ret, tail)) {
12493 REQUIRE_BRANCHJ(flagp, 0);
12497 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
12498 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
12500 char ch = RExC_parse[0] == '<' ? '>' : '\'';
12501 char *name_start= RExC_parse;
12502 RExC_parse_inc_by(1);
12504 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
12505 if ( RExC_parse == name_start
12506 || RExC_parse >= RExC_end
12507 || *RExC_parse != ch)
12509 vFAIL2("Sequence (?(%c... not terminated",
12510 (ch == '>' ? '<' : ch));
12512 RExC_parse_inc_by(1);
12514 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12515 RExC_rxi->data->data[num]=(void*)sv_dat;
12516 SvREFCNT_inc_simple_void_NN(sv_dat);
12518 ret = reganode(pRExC_state, GROUPPN, num);
12519 goto insert_if_check_paren;
12521 else if (memBEGINs(RExC_parse,
12522 (STRLEN) (RExC_end - RExC_parse),
12525 ret = reganode(pRExC_state, DEFINEP, 0);
12526 RExC_parse_inc_by(DEFINE_len);
12528 goto insert_if_check_paren;
12530 else if (RExC_parse[0] == 'R') {
12531 RExC_parse_inc_by(1);
12532 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
12533 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
12534 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
12537 if (RExC_parse[0] == '0') {
12539 RExC_parse_inc_by(1);
12541 else if (inRANGE(RExC_parse[0], '1', '9')) {
12544 if (grok_atoUV(RExC_parse, &uv, &endptr)
12547 parno = (I32)uv + 1;
12548 RExC_parse_set((char*)endptr);
12550 /* else "Switch condition not recognized" below */
12551 } else if (RExC_parse[0] == '&') {
12553 RExC_parse_inc_by(1);
12554 sv_dat = reg_scan_name(pRExC_state,
12555 REG_RSN_RETURN_DATA);
12557 parno = 1 + *((I32 *)SvPVX(sv_dat));
12559 ret = reganode(pRExC_state, INSUBP, parno);
12560 goto insert_if_check_paren;
12562 else if (inRANGE(RExC_parse[0], '1', '9')) {
12567 if (grok_atoUV(RExC_parse, &uv, &endptr)
12571 RExC_parse_set((char*)endptr);
12574 vFAIL("panic: grok_atoUV returned FALSE");
12576 ret = reganode(pRExC_state, GROUPP, parno);
12578 insert_if_check_paren:
12579 if (UCHARAT(RExC_parse) != ')') {
12580 RExC_parse_inc_safe();
12581 vFAIL("Switch condition not recognized");
12583 nextchar(pRExC_state);
12585 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12588 REQUIRE_BRANCHJ(flagp, 0);
12590 br = regbranch(pRExC_state, &flags, 1, depth+1);
12592 RETURN_FAIL_ON_RESTART(flags,flagp);
12593 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12596 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12599 REQUIRE_BRANCHJ(flagp, 0);
12601 c = UCHARAT(RExC_parse);
12602 nextchar(pRExC_state);
12603 if (flags&HASWIDTH)
12604 *flagp |= HASWIDTH;
12607 vFAIL("(?(DEFINE)....) does not allow branches");
12609 /* Fake one for optimizer. */
12610 lastbr = reganode(pRExC_state, IFTHEN, 0);
12612 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12613 RETURN_FAIL_ON_RESTART(flags, flagp);
12614 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12617 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12618 REQUIRE_BRANCHJ(flagp, 0);
12620 if (flags&HASWIDTH)
12621 *flagp |= HASWIDTH;
12622 c = UCHARAT(RExC_parse);
12623 nextchar(pRExC_state);
12628 if (RExC_parse >= RExC_end)
12629 vFAIL("Switch (?(condition)... not terminated");
12631 vFAIL("Switch (?(condition)... contains too many branches");
12633 ender = reg_node(pRExC_state, TAIL);
12634 if (! REGTAIL(pRExC_state, br, ender)) {
12635 REQUIRE_BRANCHJ(flagp, 0);
12638 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12639 REQUIRE_BRANCHJ(flagp, 0);
12641 if (! REGTAIL(pRExC_state,
12643 REGNODE_AFTER(REGNODE_p(lastbr))),
12646 REQUIRE_BRANCHJ(flagp, 0);
12650 if (! REGTAIL(pRExC_state, ret, ender)) {
12651 REQUIRE_BRANCHJ(flagp, 0);
12653 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12654 RExC_size++; /* XXX WHY do we need this?!!
12655 For large programs it seems to be required
12656 but I can't figure out why. -- dmq*/
12660 RExC_parse_inc_safe();
12661 vFAIL("Unknown switch condition (?(...))");
12663 case '[': /* (?[ ... ]) */
12664 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
12665 case 0: /* A NUL */
12666 RExC_parse--; /* for vFAIL to print correctly */
12667 vFAIL("Sequence (? incomplete");
12671 if (RExC_strict) { /* [perl #132851] */
12672 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12675 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12677 default: /* e.g., (?i) */
12678 RExC_parse_set((char *) seqstart + 1);
12680 parse_lparen_question_flags(pRExC_state);
12681 if (UCHARAT(RExC_parse) != ':') {
12682 if (RExC_parse < RExC_end)
12683 nextchar(pRExC_state);
12688 nextchar(pRExC_state);
12693 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12697 if (! ALL_PARENS_COUNTED) {
12698 /* If we are in our first pass through (and maybe only pass),
12699 * we need to allocate memory for the capturing parentheses
12703 if (!RExC_parens_buf_size) {
12704 /* first guess at number of parens we might encounter */
12705 RExC_parens_buf_size = 10;
12707 /* setup RExC_open_parens, which holds the address of each
12708 * OPEN tag, and to make things simpler for the 0 index the
12709 * start of the program - this is used later for offsets */
12710 Newxz(RExC_open_parens, RExC_parens_buf_size,
12712 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12714 /* setup RExC_close_parens, which holds the address of each
12715 * CLOSE tag, and to make things simpler for the 0 index
12716 * the end of the program - this is used later for offsets
12718 Newxz(RExC_close_parens, RExC_parens_buf_size,
12720 /* we dont know where end op starts yet, so we dont need to
12721 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12724 else if (RExC_npar > RExC_parens_buf_size) {
12725 I32 old_size = RExC_parens_buf_size;
12727 RExC_parens_buf_size *= 2;
12729 Renew(RExC_open_parens, RExC_parens_buf_size,
12731 Zero(RExC_open_parens + old_size,
12732 RExC_parens_buf_size - old_size, regnode_offset);
12734 Renew(RExC_close_parens, RExC_parens_buf_size,
12736 Zero(RExC_close_parens + old_size,
12737 RExC_parens_buf_size - old_size, regnode_offset);
12741 ret = reganode(pRExC_state, OPEN, parno);
12742 if (!RExC_nestroot)
12743 RExC_nestroot = parno;
12744 if (RExC_open_parens && !RExC_open_parens[parno])
12746 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12747 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12748 22, "| |", (int)(depth * 2 + 1), "",
12750 RExC_open_parens[parno]= ret;
12755 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12764 /* Pick up the branches, linking them together. */
12765 segment_parse_start = RExC_parse;
12766 br = regbranch(pRExC_state, &flags, 1, depth+1);
12768 /* branch_len = (paren != 0); */
12771 RETURN_FAIL_ON_RESTART(flags, flagp);
12772 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12774 if (*RExC_parse == '|') {
12775 if (RExC_use_BRANCHJ) {
12776 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12779 reginsert(pRExC_state, BRANCH, br, depth+1);
12783 else if (paren == ':') {
12784 *flagp |= flags&SIMPLE;
12786 if (is_open) { /* Starts with OPEN. */
12787 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12788 REQUIRE_BRANCHJ(flagp, 0);
12791 else if (paren != '?') /* Not Conditional */
12793 *flagp |= flags & (HASWIDTH | POSTPONED);
12795 while (*RExC_parse == '|') {
12796 if (RExC_use_BRANCHJ) {
12799 ender = reganode(pRExC_state, LONGJMP, 0);
12801 /* Append to the previous. */
12802 shut_gcc_up = REGTAIL(pRExC_state,
12803 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
12805 PERL_UNUSED_VAR(shut_gcc_up);
12807 nextchar(pRExC_state);
12808 if (freeze_paren) {
12809 if (RExC_npar > after_freeze)
12810 after_freeze = RExC_npar;
12811 RExC_npar = freeze_paren;
12813 br = regbranch(pRExC_state, &flags, 0, depth+1);
12816 RETURN_FAIL_ON_RESTART(flags, flagp);
12817 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12819 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12820 REQUIRE_BRANCHJ(flagp, 0);
12823 *flagp |= flags & (HASWIDTH | POSTPONED);
12826 if (have_branch || paren != ':') {
12829 /* Make a closing node, and hook it on the end. */
12832 ender = reg_node(pRExC_state, TAIL);
12835 ender = reganode(pRExC_state, CLOSE, parno);
12836 if ( RExC_close_parens ) {
12837 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12838 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12839 22, "| |", (int)(depth * 2 + 1), "",
12840 (IV)parno, ender));
12841 RExC_close_parens[parno]= ender;
12842 if (RExC_nestroot == parno)
12847 ender = reg_node(pRExC_state, SRCLOSE);
12848 RExC_in_script_run = 0;
12850 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
12851 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
12852 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
12853 case '<': /* (?<= ... ) */
12854 case ',': /* (?<! ... ) */
12855 *flagp &= ~HASWIDTH;
12856 ender = reg_node(pRExC_state, LOOKBEHIND_END);
12858 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
12863 *flagp &= ~HASWIDTH;
12865 case 't': /* aTomic */
12867 ender = reg_node(pRExC_state, SUCCEED);
12870 ender = reg_node(pRExC_state, END);
12871 assert(!RExC_end_op); /* there can only be one! */
12872 RExC_end_op = REGNODE_p(ender);
12873 if (RExC_close_parens) {
12874 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12875 "%*s%*s Setting close paren #0 (END) to %zu\n",
12876 22, "| |", (int)(depth * 2 + 1), "",
12879 RExC_close_parens[0]= ender;
12884 DEBUG_PARSE_MSG("lsbr");
12885 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12886 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12887 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12888 SvPV_nolen_const(RExC_mysv1),
12890 SvPV_nolen_const(RExC_mysv2),
12892 (IV)(ender - lastbr)
12895 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12896 REQUIRE_BRANCHJ(flagp, 0);
12900 char is_nothing= 1;
12902 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12904 /* Hook the tails of the branches to the closing node. */
12905 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12906 const U8 op = REGNODE_TYPE(OP(br));
12907 regnode *nextoper = REGNODE_AFTER(br);
12908 if (op == BRANCH) {
12909 if (! REGTAIL_STUDY(pRExC_state,
12910 REGNODE_OFFSET(nextoper),
12913 REQUIRE_BRANCHJ(flagp, 0);
12915 if ( OP(nextoper) != NOTHING
12916 || regnext(nextoper) != REGNODE_p(ender))
12919 else if (op == BRANCHJ) {
12920 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12921 REGNODE_OFFSET(nextoper),
12923 PERL_UNUSED_VAR(shut_gcc_up);
12924 /* for now we always disable this optimisation * /
12925 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
12926 if ( OP(nopr) != NOTHING
12927 || regnext(nopr) != REGNODE_p(ender))
12933 regnode * ret_as_regnode = REGNODE_p(ret);
12934 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
12935 ? regnext(ret_as_regnode)
12938 DEBUG_PARSE_MSG("NADA");
12939 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12940 NULL, pRExC_state);
12941 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12942 NULL, pRExC_state);
12943 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12944 SvPV_nolen_const(RExC_mysv1),
12945 (IV)REG_NODE_NUM(ret_as_regnode),
12946 SvPV_nolen_const(RExC_mysv2),
12952 if (OP(REGNODE_p(ender)) == TAIL) {
12954 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
12957 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12958 OP(opt)= OPTIMIZED;
12959 NEXT_OFF(br)= REGNODE_p(ender) - br;
12967 /* Even/odd or x=don't care: 010101x10x */
12968 static const char parens[] = "=!aA<,>Bbt";
12969 /* flag below is set to 0 up through 'A'; 1 for larger */
12971 if (paren && (p = strchr(parens, paren))) {
12972 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12973 int flag = (p - parens) > 3;
12975 if (paren == '>' || paren == 't') {
12976 node = SUSPEND, flag = 0;
12979 reginsert(pRExC_state, node, ret, depth+1);
12980 FLAGS(REGNODE_p(ret)) = flag;
12981 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12983 REQUIRE_BRANCHJ(flagp, 0);
12988 /* Check for proper termination. */
12990 /* restore original flags, but keep (?p) and, if we've encountered
12991 * something in the parse that changes /d rules into /u, keep the /u */
12992 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12993 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12994 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12996 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12997 RExC_parse_set(reg_parse_start);
12998 vFAIL("Unmatched (");
13000 nextchar(pRExC_state);
13002 else if (!paren && RExC_parse < RExC_end) {
13003 if (*RExC_parse == ')') {
13004 RExC_parse_inc_by(1);
13005 vFAIL("Unmatched )");
13008 FAIL("Junk on end of regexp"); /* "Can't happen". */
13009 NOT_REACHED; /* NOTREACHED */
13012 if (after_freeze > RExC_npar)
13013 RExC_npar = after_freeze;
13015 RExC_in_lookaround = was_in_lookaround;
13021 - regbranch - one alternative of an | operator
13023 * Implements the concatenation operator.
13025 * On success, returns the offset at which any next node should be placed into
13026 * the regex engine program being compiled.
13028 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
13029 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
13032 STATIC regnode_offset
13033 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
13035 regnode_offset ret;
13036 regnode_offset chain = 0;
13037 regnode_offset latest;
13038 I32 flags = 0, c = 0;
13039 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13041 PERL_ARGS_ASSERT_REGBRANCH;
13043 DEBUG_PARSE("brnc");
13048 if (RExC_use_BRANCHJ)
13049 ret = reganode(pRExC_state, BRANCHJ, 0);
13051 ret = reg_node(pRExC_state, BRANCH);
13055 *flagp = 0; /* Initialize. */
13057 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13058 FALSE /* Don't force to /x */ );
13059 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
13060 flags &= ~TRYAGAIN;
13061 latest = regpiece(pRExC_state, &flags, depth+1);
13063 if (flags & TRYAGAIN)
13065 RETURN_FAIL_ON_RESTART(flags, flagp);
13066 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
13070 *flagp |= flags&(HASWIDTH|POSTPONED);
13072 /* FIXME adding one for every branch after the first is probably
13073 * excessive now we have TRIE support. (hv) */
13075 if (! REGTAIL(pRExC_state, chain, latest)) {
13076 /* XXX We could just redo this branch, but figuring out what
13077 * bookkeeping needs to be reset is a pain, and it's likely
13078 * that other branches that goto END will also be too large */
13079 REQUIRE_BRANCHJ(flagp, 0);
13085 if (chain == 0) { /* Loop ran zero times. */
13086 chain = reg_node(pRExC_state, NOTHING);
13091 *flagp |= flags&SIMPLE;
13103 #ifndef PERL_IN_XSUB_RE
13105 Perl_regcurly(const char *s, const char *e, const char * result[5])
13107 /* This function matches a {m,n} quantifier. When called with a NULL final
13108 * argument, it simply parses the input from 's' up through 'e-1', and
13109 * returns a boolean as to whether or not this input is syntactically a
13110 * {m,n} quantifier.
13112 * When called with a non-NULL final parameter, and when the function
13113 * returns TRUE, it additionally stores information into the array
13114 * specified by that parameter about what it found in the parse. The
13115 * parameter must be a pointer into a 5 element array of 'const char *'
13116 * elements. The returned information is as follows:
13117 * result[RBRACE] points to the closing brace
13118 * result[MIN_S] points to the first byte of the lower bound
13119 * result[MIN_E] points to one beyond the final byte of the lower bound
13120 * result[MAX_S] points to the first byte of the upper bound
13121 * result[MAX_E] points to one beyond the final byte of the upper bound
13123 * If the quantifier is of the form {m,} (meaning an infinite upper
13124 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
13125 * to is irrelevant, just that it's the same place
13127 * If instead the quantifier is of the form {m} there is actually only
13128 * one bound, and both the upper and lower result[] elements are set to
13131 * This function checks only for syntactic validity; it leaves checking for
13132 * semantic validity and raising any diagnostics to the caller. This
13133 * function is called in multiple places to check for syntax, but only from
13134 * one for semantics. It makes it as simple as possible for the
13135 * syntax-only callers, while furnishing just enough information for the
13139 const char * min_start = NULL;
13140 const char * max_start = NULL;
13141 const char * min_end = NULL;
13142 const char * max_end = NULL;
13144 bool has_comma = FALSE;
13146 PERL_ARGS_ASSERT_REGCURLY;
13148 if (s >= e || *s++ != '{')
13151 while (s < e && isBLANK(*s)) {
13159 } while (s < e && isDIGIT(*s));
13163 while (s < e && isBLANK(*s)) {
13171 while (s < e && isBLANK(*s)) {
13179 } while (s < e && isDIGIT(*s));
13184 while (s < e && isBLANK(*s)) {
13187 /* Need at least one number */
13188 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
13194 result[RBRACE] = s;
13196 result[MIN_S] = min_start;
13197 result[MIN_E] = min_end;
13200 result[MAX_S] = max_start;
13201 result[MAX_E] = max_end;
13204 /* Having no value after the comma is signalled by setting
13205 * start and end to the same value. What that value is isn't
13206 * relevant; NULL is chosen simply because it will fail if the
13207 * caller mistakenly uses it */
13208 result[MAX_S] = result[MAX_E] = NULL;
13211 else { /* No comma means lower and upper bounds are the same */
13212 result[MAX_S] = min_start;
13213 result[MAX_E] = min_end;
13222 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
13223 const char * start, const char * end)
13225 /* This is a helper function for regpiece() to compute, given the
13226 * quantifier {m,n}, the value of either m or n, based on the starting
13227 * position 'start' in the string, through the byte 'end-1', returning it
13228 * if valid, and failing appropriately if not. It knows the restrictions
13229 * imposed on quantifier values */
13232 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
13234 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
13236 if (grok_atoUV(start, &uv, &end)) {
13237 if (uv < REG_INFTY) { /* A valid, small-enough number */
13241 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
13242 leading zeros or overflow */
13243 RExC_parse_set((char * ) end);
13245 /* Perhaps too generic a msg for what is only failure from having
13246 * leading zeros, but this is how it's always behaved. */
13247 vFAIL("Invalid quantifier in {,}");
13248 NOT_REACHED; /*NOTREACHED*/
13251 /* Here, found a quantifier, but was too large; either it overflowed or was
13252 * too big a legal number */
13253 RExC_parse_set((char * ) end);
13254 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
13256 NOT_REACHED; /*NOTREACHED*/
13257 return U32_MAX; /* Perhaps some compilers will be expecting a return */
13261 - regpiece - something followed by possible quantifier * + ? {n,m}
13263 * Note that the branching code sequences used for ? and the general cases
13264 * of * and + are somewhat optimized: they use the same NOTHING node as
13265 * both the endmarker for their branch list and the body of the last branch.
13266 * It might seem that this node could be dispensed with entirely, but the
13267 * endmarker role is not redundant.
13269 * On success, returns the offset at which any next node should be placed into
13270 * the regex engine program being compiled.
13272 * Returns 0 otherwise, with *flagp set to indicate why:
13273 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
13274 * RESTART_PARSE if the parse needs to be restarted, or'd with
13275 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
13277 STATIC regnode_offset
13278 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13280 regnode_offset ret;
13283 const char * const origparse = RExC_parse;
13285 I32 max = REG_INFTY;
13287 /* Save the original in case we change the emitted regop to a FAIL. */
13288 const regnode_offset orig_emit = RExC_emit;
13290 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13292 PERL_ARGS_ASSERT_REGPIECE;
13294 DEBUG_PARSE("piec");
13296 ret = regatom(pRExC_state, &flags, depth+1);
13298 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
13299 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
13304 const char * regcurly_return[5];
13307 nextchar(pRExC_state);
13312 nextchar(pRExC_state);
13317 nextchar(pRExC_state);
13321 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
13322 to determine which */
13323 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
13324 const char * min_start = regcurly_return[MIN_S];
13325 const char * min_end = regcurly_return[MIN_E];
13326 const char * max_start = regcurly_return[MAX_S];
13327 const char * max_end = regcurly_return[MAX_E];
13330 min = get_quantifier_value(pRExC_state, min_start, min_end);
13336 if (max_start == max_end) { /* Was of the form {m,} */
13339 else if (max_start == min_start) { /* Was of the form {m} */
13342 else { /* Was of the form {m,n} */
13343 assert(max_end >= max_start);
13345 max = get_quantifier_value(pRExC_state, max_start, max_end);
13348 RExC_parse_set((char *) regcurly_return[RBRACE]);
13349 nextchar(pRExC_state);
13351 if (max < min) { /* If can't match, warn and optimize to fail
13353 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
13354 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
13355 NEXT_OFF(REGNODE_p(orig_emit)) =
13356 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
13359 else if (min == max && *RExC_parse == '?') {
13360 ckWARN2reg(RExC_parse + 1,
13361 "Useless use of greediness modifier '%c'",
13366 } /* End of is {m,n} */
13368 /* Here was a '{', but what followed it didn't form a quantifier. */
13374 NOT_REACHED; /*NOTREACHED*/
13377 /* Here we have a quantifier, and have calculated 'min' and 'max'.
13379 * Check and possibly adjust a zero width operand */
13380 if (! (flags & (HASWIDTH|POSTPONED))) {
13381 if (max > REG_INFTY/3) {
13382 ckWARN2reg(RExC_parse,
13383 "%" UTF8f " matches null string many times",
13384 UTF8fARG(UTF, (RExC_parse >= origparse
13385 ? RExC_parse - origparse
13390 /* There's no point in trying to match something 0 length more than
13391 * once except for extra side effects, which we don't have here since
13401 /* If this is a code block pass it up */
13402 *flagp |= (flags & POSTPONED);
13405 *flagp |= (flags & HASWIDTH);
13406 if (max == REG_INFTY)
13407 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
13410 /* 'SIMPLE' operands don't require full generality */
13411 if ((flags&SIMPLE)) {
13412 if (max == REG_INFTY) {
13414 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
13415 goto min0_maxINF_wildcard_forbidden;
13418 reginsert(pRExC_state, STAR, ret, depth+1);
13422 else if (min == 1) {
13423 reginsert(pRExC_state, PLUS, ret, depth+1);
13429 /* Here, SIMPLE, but not the '*' and '+' special cases */
13431 MARK_NAUGHTY_EXP(2, 2);
13432 reginsert(pRExC_state, CURLY, ret, depth+1);
13434 else { /* not SIMPLE */
13435 const regnode_offset w = reg_node(pRExC_state, WHILEM);
13437 FLAGS(REGNODE_p(w)) = 0;
13438 if (! REGTAIL(pRExC_state, ret, w)) {
13439 REQUIRE_BRANCHJ(flagp, 0);
13441 if (RExC_use_BRANCHJ) {
13442 reginsert(pRExC_state, LONGJMP, ret, depth+1);
13443 reginsert(pRExC_state, NOTHING, ret, depth+1);
13444 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
13446 reginsert(pRExC_state, CURLYX, ret, depth+1);
13448 if (RExC_use_BRANCHJ)
13449 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
13451 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
13454 REQUIRE_BRANCHJ(flagp, 0);
13456 RExC_whilem_seen++;
13457 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
13460 /* Finish up the CURLY/CURLYX case */
13461 FLAGS(REGNODE_p(ret)) = 0;
13463 ARG1_SET(REGNODE_p(ret), (U16)min);
13464 ARG2_SET(REGNODE_p(ret), (U16)max);
13468 /* Process any greediness modifiers */
13469 if (*RExC_parse == '?') {
13470 nextchar(pRExC_state);
13471 reginsert(pRExC_state, MINMOD, ret, depth+1);
13472 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
13473 REQUIRE_BRANCHJ(flagp, 0);
13476 else if (*RExC_parse == '+') {
13477 regnode_offset ender;
13478 nextchar(pRExC_state);
13479 ender = reg_node(pRExC_state, SUCCEED);
13480 if (! REGTAIL(pRExC_state, ret, ender)) {
13481 REQUIRE_BRANCHJ(flagp, 0);
13483 reginsert(pRExC_state, SUSPEND, ret, depth+1);
13484 ender = reg_node(pRExC_state, TAIL);
13485 if (! REGTAIL(pRExC_state, ret, ender)) {
13486 REQUIRE_BRANCHJ(flagp, 0);
13490 /* Forbid extra quantifiers */
13491 if (isQUANTIFIER(RExC_parse, RExC_end)) {
13492 RExC_parse_inc_by(1);
13493 vFAIL("Nested quantifiers");
13498 min0_maxINF_wildcard_forbidden:
13500 /* Here we are in a wildcard match, and the minimum match length is 0, and
13501 * the max could be infinity. This is currently forbidden. The only
13502 * reason is to make it harder to write patterns that take a long long time
13503 * to halt, and because the use of this construct isn't necessary in
13504 * matching Unicode property values */
13505 RExC_parse_inc_by(1);
13506 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
13507 subpatterns in regex; marked by <-- HERE in m/%s/
13509 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
13512 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
13513 * legal at all in wildcards, so can't get this far */
13515 NOT_REACHED; /*NOTREACHED*/
13519 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
13520 regnode_offset * node_p,
13528 /* This routine teases apart the various meanings of \N and returns
13529 * accordingly. The input parameters constrain which meaning(s) is/are valid
13530 * in the current context.
13532 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
13534 * If <code_point_p> is not NULL, the context is expecting the result to be a
13535 * single code point. If this \N instance turns out to a single code point,
13536 * the function returns TRUE and sets *code_point_p to that code point.
13538 * If <node_p> is not NULL, the context is expecting the result to be one of
13539 * the things representable by a regnode. If this \N instance turns out to be
13540 * one such, the function generates the regnode, returns TRUE and sets *node_p
13541 * to point to the offset of that regnode into the regex engine program being
13544 * If this instance of \N isn't legal in any context, this function will
13545 * generate a fatal error and not return.
13547 * On input, RExC_parse should point to the first char following the \N at the
13548 * time of the call. On successful return, RExC_parse will have been updated
13549 * to point to just after the sequence identified by this routine. Also
13550 * *flagp has been updated as needed.
13552 * When there is some problem with the current context and this \N instance,
13553 * the function returns FALSE, without advancing RExC_parse, nor setting
13554 * *node_p, nor *code_point_p, nor *flagp.
13556 * If <cp_count> is not NULL, the caller wants to know the length (in code
13557 * points) that this \N sequence matches. This is set, and the input is
13558 * parsed for errors, even if the function returns FALSE, as detailed below.
13560 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
13562 * Probably the most common case is for the \N to specify a single code point.
13563 * *cp_count will be set to 1, and *code_point_p will be set to that code
13566 * Another possibility is for the input to be an empty \N{}. This is no
13567 * longer accepted, and will generate a fatal error.
13569 * Another possibility is for a custom charnames handler to be in effect which
13570 * translates the input name to an empty string. *cp_count will be set to 0.
13571 * *node_p will be set to a generated NOTHING node.
13573 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
13574 * set to 0. *node_p will be set to a generated REG_ANY node.
13576 * The fifth possibility is that \N resolves to a sequence of more than one
13577 * code points. *cp_count will be set to the number of code points in the
13578 * sequence. *node_p will be set to a generated node returned by this
13579 * function calling S_reg().
13581 * The sixth and final possibility is that it is premature to be calling this
13582 * function; the parse needs to be restarted. This can happen when this
13583 * changes from /d to /u rules, or when the pattern needs to be upgraded to
13584 * UTF-8. The latter occurs only when the fifth possibility would otherwise
13585 * be in effect, and is because one of those code points requires the pattern
13586 * to be recompiled as UTF-8. The function returns FALSE, and sets the
13587 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
13588 * happens, the caller needs to desist from continuing parsing, and return
13589 * this information to its caller. This is not set for when there is only one
13590 * code point, as this can be called as part of an ANYOF node, and they can
13591 * store above-Latin1 code points without the pattern having to be in UTF-8.
13593 * For non-single-quoted regexes, the tokenizer has resolved character and
13594 * sequence names inside \N{...} into their Unicode values, normalizing the
13595 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
13596 * hex-represented code points in the sequence. This is done there because
13597 * the names can vary based on what charnames pragma is in scope at the time,
13598 * so we need a way to take a snapshot of what they resolve to at the time of
13599 * the original parse. [perl #56444].
13601 * That parsing is skipped for single-quoted regexes, so here we may get
13602 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
13603 * like '\N{U+41}', that code point is Unicode, and has to be translated into
13604 * the native character set for non-ASCII platforms. The other possibilities
13605 * are already native, so no translation is done. */
13607 char * endbrace; /* points to '}' following the name */
13608 char * e; /* points to final non-blank before endbrace */
13609 char* p = RExC_parse; /* Temporary */
13611 SV * substitute_parse = NULL;
13616 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13618 PERL_ARGS_ASSERT_GROK_BSLASH_N;
13620 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
13621 assert(! (node_p && cp_count)); /* At most 1 should be set */
13623 if (cp_count) { /* Initialize return for the most common case */
13627 /* The [^\n] meaning of \N ignores spaces and comments under the /x
13628 * modifier. The other meanings do not (except blanks adjacent to and
13629 * within the braces), so use a temporary until we find out which we are
13630 * being called with */
13631 skip_to_be_ignored_text(pRExC_state, &p,
13632 FALSE /* Don't force to /x */ );
13634 /* Disambiguate between \N meaning a named character versus \N meaning
13635 * [^\n]. The latter is assumed when the {...} following the \N is a legal
13636 * quantifier, or if there is no '{' at all */
13637 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
13647 *node_p = reg_node(pRExC_state, REG_ANY);
13648 *flagp |= HASWIDTH|SIMPLE;
13653 /* The test above made sure that the next real character is a '{', but
13654 * under the /x modifier, it could be separated by space (or a comment and
13655 * \n) and this is not allowed (for consistency with \x{...} and the
13656 * tokenizer handling of \N{NAME}). */
13657 if (*RExC_parse != '{') {
13658 vFAIL("Missing braces on \\N{}");
13661 RExC_parse_inc_by(1); /* Skip past the '{' */
13663 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13664 if (! endbrace) { /* no trailing brace */
13665 vFAIL2("Missing right brace on \\%c{}", 'N');
13668 /* Here, we have decided it should be a named character or sequence. These
13669 * imply Unicode semantics */
13670 REQUIRE_UNI_RULES(flagp, FALSE);
13672 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13673 * nothing at all (not allowed under strict) */
13674 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13675 RExC_parse_set(endbrace);
13677 RExC_parse_inc_by(1); /* Position after the "}" */
13678 vFAIL("Zero length \\N{}");
13684 nextchar(pRExC_state);
13689 *node_p = reg_node(pRExC_state, NOTHING);
13693 while (isBLANK(*RExC_parse)) {
13694 RExC_parse_inc_by(1);
13698 while (RExC_parse < e && isBLANK(*(e-1))) {
13702 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13704 /* Here, the name isn't of the form U+.... This can happen if the
13705 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13706 * is the time to find out what the name means */
13708 const STRLEN name_len = e - RExC_parse;
13709 SV * value_sv; /* What does this name evaluate to */
13711 const U8 * value; /* string of name's value */
13712 STRLEN value_len; /* and its length */
13714 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13715 * toke.c, and their values. Make sure is initialized */
13716 if (! RExC_unlexed_names) {
13717 RExC_unlexed_names = newHV();
13720 /* If we have already seen this name in this pattern, use that. This
13721 * allows us to only call the charnames handler once per name per
13722 * pattern. A broken or malicious handler could return something
13723 * different each time, which could cause the results to vary depending
13724 * on if something gets added or subtracted from the pattern that
13725 * causes the number of passes to change, for example */
13726 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13729 value_sv = *value_svp;
13731 else { /* Otherwise we have to go out and get the name */
13732 const char * error_msg = NULL;
13733 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
13737 RExC_parse_set(endbrace);
13741 /* If no error message, should have gotten a valid return */
13744 /* Save the name's meaning for later use */
13745 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13748 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13752 /* Here, we have the value the name evaluates to in 'value_sv' */
13753 value = (U8 *) SvPV(value_sv, value_len);
13755 /* See if the result is one code point vs 0 or multiple */
13756 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13760 /* Here, exactly one code point. If that isn't what is wanted,
13762 if (! code_point_p) {
13767 /* Convert from string to numeric code point */
13768 *code_point_p = (SvUTF8(value_sv))
13769 ? valid_utf8_to_uvchr(value, NULL)
13772 /* Have parsed this entire single code point \N{...}. *cp_count
13773 * has already been set to 1, so don't do it again. */
13774 RExC_parse_set(endbrace);
13775 nextchar(pRExC_state);
13777 } /* End of is a single code point */
13779 /* Count the code points, if caller desires. The API says to do this
13780 * even if we will later return FALSE */
13784 *cp_count = (SvUTF8(value_sv))
13785 ? utf8_length(value, value + value_len)
13789 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13790 * But don't back the pointer up if the caller wants to know how many
13791 * code points there are (they need to handle it themselves in this
13800 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13801 * reg recursively to parse it. That way, it retains its atomicness,
13802 * while not having to worry about any special handling that some code
13803 * points may have. */
13805 substitute_parse = newSVpvs("?:");
13806 sv_catsv(substitute_parse, value_sv);
13807 sv_catpv(substitute_parse, ")");
13809 /* The value should already be native, so no need to convert on EBCDIC
13811 assert(! RExC_recode_x_to_native);
13814 else { /* \N{U+...} */
13815 Size_t count = 0; /* code point count kept internally */
13817 /* We can get to here when the input is \N{U+...} or when toke.c has
13818 * converted a name to the \N{U+...} form. This include changing a
13819 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13821 RExC_parse_inc_by(2); /* Skip past the 'U+' */
13823 /* Code points are separated by dots. The '}' terminates the whole
13826 do { /* Loop until the ending brace */
13827 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13828 | PERL_SCAN_SILENT_ILLDIGIT
13829 | PERL_SCAN_NOTIFY_ILLDIGIT
13830 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13831 | PERL_SCAN_DISALLOW_PREFIX;
13832 STRLEN len = e - RExC_parse;
13834 char * start_digit = RExC_parse;
13835 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13838 RExC_parse_inc_by(1);
13840 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13843 RExC_parse_inc_by(len);
13845 if (cp > MAX_LEGAL_CP) {
13846 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13849 if (RExC_parse >= e) { /* Got to the closing '}' */
13854 /* Here, is a single code point; fail if doesn't want that */
13855 if (! code_point_p) {
13860 /* A single code point is easy to handle; just return it */
13861 *code_point_p = UNI_TO_NATIVE(cp);
13862 RExC_parse_set(endbrace);
13863 nextchar(pRExC_state);
13867 /* Here, the parse stopped bfore the ending brace. This is legal
13868 * only if that character is a dot separating code points, like a
13869 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13870 * So the next character must be a dot (and the one after that
13871 * can't be the ending brace, or we'd have something like
13874 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
13875 /*point to after 1st invalid */
13876 RExC_parse_incf(RExC_orig_utf8);
13877 /*Guard against malformed utf8*/
13878 RExC_parse_set(MIN(e, RExC_parse));
13882 /* Here, looks like its really a multiple character sequence. Fail
13883 * if that's not what the caller wants. But continue with counting
13884 * and error checking if they still want a count */
13885 if (! node_p && ! cp_count) {
13889 /* What is done here is to convert this to a sub-pattern of the
13890 * form \x{char1}\x{char2}... and then call reg recursively to
13891 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13892 * atomicness, while not having to worry about special handling
13893 * that some code points may have. We don't create a subpattern,
13894 * but go through the motions of code point counting and error
13895 * checking, if the caller doesn't want a node returned. */
13897 if (node_p && ! substitute_parse) {
13898 substitute_parse = newSVpvs("?:");
13904 /* Convert to notation the rest of the code understands */
13905 sv_catpvs(substitute_parse, "\\x{");
13906 sv_catpvn(substitute_parse, start_digit,
13907 RExC_parse - start_digit);
13908 sv_catpvs(substitute_parse, "}");
13911 /* Move to after the dot (or ending brace the final time through.)
13913 RExC_parse_inc_by(1);
13916 } while (RExC_parse < e);
13918 if (! node_p) { /* Doesn't want the node */
13925 sv_catpvs(substitute_parse, ")");
13927 /* The values are Unicode, and therefore have to be converted to native
13928 * on a non-Unicode (meaning non-ASCII) platform. */
13929 SET_recode_x_to_native(1);
13932 /* Here, we have the string the name evaluates to, ready to be parsed,
13933 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13934 * constructs. This can be called from within a substitute parse already.
13935 * The error reporting mechanism doesn't work for 2 levels of this, but the
13936 * code above has validated this new construct, so there should be no
13937 * errors generated by the below. And this isn't an exact copy, so the
13938 * mechanism to seamlessly deal with this won't work, so turn off warnings
13940 save_start = RExC_start;
13941 orig_end = RExC_end;
13943 RExC_start = SvPVX(substitute_parse);
13944 RExC_parse_set(RExC_start);
13945 RExC_end = RExC_parse + SvCUR(substitute_parse);
13946 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13948 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13950 /* Restore the saved values */
13952 RExC_start = save_start;
13953 RExC_parse_set(endbrace);
13954 RExC_end = orig_end;
13955 SET_recode_x_to_native(0);
13957 SvREFCNT_dec_NN(substitute_parse);
13960 RETURN_FAIL_ON_RESTART(flags, flagp);
13961 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13964 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13966 nextchar(pRExC_state);
13973 S_compute_EXACTish(RExC_state_t *pRExC_state)
13977 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13985 op = get_regex_charset(RExC_flags);
13986 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13987 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13988 been, so there is no hole */
13991 return op + EXACTF;
13994 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13995 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13998 S_backref_value(char *p, char *e)
14000 const char* endptr = e;
14002 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
14008 #define REGNODE_GUTS(state,op,extra_size) \
14009 regnode_guts_debug(state,op,extra_size)
14011 #define REGNODE_GUTS(state,op,extra_size) \
14012 regnode_guts(state,extra_size)
14017 - regatom - the lowest level
14019 Try to identify anything special at the start of the current parse position.
14020 If there is, then handle it as required. This may involve generating a
14021 single regop, such as for an assertion; or it may involve recursing, such as
14022 to handle a () structure.
14024 If the string doesn't start with something special then we gobble up
14025 as much literal text as we can. If we encounter a quantifier, we have to
14026 back off the final literal character, as that quantifier applies to just it
14027 and not to the whole string of literals.
14029 Once we have been able to handle whatever type of thing started the
14030 sequence, we return the offset into the regex engine program being compiled
14031 at which any next regnode should be placed.
14033 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
14034 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
14035 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
14036 Otherwise does not return 0.
14038 Note: we have to be careful with escapes, as they can be both literal
14039 and special, and in the case of \10 and friends, context determines which.
14041 A summary of the code structure is:
14043 switch (first_byte) {
14044 cases for each special:
14045 handle this special;
14048 switch (2nd byte) {
14049 cases for each unambiguous special:
14050 handle this special;
14052 cases for each ambigous special/literal:
14054 if (special) handle here
14056 default: // unambiguously literal:
14059 default: // is a literal char
14062 create EXACTish node for literal;
14063 while (more input and node isn't full) {
14064 switch (input_byte) {
14065 cases for each special;
14066 make sure parse pointer is set so that the next call to
14067 regatom will see this special first
14068 goto loopdone; // EXACTish node terminated by prev. char
14070 append char to EXACTISH node;
14072 get next input byte;
14076 return the generated node;
14078 Specifically there are two separate switches for handling
14079 escape sequences, with the one for handling literal escapes requiring
14080 a dummy entry for all of the special escapes that are actually handled
14085 STATIC regnode_offset
14086 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
14088 regnode_offset ret = 0;
14090 char *atom_parse_start;
14094 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14096 *flagp = 0; /* Initialize. */
14098 DEBUG_PARSE("atom");
14100 PERL_ARGS_ASSERT_REGATOM;
14103 atom_parse_start = RExC_parse;
14104 assert(RExC_parse < RExC_end);
14105 switch ((U8)*RExC_parse) {
14107 RExC_seen_zerolen++;
14108 nextchar(pRExC_state);
14109 if (RExC_flags & RXf_PMf_MULTILINE)
14110 ret = reg_node(pRExC_state, MBOL);
14112 ret = reg_node(pRExC_state, SBOL);
14115 nextchar(pRExC_state);
14117 RExC_seen_zerolen++;
14118 if (RExC_flags & RXf_PMf_MULTILINE)
14119 ret = reg_node(pRExC_state, MEOL);
14121 ret = reg_node(pRExC_state, SEOL);
14124 nextchar(pRExC_state);
14125 if (RExC_flags & RXf_PMf_SINGLELINE)
14126 ret = reg_node(pRExC_state, SANY);
14128 ret = reg_node(pRExC_state, REG_ANY);
14129 *flagp |= HASWIDTH|SIMPLE;
14134 char * const cc_parse_start = ++RExC_parse;
14135 ret = regclass(pRExC_state, flagp, depth+1,
14136 FALSE, /* means parse the whole char class */
14137 TRUE, /* allow multi-char folds */
14138 FALSE, /* don't silence non-portable warnings. */
14139 (bool) RExC_strict,
14140 TRUE, /* Allow an optimized regnode result */
14143 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14144 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
14147 if (*RExC_parse != ']') {
14148 RExC_parse_set(cc_parse_start);
14149 vFAIL("Unmatched [");
14151 nextchar(pRExC_state);
14155 nextchar(pRExC_state);
14156 ret = reg(pRExC_state, 2, &flags, depth+1);
14158 if (flags & TRYAGAIN) {
14159 if (RExC_parse >= RExC_end) {
14160 /* Make parent create an empty node if needed. */
14161 *flagp |= TRYAGAIN;
14166 RETURN_FAIL_ON_RESTART(flags, flagp);
14167 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
14170 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
14174 if (flags & TRYAGAIN) {
14175 *flagp |= TRYAGAIN;
14178 vFAIL("Internal urp");
14179 /* Supposed to be caught earlier. */
14184 RExC_parse_inc_by(1);
14185 vFAIL("Quantifier follows nothing");
14190 This switch handles escape sequences that resolve to some kind
14191 of special regop and not to literal text. Escape sequences that
14192 resolve to literal text are handled below in the switch marked
14195 Every entry in this switch *must* have a corresponding entry
14196 in the literal escape switch. However, the opposite is not
14197 required, as the default for this switch is to jump to the
14198 literal text handling code.
14200 RExC_parse_inc_by(1);
14201 switch ((U8)*RExC_parse) {
14202 /* Special Escapes */
14204 RExC_seen_zerolen++;
14205 /* Under wildcards, this is changed to match \n; should be
14206 * invisible to the user, as they have to compile under /m */
14207 if (RExC_pm_flags & PMf_WILDCARD) {
14208 ret = reg_node(pRExC_state, MBOL);
14211 ret = reg_node(pRExC_state, SBOL);
14212 /* SBOL is shared with /^/ so we set the flags so we can tell
14213 * /\A/ from /^/ in split. */
14214 FLAGS(REGNODE_p(ret)) = 1;
14216 goto finish_meta_pat;
14218 if (RExC_pm_flags & PMf_WILDCARD) {
14219 RExC_parse_inc_by(1);
14220 /* diag_listed_as: Use of %s is not allowed in Unicode property
14221 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
14223 vFAIL("Use of '\\G' is not allowed in Unicode property"
14224 " wildcard subpatterns");
14226 ret = reg_node(pRExC_state, GPOS);
14227 RExC_seen |= REG_GPOS_SEEN;
14228 goto finish_meta_pat;
14230 if (!RExC_in_lookaround) {
14231 RExC_seen_zerolen++;
14232 ret = reg_node(pRExC_state, KEEPS);
14233 /* XXX:dmq : disabling in-place substitution seems to
14234 * be necessary here to avoid cases of memory corruption, as
14235 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
14237 RExC_seen |= REG_LOOKBEHIND_SEEN;
14238 goto finish_meta_pat;
14241 ++RExC_parse; /* advance past the 'K' */
14242 vFAIL("\\K not permitted in lookahead/lookbehind");
14245 if (RExC_pm_flags & PMf_WILDCARD) {
14246 /* See comment under \A above */
14247 ret = reg_node(pRExC_state, MEOL);
14250 ret = reg_node(pRExC_state, SEOL);
14252 RExC_seen_zerolen++; /* Do not optimize RE away */
14253 goto finish_meta_pat;
14255 if (RExC_pm_flags & PMf_WILDCARD) {
14256 /* See comment under \A above */
14257 ret = reg_node(pRExC_state, MEOL);
14260 ret = reg_node(pRExC_state, EOS);
14262 RExC_seen_zerolen++; /* Do not optimize RE away */
14263 goto finish_meta_pat;
14265 vFAIL("\\C no longer supported");
14267 ret = reg_node(pRExC_state, CLUMP);
14268 *flagp |= HASWIDTH;
14269 goto finish_meta_pat;
14277 regex_charset charset = get_regex_charset(RExC_flags);
14279 RExC_seen_zerolen++;
14280 RExC_seen |= REG_LOOKBEHIND_SEEN;
14281 op = BOUND + charset;
14283 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
14284 flags = TRADITIONAL_BOUND;
14285 if (op > BOUNDA) { /* /aa is same as /a */
14291 char name = *RExC_parse;
14292 char * endbrace = (char *) memchr(RExC_parse, '}',
14293 RExC_end - RExC_parse);
14294 char * e = endbrace;
14296 RExC_parse_inc_by(2);
14299 vFAIL2("Missing right brace on \\%c{}", name);
14302 while (isBLANK(*RExC_parse)) {
14303 RExC_parse_inc_by(1);
14306 while (RExC_parse < e && isBLANK(*(e - 1))) {
14310 if (e == RExC_parse) {
14311 RExC_parse_set(endbrace + 1); /* After the '}' */
14312 vFAIL2("Empty \\%c{}", name);
14315 length = e - RExC_parse;
14317 switch (*RExC_parse) {
14320 && (memNEs(RExC_parse + 1, length - 1, "cb")))
14322 goto bad_bound_type;
14327 if (length != 2 || *(RExC_parse + 1) != 'b') {
14328 goto bad_bound_type;
14333 if (length != 2 || *(RExC_parse + 1) != 'b') {
14334 goto bad_bound_type;
14339 if (length != 2 || *(RExC_parse + 1) != 'b') {
14340 goto bad_bound_type;
14348 "'%" UTF8f "' is an unknown bound type",
14349 UTF8fARG(UTF, length, e - length));
14350 NOT_REACHED; /*NOTREACHED*/
14352 RExC_parse_set(endbrace);
14353 REQUIRE_UNI_RULES(flagp, 0);
14358 else if (op >= BOUNDA) { /* /aa is same as /a */
14362 /* Don't have to worry about UTF-8, in this message because
14363 * to get here the contents of the \b must be ASCII */
14364 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
14365 "Using /u for '%.*s' instead of /%s",
14367 endbrace - length + 1,
14368 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
14369 ? ASCII_RESTRICT_PAT_MODS
14370 : ASCII_MORE_RESTRICT_PAT_MODS);
14375 RExC_seen_d_op = TRUE;
14377 else if (op == BOUNDL) {
14378 RExC_contains_locale = 1;
14382 op += NBOUND - BOUND;
14385 ret = reg_node(pRExC_state, op);
14386 FLAGS(REGNODE_p(ret)) = flags;
14388 goto finish_meta_pat;
14392 ret = reg_node(pRExC_state, LNBREAK);
14393 *flagp |= HASWIDTH|SIMPLE;
14394 goto finish_meta_pat;
14408 /* These all have the same meaning inside [brackets], and it knows
14409 * how to do the best optimizations for them. So, pretend we found
14410 * these within brackets, and let it do the work */
14413 ret = regclass(pRExC_state, flagp, depth+1,
14414 TRUE, /* means just parse this element */
14415 FALSE, /* don't allow multi-char folds */
14416 FALSE, /* don't silence non-portable warnings. It
14417 would be a bug if these returned
14419 (bool) RExC_strict,
14420 TRUE, /* Allow an optimized regnode result */
14422 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14423 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
14424 * multi-char folds are allowed. */
14426 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
14429 RExC_parse--; /* regclass() leaves this one too far ahead */
14432 /* The escapes above that don't take a parameter can't be
14433 * followed by a '{'. But 'pX', 'p{foo}' and
14434 * correspondingly 'P' can be */
14435 if ( RExC_parse - atom_parse_start == 1
14436 && UCHARAT(RExC_parse + 1) == '{'
14437 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
14439 RExC_parse_inc_by(2);
14440 vFAIL("Unescaped left brace in regex is illegal here");
14442 nextchar(pRExC_state);
14445 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
14446 * \N{...} evaluates to a sequence of more than one code points).
14447 * The function call below returns a regnode, which is our result.
14448 * The parameters cause it to fail if the \N{} evaluates to a
14449 * single code point; we handle those like any other literal. The
14450 * reason that the multicharacter case is handled here and not as
14451 * part of the EXACtish code is because of quantifiers. In
14452 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
14453 * this way makes that Just Happen. dmq.
14454 * join_exact() will join this up with adjacent EXACTish nodes
14455 * later on, if appropriate. */
14457 if (grok_bslash_N(pRExC_state,
14458 &ret, /* Want a regnode returned */
14459 NULL, /* Fail if evaluates to a single code
14461 NULL, /* Don't need a count of how many code
14470 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14472 /* Here, evaluates to a single code point. Go get that */
14473 RExC_parse_set(atom_parse_start);
14476 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
14477 parse_named_seq: /* Also handle non-numeric \g{...} */
14480 if ( RExC_parse >= RExC_end - 1
14481 || (( ch = RExC_parse[1]) != '<'
14485 RExC_parse_inc_by(1);
14486 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
14487 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
14489 RExC_parse_inc_by(2);
14491 while (isBLANK(*RExC_parse)) {
14492 RExC_parse_inc_by(1);
14495 ret = handle_named_backref(pRExC_state,
14507 case '1': case '2': case '3': case '4':
14508 case '5': case '6': case '7': case '8': case '9':
14511 char * endbrace = NULL;
14512 char * s = RExC_parse;
14513 char * e = RExC_end;
14520 endbrace = (char *) memchr(s, '}', RExC_end - s);
14523 /* Missing '}'. Position after the number to give
14524 * a better indication to the user of where the
14531 /* If it looks to be a name and not a number, go
14532 * handle it there */
14533 if (! isDIGIT(*s)) {
14534 goto parse_named_seq;
14539 } while isDIGIT(*s);
14542 vFAIL("Unterminated \\g{...} pattern");
14545 s++; /* Past the '{' */
14547 while (isBLANK(*s)) {
14551 /* Ignore trailing blanks */
14553 while (s < e && isBLANK(*(e - 1))) {
14558 /* Here, have isolated the meat of the construct from any
14559 * surrounding braces */
14566 if (endbrace && !isDIGIT(*s)) {
14567 goto parse_named_seq;
14571 num = S_backref_value(RExC_parse, RExC_end);
14573 vFAIL("Reference to invalid group 0");
14574 else if (num == I32_MAX) {
14575 if (isDIGIT(*RExC_parse))
14576 vFAIL("Reference to nonexistent group");
14578 vFAIL("Unterminated \\g... pattern");
14582 num = RExC_npar - num;
14584 vFAIL("Reference to nonexistent or unclosed group");
14588 num = S_backref_value(RExC_parse, RExC_end);
14589 /* bare \NNN might be backref or octal - if it is larger
14590 * than or equal RExC_npar then it is assumed to be an
14591 * octal escape. Note RExC_npar is +1 from the actual
14592 * number of parens. */
14593 /* Note we do NOT check if num == I32_MAX here, as that is
14594 * handled by the RExC_npar check */
14596 if ( /* any numeric escape < 10 is always a backref */
14598 /* any numeric escape < RExC_npar is a backref */
14599 && num >= RExC_npar
14600 /* cannot be an octal escape if it starts with [89]
14602 && ! inRANGE(*RExC_parse, '8', '9')
14604 /* Probably not meant to be a backref, instead likely
14605 * to be an octal character escape, e.g. \35 or \777.
14606 * The above logic should make it obvious why using
14607 * octal escapes in patterns is problematic. - Yves */
14608 RExC_parse_set(atom_parse_start);
14613 /* At this point RExC_parse points at a numeric escape like
14614 * \12 or \88 or the digits in \g{34} or \g34 or something
14615 * similar, which we should NOT treat as an octal escape. It
14616 * may or may not be a valid backref escape. For instance
14617 * \88888888 is unlikely to be a valid backref.
14619 * We've already figured out what value the digits represent.
14620 * Now, move the parse to beyond them. */
14622 RExC_parse_set(endbrace + 1);
14624 else while (isDIGIT(*RExC_parse)) {
14625 RExC_parse_inc_by(1);
14628 if (num >= (I32)RExC_npar) {
14630 /* It might be a forward reference; we can't fail until we
14631 * know, by completing the parse to get all the groups, and
14632 * then reparsing */
14633 if (ALL_PARENS_COUNTED) {
14634 if (num >= RExC_total_parens) {
14635 vFAIL("Reference to nonexistent group");
14639 REQUIRE_PARENS_PASS;
14643 ret = reganode(pRExC_state,
14646 : (ASCII_FOLD_RESTRICTED)
14648 : (AT_LEAST_UNI_SEMANTICS)
14654 if (OP(REGNODE_p(ret)) == REFF) {
14655 RExC_seen_d_op = TRUE;
14657 *flagp |= HASWIDTH;
14659 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14660 FALSE /* Don't force to /x */ );
14664 if (RExC_parse >= RExC_end)
14665 FAIL("Trailing \\");
14668 /* Do not generate "unrecognized" warnings here, we fall
14669 back into the quick-grab loop below */
14670 RExC_parse_set(atom_parse_start);
14672 } /* end of switch on a \foo sequence */
14677 /* '#' comments should have been spaced over before this function was
14679 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14681 if (RExC_flags & RXf_PMf_EXTENDED) {
14682 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
14683 if (RExC_parse < RExC_end)
14693 /* Here, we have determined that the next thing is probably a
14694 * literal character. RExC_parse points to the first byte of its
14695 * definition. (It still may be an escape sequence that evaluates
14696 * to a single character) */
14701 char *s, *old_s = NULL, *old_old_s = NULL;
14703 U32 max_string_len = 255;
14705 /* We may have to reparse the node, artificially stopping filling
14706 * it early, based on info gleaned in the first parse. This
14707 * variable gives where we stop. Make it above the normal stopping
14708 * place first time through; otherwise it would stop too early */
14709 U32 upper_fill = max_string_len + 1;
14711 /* We start out as an EXACT node, even if under /i, until we find a
14712 * character which is in a fold. The algorithm now segregates into
14713 * separate nodes, characters that fold from those that don't under
14714 * /i. (This hopefully will create nodes that are fixed strings
14715 * even under /i, giving the optimizer something to grab on to.)
14716 * So, if a node has something in it and the next character is in
14717 * the opposite category, that node is closed up, and the function
14718 * returns. Then regatom is called again, and a new node is
14719 * created for the new category. */
14720 U8 node_type = EXACT;
14722 /* Assume the node will be fully used; the excess is given back at
14723 * the end. Under /i, we may need to temporarily add the fold of
14724 * an extra character or two at the end to check for splitting
14725 * multi-char folds, so allocate extra space for that. We can't
14726 * make any other length assumptions, as a byte input sequence
14727 * could shrink down. */
14728 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14732 ? UTF8_MAXBYTES_CASE
14733 /* Max non-UTF-8 expansion is 2 */ : 2)));
14735 bool next_is_quantifier;
14736 char * oldp = NULL;
14738 /* We can convert EXACTF nodes to EXACTFU if they contain only
14739 * characters that match identically regardless of the target
14740 * string's UTF8ness. The reason to do this is that EXACTF is not
14741 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14744 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14745 * contain only above-Latin1 characters (hence must be in UTF8),
14746 * which don't participate in folds with Latin1-range characters,
14747 * as the latter's folds aren't known until runtime. */
14748 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14750 /* Single-character EXACTish nodes are almost always SIMPLE. This
14751 * allows us to override this as encountered */
14752 U8 maybe_SIMPLE = SIMPLE;
14754 /* Does this node contain something that can't match unless the
14755 * target string is (also) in UTF-8 */
14756 bool requires_utf8_target = FALSE;
14758 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14759 bool has_ss = FALSE;
14761 /* So is the MICRO SIGN */
14762 bool has_micro_sign = FALSE;
14764 /* Set when we fill up the current node and there is still more
14765 * text to process */
14768 /* Allocate an EXACT node. The node_type may change below to
14769 * another EXACTish node, but since the size of the node doesn't
14770 * change, it works */
14771 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
14772 FILL_NODE(ret, node_type);
14773 RExC_emit += NODE_STEP_REGNODE;
14775 s = STRING(REGNODE_p(ret));
14786 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14787 maybe_SIMPLE = SIMPLE;
14788 requires_utf8_target = FALSE;
14790 has_micro_sign = FALSE;
14794 /* This breaks under rare circumstances. If folding, we do not
14795 * want to split a node at a character that is a non-final in a
14796 * multi-char fold, as an input string could just happen to want to
14797 * match across the node boundary. The code at the end of the loop
14798 * looks for this, and backs off until it finds not such a
14799 * character, but it is possible (though extremely, extremely
14800 * unlikely) for all characters in the node to be non-final fold
14801 * ones, in which case we just leave the node fully filled, and
14802 * hope that it doesn't match the string in just the wrong place */
14804 assert( ! UTF /* Is at the beginning of a character */
14805 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14806 || UTF8_IS_START(UCHARAT(RExC_parse)));
14808 overflowed = FALSE;
14810 /* Here, we have a literal character. Find the maximal string of
14811 * them in the input that we can fit into a single EXACTish node.
14812 * We quit at the first non-literal or when the node gets full, or
14813 * under /i the categorization of folding/non-folding character
14815 while (p < RExC_end && len < upper_fill) {
14817 /* In most cases each iteration adds one byte to the output.
14818 * The exceptions override this */
14819 Size_t added_len = 1;
14825 /* White space has already been ignored */
14826 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14827 || ! is_PATWS_safe((p), RExC_end, UTF));
14830 const char* message;
14843 /* Literal Escapes Switch
14845 This switch is meant to handle escape sequences that
14846 resolve to a literal character.
14848 Every escape sequence that represents something
14849 else, like an assertion or a char class, is handled
14850 in the switch marked 'Special Escapes' above in this
14851 routine, but also has an entry here as anything that
14852 isn't explicitly mentioned here will be treated as
14853 an unescaped equivalent literal.
14856 switch ((U8)*++p) {
14858 /* These are all the special escapes. */
14859 case 'A': /* Start assertion */
14860 case 'b': case 'B': /* Word-boundary assertion*/
14861 case 'C': /* Single char !DANGEROUS! */
14862 case 'd': case 'D': /* digit class */
14863 case 'g': case 'G': /* generic-backref, pos assertion */
14864 case 'h': case 'H': /* HORIZWS */
14865 case 'k': case 'K': /* named backref, keep marker */
14866 case 'p': case 'P': /* Unicode property */
14867 case 'R': /* LNBREAK */
14868 case 's': case 'S': /* space class */
14869 case 'v': case 'V': /* VERTWS */
14870 case 'w': case 'W': /* word class */
14871 case 'X': /* eXtended Unicode "combining
14872 character sequence" */
14873 case 'z': case 'Z': /* End of line/string assertion */
14877 /* Anything after here is an escape that resolves to a
14878 literal. (Except digits, which may or may not)
14884 case 'N': /* Handle a single-code point named character. */
14885 RExC_parse_set( p + 1 );
14886 if (! grok_bslash_N(pRExC_state,
14887 NULL, /* Fail if evaluates to
14888 anything other than a
14889 single code point */
14890 &ender, /* The returned single code
14892 NULL, /* Don't need a count of
14893 how many code points */
14898 if (*flagp & NEED_UTF8)
14899 FAIL("panic: grok_bslash_N set NEED_UTF8");
14900 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14902 /* Here, it wasn't a single code point. Go close
14903 * up this EXACTish node. The switch() prior to
14904 * this switch handles the other cases */
14910 RExC_parse_set(atom_parse_start);
14912 /* The \N{} means the pattern, if previously /d,
14913 * becomes /u. That means it can't be an EXACTF node,
14914 * but an EXACTFU */
14915 if (node_type == EXACTF) {
14916 node_type = EXACTFU;
14918 /* If the node already contains something that
14919 * differs between EXACTF and EXACTFU, reparse it
14921 if (! maybe_exactfu) {
14942 ender = ESC_NATIVE;
14950 if (! grok_bslash_o(&p,
14955 (bool) RExC_strict,
14956 FALSE, /* No illegal cp's */
14959 RExC_parse_set(p); /* going to die anyway; point to
14960 exact spot of failure */
14964 if (message && TO_OUTPUT_WARNINGS(p)) {
14965 warn_non_literal_string(p, packed_warn, message);
14969 if (! grok_bslash_x(&p,
14974 (bool) RExC_strict,
14975 FALSE, /* No illegal cp's */
14978 RExC_parse_set(p); /* going to die anyway; point
14979 to exact spot of failure */
14983 if (message && TO_OUTPUT_WARNINGS(p)) {
14984 warn_non_literal_string(p, packed_warn, message);
14988 if (ender < 0x100) {
14989 if (RExC_recode_x_to_native) {
14990 ender = LATIN1_TO_NATIVE(ender);
14997 if (! grok_bslash_c(*p, &grok_c_char,
14998 &message, &packed_warn))
15000 /* going to die anyway; point to exact spot of
15002 char *new_p= p + ((UTF)
15003 ? UTF8_SAFE_SKIP(p, RExC_end)
15005 RExC_parse_set(new_p);
15009 ender = grok_c_char;
15011 if (message && TO_OUTPUT_WARNINGS(p)) {
15012 warn_non_literal_string(p, packed_warn, message);
15016 case '8': case '9': /* must be a backreference */
15018 /* we have an escape like \8 which cannot be an octal escape
15019 * so we exit the loop, and let the outer loop handle this
15020 * escape which may or may not be a legitimate backref. */
15022 case '1': case '2': case '3':case '4':
15023 case '5': case '6': case '7':
15025 /* When we parse backslash escapes there is ambiguity
15026 * between backreferences and octal escapes. Any escape
15027 * from \1 - \9 is a backreference, any multi-digit
15028 * escape which does not start with 0 and which when
15029 * evaluated as decimal could refer to an already
15030 * parsed capture buffer is a back reference. Anything
15033 * Note this implies that \118 could be interpreted as
15034 * 118 OR as "\11" . "8" depending on whether there
15035 * were 118 capture buffers defined already in the
15038 /* NOTE, RExC_npar is 1 more than the actual number of
15039 * parens we have seen so far, hence the "<" as opposed
15041 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
15042 { /* Not to be treated as an octal constant, go
15050 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
15051 | PERL_SCAN_NOTIFY_ILLDIGIT;
15053 ender = grok_oct(p, &numlen, &flags, NULL);
15055 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
15056 && isDIGIT(*p) /* like \08, \178 */
15057 && ckWARN(WARN_REGEXP))
15059 reg_warn_non_literal_string(
15061 form_alien_digit_msg(8, numlen, p,
15062 RExC_end, UTF, FALSE));
15068 FAIL("Trailing \\");
15071 if (isALPHANUMERIC(*p)) {
15072 /* An alpha followed by '{' is going to fail next
15073 * iteration, so don't output this warning in that
15075 if (! isALPHA(*p) || *(p + 1) != '{') {
15076 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
15077 " passed through", p);
15080 goto normal_default;
15081 } /* End of switch on '\' */
15084 /* Trying to gain new uses for '{' without breaking too
15085 * much existing code is hard. The solution currently
15087 * 1) If there is no ambiguity that a '{' should always
15088 * be taken literally, at the start of a construct, we
15090 * 2) If the literal '{' conflicts with our desired use
15091 * of it as a metacharacter, we die. The deprecation
15092 * cycles for this have come and gone.
15093 * 3) If there is ambiguity, we raise a simple warning.
15094 * This could happen, for example, if the user
15095 * intended it to introduce a quantifier, but slightly
15096 * misspelled the quantifier. Without this warning,
15097 * the quantifier would silently be taken as a literal
15098 * string of characters instead of a meta construct */
15099 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
15101 || ( p > atom_parse_start + 1
15102 && isALPHA_A(*(p - 1))
15103 && *(p - 2) == '\\'))
15105 RExC_parse_set(p + 1);
15106 vFAIL("Unescaped left brace in regex is "
15109 ckWARNreg(p + 1, "Unescaped left brace in regex is"
15110 " passed through");
15112 goto normal_default;
15115 if (p > RExC_parse && RExC_strict) {
15116 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
15119 default: /* A literal character */
15121 if (! UTF8_IS_INVARIANT(*p) && UTF) {
15123 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
15124 &numlen, UTF8_ALLOW_DEFAULT);
15130 } /* End of switch on the literal */
15132 /* Here, have looked at the literal character, and <ender>
15133 * contains its ordinal; <p> points to the character after it.
15137 REQUIRE_UTF8(flagp);
15138 if ( UNICODE_IS_PERL_EXTENDED(ender)
15139 && TO_OUTPUT_WARNINGS(p))
15141 ckWARN2_non_literal_string(p,
15142 packWARN(WARN_PORTABLE),
15143 PL_extended_cp_format,
15148 /* We need to check if the next non-ignored thing is a
15149 * quantifier. Move <p> to after anything that should be
15150 * ignored, which, as a side effect, positions <p> for the next
15151 * loop iteration */
15152 skip_to_be_ignored_text(pRExC_state, &p,
15153 FALSE /* Don't force to /x */ );
15155 /* If the next thing is a quantifier, it applies to this
15156 * character only, which means that this character has to be in
15157 * its own node and can't just be appended to the string in an
15158 * existing node, so if there are already other characters in
15159 * the node, close the node with just them, and set up to do
15160 * this character again next time through, when it will be the
15161 * only thing in its new node */
15163 next_is_quantifier = LIKELY(p < RExC_end)
15164 && UNLIKELY(isQUANTIFIER(p, RExC_end));
15166 if (next_is_quantifier && LIKELY(len)) {
15171 /* Ready to add 'ender' to the node */
15173 if (! FOLD) { /* The simple case, just append the literal */
15176 /* Don't output if it would overflow */
15177 if (UNLIKELY(len > max_string_len - ((UTF)
15178 ? UVCHR_SKIP(ender)
15185 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
15186 *(s++) = (char) ender;
15189 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
15190 added_len = (char *) new_s - s;
15191 s = (char *) new_s;
15194 requires_utf8_target = TRUE;
15198 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
15200 /* Here are folding under /l, and the code point is
15201 * problematic. If this is the first character in the
15202 * node, change the node type to folding. Otherwise, if
15203 * this is the first problematic character, close up the
15204 * existing node, so can start a new node with this one */
15206 node_type = EXACTFL;
15207 RExC_contains_locale = 1;
15209 else if (node_type == EXACT) {
15214 /* This problematic code point means we can't simplify
15216 maybe_exactfu = FALSE;
15218 /* Although these two characters have folds that are
15219 * locale-problematic, they also have folds to above Latin1
15220 * that aren't a problem. Doing these now helps at
15222 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
15223 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
15228 /* Here, we are adding a problematic fold character.
15229 * "Problematic" in this context means that its fold isn't
15230 * known until runtime. (The non-problematic code points
15231 * are the above-Latin1 ones that fold to also all
15232 * above-Latin1. Their folds don't vary no matter what the
15233 * locale is.) But here we have characters whose fold
15234 * depends on the locale. We just add in the unfolded
15235 * character, and wait until runtime to fold it */
15236 goto not_fold_common;
15238 else /* regular fold; see if actually is in a fold */
15239 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
15241 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
15243 /* Here, folding, but the character isn't in a fold.
15245 * Start a new node if previous characters in the node were
15247 if (len && node_type != EXACT) {
15252 /* Here, continuing a node with non-folded characters. Add
15254 goto not_fold_common;
15256 else { /* Here, does participate in some fold */
15258 /* If this is the first character in the node, change its
15259 * type to folding. Otherwise, if this is the first
15260 * folding character in the node, close up the existing
15261 * node, so can start a new node with this one. */
15263 node_type = compute_EXACTish(pRExC_state);
15265 else if (node_type == EXACT) {
15270 if (UTF) { /* Alway use the folded value for UTF-8
15272 if (UVCHR_IS_INVARIANT(ender)) {
15273 if (UNLIKELY(len + 1 > max_string_len)) {
15278 *(s)++ = (U8) toFOLD(ender);
15284 folded = _to_uni_fold_flags(
15286 (U8 *) s, /* We have allocated extra space
15287 in 's' so can't run off the
15291 | (( ASCII_FOLD_RESTRICTED
15292 || node_type == EXACTFL)
15293 ? FOLD_FLAGS_NOMIX_ASCII
15295 if (UNLIKELY(len + added_len > max_string_len)) {
15303 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
15305 /* U+B5 folds to the MU, so its possible for a
15306 * non-UTF-8 target to match it */
15307 requires_utf8_target = TRUE;
15311 else { /* Here is non-UTF8. */
15313 /* The fold will be one or (rarely) two characters.
15314 * Check that there's room for at least a single one
15315 * before setting any flags, etc. Because otherwise an
15316 * overflowing character could cause a flag to be set
15317 * even though it doesn't end up in this node. (For
15318 * the two character fold, we check again, before
15319 * setting any flags) */
15320 if (UNLIKELY(len + 1 > max_string_len)) {
15325 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
15326 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
15327 || UNICODE_DOT_DOT_VERSION > 0)
15329 /* On non-ancient Unicodes, check for the only possible
15330 * multi-char fold */
15331 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
15333 /* This potential multi-char fold means the node
15334 * can't be simple (because it could match more
15335 * than a single char). And in some cases it will
15336 * match 'ss', so set that flag */
15340 /* It can't change to be an EXACTFU (unless already
15341 * is one). We fold it iff under /u rules. */
15342 if (node_type != EXACTFU) {
15343 maybe_exactfu = FALSE;
15346 if (UNLIKELY(len + 2 > max_string_len)) {
15355 goto done_with_this_char;
15358 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
15360 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
15362 /* Also, the sequence 'ss' is special when not
15363 * under /u. If the target string is UTF-8, it
15364 * should match SHARP S; otherwise it won't. So,
15365 * here we have to exclude the possibility of this
15366 * node moving to /u.*/
15368 maybe_exactfu = FALSE;
15371 /* Here, the fold will be a single character */
15373 if (UNLIKELY(ender == MICRO_SIGN)) {
15374 has_micro_sign = TRUE;
15376 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
15378 /* If the character's fold differs between /d and
15379 * /u, this can't change to be an EXACTFU node */
15380 maybe_exactfu = FALSE;
15383 *(s++) = (DEPENDS_SEMANTICS)
15384 ? (char) toFOLD(ender)
15386 /* Under /u, the fold of any character in
15387 * the 0-255 range happens to be its
15388 * lowercase equivalent, except for LATIN
15389 * SMALL LETTER SHARP S, which was handled
15390 * above, and the MICRO SIGN, whose fold
15391 * requires UTF-8 to represent. */
15392 : (char) toLOWER_L1(ender);
15394 } /* End of adding current character to the node */
15396 done_with_this_char:
15400 if (next_is_quantifier) {
15402 /* Here, the next input is a quantifier, and to get here,
15403 * the current character is the only one in the node. */
15407 } /* End of loop through literal characters */
15409 /* Here we have either exhausted the input or run out of room in
15410 * the node. If the former, we are done. (If we encountered a
15411 * character that can't be in the node, transfer is made directly
15412 * to <loopdone>, and so we wouldn't have fallen off the end of the
15414 if (LIKELY(! overflowed)) {
15418 /* Here we have run out of room. We can grow plain EXACT and
15419 * LEXACT nodes. If the pattern is gigantic enough, though,
15420 * eventually we'll have to artificially chunk the pattern into
15421 * multiple nodes. */
15422 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
15423 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
15424 Size_t overhead_expansion = 0;
15426 Size_t max_nodes_for_string;
15430 /* Here we couldn't fit the final character in the current
15431 * node, so it will have to be reparsed, no matter what else we
15435 /* If would have overflowed a regular EXACT node, switch
15436 * instead to an LEXACT. The code below is structured so that
15437 * the actual growing code is common to changing from an EXACT
15438 * or just increasing the LEXACT size. This means that we have
15439 * to save the string in the EXACT case before growing, and
15440 * then copy it afterwards to its new location */
15441 if (node_type == EXACT) {
15442 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
15443 RExC_emit += overhead_expansion;
15444 Copy(s0, temp, len, char);
15447 /* Ready to grow. If it was a plain EXACT, the string was
15448 * saved, and the first few bytes of it overwritten by adding
15449 * an argument field. We assume, as we do elsewhere in this
15450 * file, that one byte of remaining input will translate into
15451 * one byte of output, and if that's too small, we grow again,
15452 * if too large the excess memory is freed at the end */
15454 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
15455 achievable = MIN(max_nodes_for_string,
15456 current_string_nodes + STR_SZ(RExC_end - p));
15457 delta = achievable - current_string_nodes;
15459 /* If there is just no more room, go finish up this chunk of
15465 change_engine_size(pRExC_state, delta + overhead_expansion);
15466 current_string_nodes += delta;
15468 = sizeof(struct regnode) * current_string_nodes;
15469 upper_fill = max_string_len + 1;
15471 /* If the length was small, we know this was originally an
15472 * EXACT node now converted to LEXACT, and the string has to be
15473 * restored. Otherwise the string was untouched. 260 is just
15474 * a number safely above 255 so don't have to worry about
15475 * getting it precise */
15477 node_type = LEXACT;
15478 FILL_NODE(ret, node_type);
15479 s0 = STRING(REGNODE_p(ret));
15480 Copy(temp, s0, len, char);
15484 goto continue_parse;
15487 bool splittable = FALSE;
15488 bool backed_up = FALSE;
15489 char * e; /* should this be U8? */
15490 char * s_start; /* should this be U8? */
15492 /* Here is /i. Running out of room creates a problem if we are
15493 * folding, and the split happens in the middle of a
15494 * multi-character fold, as a match that should have occurred,
15495 * won't, due to the way nodes are matched, and our artificial
15496 * boundary. So back off until we aren't splitting such a
15497 * fold. If there is no such place to back off to, we end up
15498 * taking the entire node as-is. This can happen if the node
15499 * consists entirely of 'f' or entirely of 's' characters (or
15500 * things that fold to them) as 'ff' and 'ss' are
15501 * multi-character folds.
15503 * The Unicode standard says that multi character folds consist
15504 * of either two or three characters. That means we would be
15505 * splitting one if the final character in the node is at the
15506 * beginning of either type, or is the second of a three
15510 * ender is the code point of the character that won't fit
15512 * s points to just beyond the final byte in the node.
15513 * It's where we would place ender if there were
15514 * room, and where in fact we do place ender's fold
15515 * in the code below, as we've over-allocated space
15516 * for s0 (hence s) to allow for this
15517 * e starts at 's' and advances as we append things.
15518 * old_s is the same as 's'. (If ender had fit, 's' would
15519 * have been advanced to beyond it).
15520 * old_old_s points to the beginning byte of the final
15521 * character in the node
15522 * p points to the beginning byte in the input of the
15523 * character beyond 'ender'.
15524 * oldp points to the beginning byte in the input of
15527 * In the case of /il, we haven't folded anything that could be
15528 * affected by the locale. That means only above-Latin1
15529 * characters that fold to other above-latin1 characters get
15530 * folded at compile time. To check where a good place to
15531 * split nodes is, everything in it will have to be folded.
15532 * The boolean 'maybe_exactfu' keeps track in /il if there are
15533 * any unfolded characters in the node. */
15534 bool need_to_fold_loc = LOC && ! maybe_exactfu;
15536 /* If we do need to fold the node, we need a place to store the
15537 * folded copy, and a way to map back to the unfolded original
15539 char * locfold_buf = NULL;
15540 Size_t * loc_correspondence = NULL;
15542 if (! need_to_fold_loc) { /* The normal case. Just
15543 initialize to the actual node */
15546 s = old_old_s; /* Point to the beginning of the final char
15547 that fits in the node */
15551 /* Here, we have filled a /il node, and there are unfolded
15552 * characters in it. If the runtime locale turns out to be
15553 * UTF-8, there are possible multi-character folds, just
15554 * like when not under /l. The node hence can't terminate
15555 * in the middle of such a fold. To determine this, we
15556 * have to create a folded copy of this node. That means
15557 * reparsing the node, folding everything assuming a UTF-8
15558 * locale. (If at runtime it isn't such a locale, the
15559 * actions here wouldn't have been necessary, but we have
15560 * to assume the worst case.) If we find we need to back
15561 * off the folded string, we do so, and then map that
15562 * position back to the original unfolded node, which then
15563 * gets output, truncated at that spot */
15565 char * redo_p = RExC_parse;
15569 /* Allow enough space assuming a single byte input folds to
15570 * a single byte output, plus assume that the two unparsed
15571 * characters (that we may need) fold to the largest number
15572 * of bytes possible, plus extra for one more worst case
15573 * scenario. In the loop below, if we start eating into
15574 * that final spare space, we enlarge this initial space */
15575 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
15577 Newxz(locfold_buf, size, char);
15578 Newxz(loc_correspondence, size, Size_t);
15580 /* Redo this node's parse, folding into 'locfold_buf' */
15581 redo_p = RExC_parse;
15582 old_redo_e = redo_e = locfold_buf;
15583 while (redo_p <= oldp) {
15585 old_redo_e = redo_e;
15586 loc_correspondence[redo_e - locfold_buf]
15587 = redo_p - RExC_parse;
15592 (void) _to_utf8_fold_flags((U8 *) redo_p,
15597 redo_e += added_len;
15598 redo_p += UTF8SKIP(redo_p);
15602 /* Note that if this code is run on some ancient
15603 * Unicode versions, SHARP S doesn't fold to 'ss',
15604 * but rather than clutter the code with #ifdef's,
15605 * as is done above, we ignore that possibility.
15606 * This is ok because this code doesn't affect what
15607 * gets matched, but merely where the node gets
15609 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
15610 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
15620 /* If we're getting so close to the end that a
15621 * worst-case fold in the next character would cause us
15622 * to overflow, increase, assuming one byte output byte
15623 * per one byte input one, plus room for another worst
15625 if ( redo_p <= oldp
15626 && redo_e > locfold_buf + size
15627 - (UTF8_MAXBYTES_CASE + 1))
15629 Size_t new_size = size
15631 + UTF8_MAXBYTES_CASE + 1;
15632 Ptrdiff_t e_offset = redo_e - locfold_buf;
15634 Renew(locfold_buf, new_size, char);
15635 Renew(loc_correspondence, new_size, Size_t);
15638 redo_e = locfold_buf + e_offset;
15642 /* Set so that things are in terms of the folded, temporary
15645 s_start = locfold_buf;
15650 /* Here, we have 's', 's_start' and 'e' set up to point to the
15651 * input that goes into the node, folded.
15653 * If the final character of the node and the fold of ender
15654 * form the first two characters of a three character fold, we
15655 * need to peek ahead at the next (unparsed) character in the
15656 * input to determine if the three actually do form such a
15657 * fold. Just looking at that character is not generally
15658 * sufficient, as it could be, for example, an escape sequence
15659 * that evaluates to something else, and it needs to be folded.
15661 * khw originally thought to just go through the parse loop one
15662 * extra time, but that doesn't work easily as that iteration
15663 * could cause things to think that the parse is over and to
15664 * goto loopdone. The character could be a '$' for example, or
15665 * the character beyond could be a quantifier, and other
15666 * glitches as well.
15668 * The solution used here for peeking ahead is to look at that
15669 * next character. If it isn't ASCII punctuation, then it will
15670 * be something that would continue on in an EXACTish node if
15671 * there were space. We append the fold of it to s, having
15672 * reserved enough room in s0 for the purpose. If we can't
15673 * reasonably peek ahead, we instead assume the worst case:
15674 * that it is something that would form the completion of a
15677 * If we can't split between s and ender, we work backwards
15678 * character-by-character down to s0. At each current point
15679 * see if we are at the beginning of a multi-char fold. If so,
15680 * that means we would be splitting the fold across nodes, and
15681 * so we back up one and try again.
15683 * If we're not at the beginning, we still could be at the
15684 * final two characters of a (rare) three character fold. We
15685 * check if the sequence starting at the character before the
15686 * current position (and including the current and next
15687 * characters) is a three character fold. If not, the node can
15688 * be split here. If it is, we have to backup two characters
15691 * Otherwise, the node can be split at the current position.
15693 * The same logic is used for UTF-8 patterns and not */
15697 /* Append the fold of ender */
15698 (void) _to_uni_fold_flags(
15702 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15703 ? FOLD_FLAGS_NOMIX_ASCII
15707 /* 's' and the character folded to by ender may be the
15708 * first two of a three-character fold, in which case the
15709 * node should not be split here. That may mean examining
15710 * the so-far unparsed character starting at 'p'. But if
15711 * ender folded to more than one character, we already have
15712 * three characters to look at. Also, we first check if
15713 * the sequence consisting of s and the next character form
15714 * the first two of some three character fold. If not,
15715 * there's no need to peek ahead. */
15716 if ( added_len <= UTF8SKIP(e - added_len)
15717 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15719 /* Here, the two do form the beginning of a potential
15720 * three character fold. The unexamined character may
15721 * or may not complete it. Peek at it. It might be
15722 * something that ends the node or an escape sequence,
15723 * in which case we don't know without a lot of work
15724 * what it evaluates to, so we have to assume the worst
15725 * case: that it does complete the fold, and so we
15726 * can't split here. All such instances will have
15727 * that character be an ASCII punctuation character,
15728 * like a backslash. So, for that case, backup one and
15729 * drop down to try at that position */
15731 s = (char *) utf8_hop_back((U8 *) s, -1,
15736 /* Here, since it's not punctuation, it must be a
15737 * real character, and we can append its fold to
15738 * 'e' (having deliberately reserved enough space
15739 * for this eventuality) and drop down to check if
15740 * the three actually do form a folded sequence */
15741 (void) _to_utf8_fold_flags(
15742 (U8 *) p, (U8 *) RExC_end,
15745 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15746 ? FOLD_FLAGS_NOMIX_ASCII
15752 /* Here, we either have three characters available in
15753 * sequence starting at 's', or we have two characters and
15754 * know that the following one can't possibly be part of a
15755 * three character fold. We go through the node backwards
15756 * until we find a place where we can split it without
15757 * breaking apart a multi-character fold. At any given
15758 * point we have to worry about if such a fold begins at
15759 * the current 's', and also if a three-character fold
15760 * begins at s-1, (containing s and s+1). Splitting in
15761 * either case would break apart a fold */
15763 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15766 /* If is a multi-char fold, can't split here. Backup
15767 * one char and try again */
15768 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15774 /* If the two characters beginning at 's' are part of a
15775 * three character fold starting at the character
15776 * before s, we can't split either before or after s.
15777 * Backup two chars and try again */
15778 if ( LIKELY(s > s_start)
15779 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15782 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15787 /* Here there's no multi-char fold between s and the
15788 * next character following it. We can split */
15792 } while (s > s_start); /* End of loops backing up through the node */
15794 /* Here we either couldn't find a place to split the node,
15795 * or else we broke out of the loop setting 'splittable' to
15796 * true. In the latter case, the place to split is between
15797 * the first and second characters in the sequence starting
15803 else { /* Pattern not UTF-8 */
15804 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15805 || ASCII_FOLD_RESTRICTED)
15807 assert( toLOWER_L1(ender) < 256 );
15808 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15816 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15823 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15824 || ASCII_FOLD_RESTRICTED)
15826 assert( toLOWER_L1(ender) < 256 );
15827 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15837 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15843 if ( LIKELY(s > s_start)
15844 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15854 } while (s > s_start);
15861 /* Here, we are done backing up. If we didn't backup at all
15862 * (the likely case), just proceed */
15865 /* If we did find a place to split, reparse the entire node
15866 * stopping where we have calculated. */
15869 /* If we created a temporary folded string under /l, we
15870 * have to map that back to the original */
15871 if (need_to_fold_loc) {
15872 upper_fill = loc_correspondence[s - s_start];
15873 if (upper_fill == 0) {
15874 FAIL2("panic: loc_correspondence[%d] is 0",
15875 (int) (s - s_start));
15877 Safefree(locfold_buf);
15878 Safefree(loc_correspondence);
15881 upper_fill = s - s0;
15886 /* Here the node consists entirely of non-final multi-char
15887 * folds. (Likely it is all 'f's or all 's's.) There's no
15888 * decent place to split it, so give up and just take the
15893 if (need_to_fold_loc) {
15894 Safefree(locfold_buf);
15895 Safefree(loc_correspondence);
15897 } /* End of verifying node ends with an appropriate char */
15899 /* We need to start the next node at the character that didn't fit
15903 loopdone: /* Jumped to when encounters something that shouldn't be
15906 /* Free up any over-allocated space; cast is to silence bogus
15907 * warning in MS VC */
15908 change_engine_size(pRExC_state,
15909 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15911 /* I (khw) don't know if you can get here with zero length, but the
15912 * old code handled this situation by creating a zero-length EXACT
15913 * node. Might as well be NOTHING instead */
15915 OP(REGNODE_p(ret)) = NOTHING;
15919 /* If the node type is EXACT here, check to see if it
15920 * should be EXACTL, or EXACT_REQ8. */
15921 if (node_type == EXACT) {
15923 node_type = EXACTL;
15925 else if (requires_utf8_target) {
15926 node_type = EXACT_REQ8;
15929 else if (node_type == LEXACT) {
15930 if (requires_utf8_target) {
15931 node_type = LEXACT_REQ8;
15935 if ( UNLIKELY(has_micro_sign || has_ss)
15936 && (node_type == EXACTFU || ( node_type == EXACTF
15937 && maybe_exactfu)))
15938 { /* These two conditions are problematic in non-UTF-8
15941 node_type = EXACTFUP;
15943 else if (node_type == EXACTFL) {
15945 /* 'maybe_exactfu' is deliberately set above to
15946 * indicate this node type, where all code points in it
15948 if (maybe_exactfu) {
15949 node_type = EXACTFLU8;
15952 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15954 /* A character that folds to more than one will
15955 * match multiple characters, so can't be SIMPLE.
15956 * We don't have to worry about this with EXACTFLU8
15957 * nodes just above, as they have already been
15958 * folded (since the fold doesn't vary at run
15959 * time). Here, if the final character in the node
15960 * folds to multiple, it can't be simple. (This
15961 * only has an effect if the node has only a single
15962 * character, hence the final one, as elsewhere we
15963 * turn off simple for nodes whose length > 1 */
15967 else if (node_type == EXACTF) { /* Means is /di */
15969 /* This intermediate variable is needed solely because
15970 * the asserts in the macro where used exceed Win32's
15971 * literal string capacity */
15972 char first_char = * STRING(REGNODE_p(ret));
15974 /* If 'maybe_exactfu' is clear, then we need to stay
15975 * /di. If it is set, it means there are no code
15976 * points that match differently depending on UTF8ness
15977 * of the target string, so it can become an EXACTFU
15979 if (! maybe_exactfu) {
15980 RExC_seen_d_op = TRUE;
15982 else if ( isALPHA_FOLD_EQ(first_char, 's')
15983 || isALPHA_FOLD_EQ(ender, 's'))
15985 /* But, if the node begins or ends in an 's' we
15986 * have to defer changing it into an EXACTFU, as
15987 * the node could later get joined with another one
15988 * that ends or begins with 's' creating an 'ss'
15989 * sequence which would then wrongly match the
15990 * sharp s without the target being UTF-8. We
15991 * create a special node that we resolve later when
15992 * we join nodes together */
15994 node_type = EXACTFU_S_EDGE;
15997 node_type = EXACTFU;
16001 if (requires_utf8_target && node_type == EXACTFU) {
16002 node_type = EXACTFU_REQ8;
16006 OP(REGNODE_p(ret)) = node_type;
16007 setSTR_LEN(REGNODE_p(ret), len);
16008 RExC_emit += STR_SZ(len);
16010 /* If the node isn't a single character, it can't be SIMPLE */
16011 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
16015 *flagp |= HASWIDTH | maybe_SIMPLE;
16021 /* len is STRLEN which is unsigned, need to copy to signed */
16024 vFAIL("Internal disaster");
16027 } /* End of label 'defchar:' */
16029 } /* End of giant switch on input character */
16031 /* Position parse to next real character */
16032 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16033 FALSE /* Don't force to /x */ );
16034 if ( *RExC_parse == '{'
16035 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
16038 RExC_parse_inc_by(1);
16039 vFAIL("Unescaped left brace in regex is illegal here");
16041 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
16042 " passed through");
16050 S_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
16052 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
16053 * sets up the bitmap and any flags, removing those code points from the
16054 * inversion list, setting it to NULL should it become completely empty */
16057 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
16059 /* There is no bitmap for this node type */
16060 if (REGNODE_TYPE(OP(node)) != ANYOF) {
16064 ANYOF_BITMAP_ZERO(node);
16065 if (*invlist_ptr) {
16067 /* This gets set if we actually need to modify things */
16068 bool change_invlist = FALSE;
16072 /* Start looking through *invlist_ptr */
16073 invlist_iterinit(*invlist_ptr);
16074 while (invlist_iternext(*invlist_ptr, &start, &end)) {
16078 /* Quit if are above what we should change */
16079 if (start >= NUM_ANYOF_CODE_POINTS) {
16083 change_invlist = TRUE;
16085 /* Set all the bits in the range, up to the max that we are doing */
16086 high = (end < NUM_ANYOF_CODE_POINTS - 1)
16088 : NUM_ANYOF_CODE_POINTS - 1;
16089 for (i = start; i <= (int) high; i++) {
16090 ANYOF_BITMAP_SET(node, i);
16093 invlist_iterfinish(*invlist_ptr);
16095 /* Done with loop; remove any code points that are in the bitmap from
16097 if (change_invlist) {
16098 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
16101 /* If have completely emptied it, remove it completely */
16102 if (_invlist_len(*invlist_ptr) == 0) {
16103 SvREFCNT_dec_NN(*invlist_ptr);
16104 *invlist_ptr = NULL;
16109 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
16110 Character classes ([:foo:]) can also be negated ([:^foo:]).
16111 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
16112 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
16113 but trigger failures because they are currently unimplemented. */
16115 #define POSIXCC_DONE(c) ((c) == ':')
16116 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
16117 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
16118 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
16120 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
16121 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
16122 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
16124 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
16126 /* 'posix_warnings' and 'warn_text' are names of variables in the following
16128 #define ADD_POSIX_WARNING(p, text) STMT_START { \
16129 if (posix_warnings) { \
16130 if (! RExC_warn_text ) RExC_warn_text = \
16131 (AV *) sv_2mortal((SV *) newAV()); \
16132 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
16136 REPORT_LOCATION_ARGS(p))); \
16139 #define CLEAR_POSIX_WARNINGS() \
16141 if (posix_warnings && RExC_warn_text) \
16142 av_clear(RExC_warn_text); \
16145 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
16147 CLEAR_POSIX_WARNINGS(); \
16152 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
16154 const char * const s, /* Where the putative posix class begins.
16155 Normally, this is one past the '['. This
16156 parameter exists so it can be somewhere
16157 besides RExC_parse. */
16158 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
16160 AV ** posix_warnings, /* Where to place any generated warnings, or
16162 const bool check_only /* Don't die if error */
16165 /* This parses what the caller thinks may be one of the three POSIX
16167 * 1) a character class, like [:blank:]
16168 * 2) a collating symbol, like [. .]
16169 * 3) an equivalence class, like [= =]
16170 * In the latter two cases, it croaks if it finds a syntactically legal
16171 * one, as these are not handled by Perl.
16173 * The main purpose is to look for a POSIX character class. It returns:
16174 * a) the class number
16175 * if it is a completely syntactically and semantically legal class.
16176 * 'updated_parse_ptr', if not NULL, is set to point to just after the
16177 * closing ']' of the class
16178 * b) OOB_NAMEDCLASS
16179 * if it appears that one of the three POSIX constructs was meant, but
16180 * its specification was somehow defective. 'updated_parse_ptr', if
16181 * not NULL, is set to point to the character just after the end
16182 * character of the class. See below for handling of warnings.
16183 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
16184 * if it doesn't appear that a POSIX construct was intended.
16185 * 'updated_parse_ptr' is not changed. No warnings nor errors are
16188 * In b) there may be errors or warnings generated. If 'check_only' is
16189 * TRUE, then any errors are discarded. Warnings are returned to the
16190 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
16191 * instead it is NULL, warnings are suppressed.
16193 * The reason for this function, and its complexity is that a bracketed
16194 * character class can contain just about anything. But it's easy to
16195 * mistype the very specific posix class syntax but yielding a valid
16196 * regular bracketed class, so it silently gets compiled into something
16197 * quite unintended.
16199 * The solution adopted here maintains backward compatibility except that
16200 * it adds a warning if it looks like a posix class was intended but
16201 * improperly specified. The warning is not raised unless what is input
16202 * very closely resembles one of the 14 legal posix classes. To do this,
16203 * it uses fuzzy parsing. It calculates how many single-character edits it
16204 * would take to transform what was input into a legal posix class. Only
16205 * if that number is quite small does it think that the intention was a
16206 * posix class. Obviously these are heuristics, and there will be cases
16207 * where it errs on one side or another, and they can be tweaked as
16208 * experience informs.
16210 * The syntax for a legal posix class is:
16212 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
16214 * What this routine considers syntactically to be an intended posix class
16215 * is this (the comments indicate some restrictions that the pattern
16218 * qr/(?x: \[? # The left bracket, possibly
16220 * \h* # possibly followed by blanks
16221 * (?: \^ \h* )? # possibly a misplaced caret
16222 * [:;]? # The opening class character,
16223 * # possibly omitted. A typo
16224 * # semi-colon can also be used.
16226 * \^? # possibly a correctly placed
16227 * # caret, but not if there was also
16228 * # a misplaced one
16230 * .{3,15} # The class name. If there are
16231 * # deviations from the legal syntax,
16232 * # its edit distance must be close
16233 * # to a real class name in order
16234 * # for it to be considered to be
16235 * # an intended posix class.
16237 * [[:punct:]]? # The closing class character,
16238 * # possibly omitted. If not a colon
16239 * # nor semi colon, the class name
16240 * # must be even closer to a valid
16243 * \]? # The right bracket, possibly
16247 * In the above, \h must be ASCII-only.
16249 * These are heuristics, and can be tweaked as field experience dictates.
16250 * There will be cases when someone didn't intend to specify a posix class
16251 * that this warns as being so. The goal is to minimize these, while
16252 * maximizing the catching of things intended to be a posix class that
16253 * aren't parsed as such.
16257 const char * const e = RExC_end;
16258 unsigned complement = 0; /* If to complement the class */
16259 bool found_problem = FALSE; /* Assume OK until proven otherwise */
16260 bool has_opening_bracket = FALSE;
16261 bool has_opening_colon = FALSE;
16262 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
16264 const char * possible_end = NULL; /* used for a 2nd parse pass */
16265 const char* name_start; /* ptr to class name first char */
16267 /* If the number of single-character typos the input name is away from a
16268 * legal name is no more than this number, it is considered to have meant
16269 * the legal name */
16270 int max_distance = 2;
16272 /* to store the name. The size determines the maximum length before we
16273 * decide that no posix class was intended. Should be at least
16274 * sizeof("alphanumeric") */
16276 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
16278 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
16280 CLEAR_POSIX_WARNINGS();
16283 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
16286 if (*(p - 1) != '[') {
16287 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
16288 found_problem = TRUE;
16291 has_opening_bracket = TRUE;
16294 /* They could be confused and think you can put spaces between the
16297 found_problem = TRUE;
16301 } while (p < e && isBLANK(*p));
16303 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16306 /* For [. .] and [= =]. These are quite different internally from [: :],
16307 * so they are handled separately. */
16308 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
16309 and 1 for at least one char in it
16312 const char open_char = *p;
16313 const char * temp_ptr = p + 1;
16315 /* These two constructs are not handled by perl, and if we find a
16316 * syntactically valid one, we croak. khw, who wrote this code, finds
16317 * this explanation of them very unclear:
16318 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
16319 * And searching the rest of the internet wasn't very helpful either.
16320 * It looks like just about any byte can be in these constructs,
16321 * depending on the locale. But unless the pattern is being compiled
16322 * under /l, which is very rare, Perl runs under the C or POSIX locale.
16323 * In that case, it looks like [= =] isn't allowed at all, and that
16324 * [. .] could be any single code point, but for longer strings the
16325 * constituent characters would have to be the ASCII alphabetics plus
16326 * the minus-hyphen. Any sensible locale definition would limit itself
16327 * to these. And any portable one definitely should. Trying to parse
16328 * the general case is a nightmare (see [perl #127604]). So, this code
16329 * looks only for interiors of these constructs that match:
16331 * Using \w relaxes the apparent rules a little, without adding much
16332 * danger of mistaking something else for one of these constructs.
16334 * [. .] in some implementations described on the internet is usable to
16335 * escape a character that otherwise is special in bracketed character
16336 * classes. For example [.].] means a literal right bracket instead of
16337 * the ending of the class
16339 * [= =] can legitimately contain a [. .] construct, but we don't
16340 * handle this case, as that [. .] construct will later get parsed
16341 * itself and croak then. And [= =] is checked for even when not under
16342 * /l, as Perl has long done so.
16344 * The code below relies on there being a trailing NUL, so it doesn't
16345 * have to keep checking if the parse ptr < e.
16347 if (temp_ptr[1] == open_char) {
16350 else while ( temp_ptr < e
16351 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
16356 if (*temp_ptr == open_char) {
16358 if (*temp_ptr == ']') {
16360 if (! found_problem && ! check_only) {
16361 RExC_parse_set((char *) temp_ptr);
16362 vFAIL3("POSIX syntax [%c %c] is reserved for future "
16363 "extensions", open_char, open_char);
16366 /* Here, the syntax wasn't completely valid, or else the call
16367 * is to check-only */
16368 if (updated_parse_ptr) {
16369 *updated_parse_ptr = (char *) temp_ptr;
16372 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
16376 /* If we find something that started out to look like one of these
16377 * constructs, but isn't, we continue below so that it can be checked
16378 * for being a class name with a typo of '.' or '=' instead of a colon.
16382 /* Here, we think there is a possibility that a [: :] class was meant, and
16383 * we have the first real character. It could be they think the '^' comes
16386 found_problem = TRUE;
16387 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
16392 found_problem = TRUE;
16396 } while (p < e && isBLANK(*p));
16398 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16402 /* But the first character should be a colon, which they could have easily
16403 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
16404 * distinguish from a colon, so treat that as a colon). */
16407 has_opening_colon = TRUE;
16409 else if (*p == ';') {
16410 found_problem = TRUE;
16412 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16413 has_opening_colon = TRUE;
16416 found_problem = TRUE;
16417 ADD_POSIX_WARNING(p, "there must be a starting ':'");
16419 /* Consider an initial punctuation (not one of the recognized ones) to
16420 * be a left terminator */
16421 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
16426 /* They may think that you can put spaces between the components */
16428 found_problem = TRUE;
16432 } while (p < e && isBLANK(*p));
16434 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16439 /* We consider something like [^:^alnum:]] to not have been intended to
16440 * be a posix class, but XXX maybe we should */
16442 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16449 /* Again, they may think that you can put spaces between the components */
16451 found_problem = TRUE;
16455 } while (p < e && isBLANK(*p));
16457 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16462 /* XXX This ']' may be a typo, and something else was meant. But
16463 * treating it as such creates enough complications, that that
16464 * possibility isn't currently considered here. So we assume that the
16465 * ']' is what is intended, and if we've already found an initial '[',
16466 * this leaves this construct looking like [:] or [:^], which almost
16467 * certainly weren't intended to be posix classes */
16468 if (has_opening_bracket) {
16469 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16472 /* But this function can be called when we parse the colon for
16473 * something like qr/[alpha:]]/, so we back up to look for the
16478 found_problem = TRUE;
16479 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16481 else if (*p != ':') {
16483 /* XXX We are currently very restrictive here, so this code doesn't
16484 * consider the possibility that, say, /[alpha.]]/ was intended to
16485 * be a posix class. */
16486 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16489 /* Here we have something like 'foo:]'. There was no initial colon,
16490 * and we back up over 'foo. XXX Unlike the going forward case, we
16491 * don't handle typos of non-word chars in the middle */
16492 has_opening_colon = FALSE;
16495 while (p > RExC_start && isWORDCHAR(*p)) {
16500 /* Here, we have positioned ourselves to where we think the first
16501 * character in the potential class is */
16504 /* Now the interior really starts. There are certain key characters that
16505 * can end the interior, or these could just be typos. To catch both
16506 * cases, we may have to do two passes. In the first pass, we keep on
16507 * going unless we come to a sequence that matches
16508 * qr/ [[:punct:]] [[:blank:]]* \] /xa
16509 * This means it takes a sequence to end the pass, so two typos in a row if
16510 * that wasn't what was intended. If the class is perfectly formed, just
16511 * this one pass is needed. We also stop if there are too many characters
16512 * being accumulated, but this number is deliberately set higher than any
16513 * real class. It is set high enough so that someone who thinks that
16514 * 'alphanumeric' is a correct name would get warned that it wasn't.
16515 * While doing the pass, we keep track of where the key characters were in
16516 * it. If we don't find an end to the class, and one of the key characters
16517 * was found, we redo the pass, but stop when we get to that character.
16518 * Thus the key character was considered a typo in the first pass, but a
16519 * terminator in the second. If two key characters are found, we stop at
16520 * the second one in the first pass. Again this can miss two typos, but
16521 * catches a single one
16523 * In the first pass, 'possible_end' starts as NULL, and then gets set to
16524 * point to the first key character. For the second pass, it starts as -1.
16530 bool has_blank = FALSE;
16531 bool has_upper = FALSE;
16532 bool has_terminating_colon = FALSE;
16533 bool has_terminating_bracket = FALSE;
16534 bool has_semi_colon = FALSE;
16535 unsigned int name_len = 0;
16536 int punct_count = 0;
16540 /* Squeeze out blanks when looking up the class name below */
16541 if (isBLANK(*p) ) {
16543 found_problem = TRUE;
16548 /* The name will end with a punctuation */
16550 const char * peek = p + 1;
16552 /* Treat any non-']' punctuation followed by a ']' (possibly
16553 * with intervening blanks) as trying to terminate the class.
16554 * ']]' is very likely to mean a class was intended (but
16555 * missing the colon), but the warning message that gets
16556 * generated shows the error position better if we exit the
16557 * loop at the bottom (eventually), so skip it here. */
16559 if (peek < e && isBLANK(*peek)) {
16561 found_problem = TRUE;
16564 } while (peek < e && isBLANK(*peek));
16567 if (peek < e && *peek == ']') {
16568 has_terminating_bracket = TRUE;
16570 has_terminating_colon = TRUE;
16572 else if (*p == ';') {
16573 has_semi_colon = TRUE;
16574 has_terminating_colon = TRUE;
16577 found_problem = TRUE;
16584 /* Here we have punctuation we thought didn't end the class.
16585 * Keep track of the position of the key characters that are
16586 * more likely to have been class-enders */
16587 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
16589 /* Allow just one such possible class-ender not actually
16590 * ending the class. */
16591 if (possible_end) {
16597 /* If we have too many punctuation characters, no use in
16599 if (++punct_count > max_distance) {
16603 /* Treat the punctuation as a typo. */
16604 input_text[name_len++] = *p;
16607 else if (isUPPER(*p)) { /* Use lowercase for lookup */
16608 input_text[name_len++] = toLOWER(*p);
16610 found_problem = TRUE;
16612 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
16613 input_text[name_len++] = *p;
16617 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
16621 /* The declaration of 'input_text' is how long we allow a potential
16622 * class name to be, before saying they didn't mean a class name at
16624 if (name_len >= C_ARRAY_LENGTH(input_text)) {
16629 /* We get to here when the possible class name hasn't been properly
16630 * terminated before:
16631 * 1) we ran off the end of the pattern; or
16632 * 2) found two characters, each of which might have been intended to
16633 * be the name's terminator
16634 * 3) found so many punctuation characters in the purported name,
16635 * that the edit distance to a valid one is exceeded
16636 * 4) we decided it was more characters than anyone could have
16637 * intended to be one. */
16639 found_problem = TRUE;
16641 /* In the final two cases, we know that looking up what we've
16642 * accumulated won't lead to a match, even a fuzzy one. */
16643 if ( name_len >= C_ARRAY_LENGTH(input_text)
16644 || punct_count > max_distance)
16646 /* If there was an intermediate key character that could have been
16647 * an intended end, redo the parse, but stop there */
16648 if (possible_end && possible_end != (char *) -1) {
16649 possible_end = (char *) -1; /* Special signal value to say
16650 we've done a first pass */
16655 /* Otherwise, it can't have meant to have been a class */
16656 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16659 /* If we ran off the end, and the final character was a punctuation
16660 * one, back up one, to look at that final one just below. Later, we
16661 * will restore the parse pointer if appropriate */
16662 if (name_len && p == e && isPUNCT(*(p-1))) {
16667 if (p < e && isPUNCT(*p)) {
16669 has_terminating_bracket = TRUE;
16671 /* If this is a 2nd ']', and the first one is just below this
16672 * one, consider that to be the real terminator. This gives a
16673 * uniform and better positioning for the warning message */
16675 && possible_end != (char *) -1
16676 && *possible_end == ']'
16677 && name_len && input_text[name_len - 1] == ']')
16682 /* And this is actually equivalent to having done the 2nd
16683 * pass now, so set it to not try again */
16684 possible_end = (char *) -1;
16689 has_terminating_colon = TRUE;
16691 else if (*p == ';') {
16692 has_semi_colon = TRUE;
16693 has_terminating_colon = TRUE;
16701 /* Here, we have a class name to look up. We can short circuit the
16702 * stuff below for short names that can't possibly be meant to be a
16703 * class name. (We can do this on the first pass, as any second pass
16704 * will yield an even shorter name) */
16705 if (name_len < 3) {
16706 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16709 /* Find which class it is. Initially switch on the length of the name.
16711 switch (name_len) {
16713 if (memEQs(name_start, 4, "word")) {
16714 /* this is not POSIX, this is the Perl \w */
16715 class_number = ANYOF_WORDCHAR;
16719 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16720 * graph lower print punct space upper
16721 * Offset 4 gives the best switch position. */
16722 switch (name_start[4]) {
16724 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16725 class_number = ANYOF_ALPHA;
16728 if (memBEGINs(name_start, 5, "spac")) /* space */
16729 class_number = ANYOF_SPACE;
16732 if (memBEGINs(name_start, 5, "grap")) /* graph */
16733 class_number = ANYOF_GRAPH;
16736 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16737 class_number = ANYOF_ASCII;
16740 if (memBEGINs(name_start, 5, "blan")) /* blank */
16741 class_number = ANYOF_BLANK;
16744 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16745 class_number = ANYOF_CNTRL;
16748 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16749 class_number = ANYOF_ALPHANUMERIC;
16752 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16753 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16754 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16755 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16758 if (memBEGINs(name_start, 5, "digi")) /* digit */
16759 class_number = ANYOF_DIGIT;
16760 else if (memBEGINs(name_start, 5, "prin")) /* print */
16761 class_number = ANYOF_PRINT;
16762 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16763 class_number = ANYOF_PUNCT;
16768 if (memEQs(name_start, 6, "xdigit"))
16769 class_number = ANYOF_XDIGIT;
16773 /* If the name exactly matches a posix class name the class number will
16774 * here be set to it, and the input almost certainly was meant to be a
16775 * posix class, so we can skip further checking. If instead the syntax
16776 * is exactly correct, but the name isn't one of the legal ones, we
16777 * will return that as an error below. But if neither of these apply,
16778 * it could be that no posix class was intended at all, or that one
16779 * was, but there was a typo. We tease these apart by doing fuzzy
16780 * matching on the name */
16781 if (class_number == OOB_NAMEDCLASS && found_problem) {
16782 const UV posix_names[][6] = {
16783 { 'a', 'l', 'n', 'u', 'm' },
16784 { 'a', 'l', 'p', 'h', 'a' },
16785 { 'a', 's', 'c', 'i', 'i' },
16786 { 'b', 'l', 'a', 'n', 'k' },
16787 { 'c', 'n', 't', 'r', 'l' },
16788 { 'd', 'i', 'g', 'i', 't' },
16789 { 'g', 'r', 'a', 'p', 'h' },
16790 { 'l', 'o', 'w', 'e', 'r' },
16791 { 'p', 'r', 'i', 'n', 't' },
16792 { 'p', 'u', 'n', 'c', 't' },
16793 { 's', 'p', 'a', 'c', 'e' },
16794 { 'u', 'p', 'p', 'e', 'r' },
16795 { 'w', 'o', 'r', 'd' },
16796 { 'x', 'd', 'i', 'g', 'i', 't' }
16798 /* The names of the above all have added NULs to make them the same
16799 * size, so we need to also have the real lengths */
16800 const UV posix_name_lengths[] = {
16801 sizeof("alnum") - 1,
16802 sizeof("alpha") - 1,
16803 sizeof("ascii") - 1,
16804 sizeof("blank") - 1,
16805 sizeof("cntrl") - 1,
16806 sizeof("digit") - 1,
16807 sizeof("graph") - 1,
16808 sizeof("lower") - 1,
16809 sizeof("print") - 1,
16810 sizeof("punct") - 1,
16811 sizeof("space") - 1,
16812 sizeof("upper") - 1,
16813 sizeof("word") - 1,
16814 sizeof("xdigit")- 1
16817 int temp_max = max_distance; /* Use a temporary, so if we
16818 reparse, we haven't changed the
16821 /* Use a smaller max edit distance if we are missing one of the
16823 if ( has_opening_bracket + has_opening_colon < 2
16824 || has_terminating_bracket + has_terminating_colon < 2)
16829 /* See if the input name is close to a legal one */
16830 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16832 /* Short circuit call if the lengths are too far apart to be
16834 if (abs( (int) (name_len - posix_name_lengths[i]))
16840 if (edit_distance(input_text,
16843 posix_name_lengths[i],
16847 { /* If it is close, it probably was intended to be a class */
16848 goto probably_meant_to_be;
16852 /* Here the input name is not close enough to a valid class name
16853 * for us to consider it to be intended to be a posix class. If
16854 * we haven't already done so, and the parse found a character that
16855 * could have been terminators for the name, but which we absorbed
16856 * as typos during the first pass, repeat the parse, signalling it
16857 * to stop at that character */
16858 if (possible_end && possible_end != (char *) -1) {
16859 possible_end = (char *) -1;
16864 /* Here neither pass found a close-enough class name */
16865 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16868 probably_meant_to_be:
16870 /* Here we think that a posix specification was intended. Update any
16872 if (updated_parse_ptr) {
16873 *updated_parse_ptr = (char *) p;
16876 /* If a posix class name was intended but incorrectly specified, we
16877 * output or return the warnings */
16878 if (found_problem) {
16880 /* We set flags for these issues in the parse loop above instead of
16881 * adding them to the list of warnings, because we can parse it
16882 * twice, and we only want one warning instance */
16884 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16887 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16889 if (has_semi_colon) {
16890 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16892 else if (! has_terminating_colon) {
16893 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16895 if (! has_terminating_bracket) {
16896 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16899 if ( posix_warnings
16901 && av_count(RExC_warn_text) > 0)
16903 *posix_warnings = RExC_warn_text;
16906 else if (class_number != OOB_NAMEDCLASS) {
16907 /* If it is a known class, return the class. The class number
16908 * #defines are structured so each complement is +1 to the normal
16910 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16912 else if (! check_only) {
16914 /* Here, it is an unrecognized class. This is an error (unless the
16915 * call is to check only, which we've already handled above) */
16916 const char * const complement_string = (complement)
16919 RExC_parse_set((char *) p);
16920 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16922 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16926 return OOB_NAMEDCLASS;
16928 #undef ADD_POSIX_WARNING
16930 STATIC unsigned int
16931 S_regex_set_precedence(const U8 my_operator) {
16933 /* Returns the precedence in the (?[...]) construct of the input operator,
16934 * specified by its character representation. The precedence follows
16935 * general Perl rules, but it extends this so that ')' and ']' have (low)
16936 * precedence even though they aren't really operators */
16938 switch (my_operator) {
16954 NOT_REACHED; /* NOTREACHED */
16955 return 0; /* Silence compiler warning */
16958 STATIC regnode_offset
16959 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16960 I32 *flagp, U32 depth)
16962 /* Handle the (?[...]) construct to do set operations */
16964 U8 curchar; /* Current character being parsed */
16965 UV start, end; /* End points of code point ranges */
16966 SV* final = NULL; /* The end result inversion list */
16967 SV* result_string; /* 'final' stringified */
16968 AV* stack; /* stack of operators and operands not yet
16970 AV* fence_stack = NULL; /* A stack containing the positions in
16971 'stack' of where the undealt-with left
16972 parens would be if they were actually
16974 /* The 'volatile' is a workaround for an optimiser bug
16975 * in Solaris Studio 12.3. See RT #127455 */
16976 volatile IV fence = 0; /* Position of where most recent undealt-
16977 with left paren in stack is; -1 if none.
16979 STRLEN len; /* Temporary */
16980 regnode_offset node; /* Temporary, and final regnode returned by
16982 const bool save_fold = FOLD; /* Temporary */
16983 char *save_end, *save_parse; /* Temporaries */
16984 const bool in_locale = LOC; /* we turn off /l during processing */
16986 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16988 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16990 DEBUG_PARSE("xcls");
16993 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16996 /* The use of this operator implies /u. This is required so that the
16997 * compile time values are valid in all runtime cases */
16998 REQUIRE_UNI_RULES(flagp, 0);
17000 /* Everything in this construct is a metacharacter. Operands begin with
17001 * either a '\' (for an escape sequence), or a '[' for a bracketed
17002 * character class. Any other character should be an operator, or
17003 * parenthesis for grouping. Both types of operands are handled by calling
17004 * regclass() to parse them. It is called with a parameter to indicate to
17005 * return the computed inversion list. The parsing here is implemented via
17006 * a stack. Each entry on the stack is a single character representing one
17007 * of the operators; or else a pointer to an operand inversion list. */
17009 #define IS_OPERATOR(a) SvIOK(a)
17010 #define IS_OPERAND(a) (! IS_OPERATOR(a))
17012 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
17013 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
17014 * with pronouncing it called it Reverse Polish instead, but now that YOU
17015 * know how to pronounce it you can use the correct term, thus giving due
17016 * credit to the person who invented it, and impressing your geek friends.
17017 * Wikipedia says that the pronounciation of "Ł" has been changing so that
17018 * it is now more like an English initial W (as in wonk) than an L.)
17020 * This means that, for example, 'a | b & c' is stored on the stack as
17028 * where the numbers in brackets give the stack [array] element number.
17029 * In this implementation, parentheses are not stored on the stack.
17030 * Instead a '(' creates a "fence" so that the part of the stack below the
17031 * fence is invisible except to the corresponding ')' (this allows us to
17032 * replace testing for parens, by using instead subtraction of the fence
17033 * position). As new operands are processed they are pushed onto the stack
17034 * (except as noted in the next paragraph). New operators of higher
17035 * precedence than the current final one are inserted on the stack before
17036 * the lhs operand (so that when the rhs is pushed next, everything will be
17037 * in the correct positions shown above. When an operator of equal or
17038 * lower precedence is encountered in parsing, all the stacked operations
17039 * of equal or higher precedence are evaluated, leaving the result as the
17040 * top entry on the stack. This makes higher precedence operations
17041 * evaluate before lower precedence ones, and causes operations of equal
17042 * precedence to left associate.
17044 * The only unary operator '!' is immediately pushed onto the stack when
17045 * encountered. When an operand is encountered, if the top of the stack is
17046 * a '!", the complement is immediately performed, and the '!' popped. The
17047 * resulting value is treated as a new operand, and the logic in the
17048 * previous paragraph is executed. Thus in the expression
17050 * the stack looks like
17056 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
17063 * A ')' is treated as an operator with lower precedence than all the
17064 * aforementioned ones, which causes all operations on the stack above the
17065 * corresponding '(' to be evaluated down to a single resultant operand.
17066 * Then the fence for the '(' is removed, and the operand goes through the
17067 * algorithm above, without the fence.
17069 * A separate stack is kept of the fence positions, so that the position of
17070 * the latest so-far unbalanced '(' is at the top of it.
17072 * The ']' ending the construct is treated as the lowest operator of all,
17073 * so that everything gets evaluated down to a single operand, which is the
17076 stack = (AV*)newSV_type_mortal(SVt_PVAV);
17077 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
17079 while (RExC_parse < RExC_end) {
17080 I32 top_index; /* Index of top-most element in 'stack' */
17081 SV** top_ptr; /* Pointer to top 'stack' element */
17082 SV* current = NULL; /* To contain the current inversion list
17084 SV* only_to_avoid_leaks;
17086 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
17087 TRUE /* Force /x */ );
17088 if (RExC_parse >= RExC_end) { /* Fail */
17092 curchar = UCHARAT(RExC_parse);
17096 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17097 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
17098 DEBUG_U(dump_regex_sets_structures(pRExC_state,
17099 stack, fence, fence_stack));
17102 top_index = av_tindex_skip_len_mg(stack);
17105 SV** stacked_ptr; /* Ptr to something already on 'stack' */
17106 char stacked_operator; /* The topmost operator on the 'stack'. */
17107 SV* lhs; /* Operand to the left of the operator */
17108 SV* rhs; /* Operand to the right of the operator */
17109 SV* fence_ptr; /* Pointer to top element of the fence
17113 if ( RExC_parse < RExC_end - 2
17114 && UCHARAT(RExC_parse + 1) == '?'
17115 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
17117 const regnode_offset orig_emit = RExC_emit;
17118 SV * resultant_invlist;
17120 /* Here it could be an embedded '(?flags:(?[...])'.
17121 * This happens when we have some thing like
17123 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
17125 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
17127 * Here we would be handling the interpolated
17128 * '$thai_or_lao'. We handle this by a recursive call to
17129 * reg which returns the inversion list the
17130 * interpolated expression evaluates to. Actually, the
17131 * return is a special regnode containing a pointer to that
17132 * inversion list. If the return isn't that regnode alone,
17133 * we know that this wasn't such an interpolation, which is
17134 * an error: we need to get a single inversion list back
17135 * from the recursion */
17137 RExC_parse_inc_by(1);
17140 node = reg(pRExC_state, 2, flagp, depth+1);
17141 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17143 if ( OP(REGNODE_p(node)) != REGEX_SET
17144 /* If more than a single node returned, the nested
17145 * parens evaluated to more than just a (?[...]),
17146 * which isn't legal */
17147 || RExC_emit != orig_emit
17148 + NODE_STEP_REGNODE
17149 + REGNODE_ARG_LEN(REGEX_SET))
17151 vFAIL("Expecting interpolated extended charclass");
17153 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
17154 current = invlist_clone(resultant_invlist, NULL);
17155 SvREFCNT_dec(resultant_invlist);
17158 RExC_emit = orig_emit;
17159 goto handle_operand;
17162 /* A regular '('. Look behind for illegal syntax */
17163 if (top_index - fence >= 0) {
17164 /* If the top entry on the stack is an operator, it had
17165 * better be a '!', otherwise the entry below the top
17166 * operand should be an operator */
17167 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
17168 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
17169 || ( IS_OPERAND(*top_ptr)
17170 && ( top_index - fence < 1
17171 || ! (stacked_ptr = av_fetch(stack,
17174 || ! IS_OPERATOR(*stacked_ptr))))
17176 RExC_parse_inc_by(1);
17177 vFAIL("Unexpected '(' with no preceding operator");
17181 /* Stack the position of this undealt-with left paren */
17182 av_push(fence_stack, newSViv(fence));
17183 fence = top_index + 1;
17187 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
17188 * multi-char folds are allowed. */
17189 if (!regclass(pRExC_state, flagp, depth+1,
17190 TRUE, /* means parse just the next thing */
17191 FALSE, /* don't allow multi-char folds */
17192 FALSE, /* don't silence non-portable warnings. */
17194 FALSE, /* Require return to be an ANYOF */
17197 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17198 goto regclass_failed;
17203 /* regclass() will return with parsing just the \ sequence,
17204 * leaving the parse pointer at the next thing to parse */
17206 goto handle_operand;
17208 case '[': /* Is a bracketed character class */
17210 /* See if this is a [:posix:] class. */
17211 bool is_posix_class = (OOB_NAMEDCLASS
17212 < handle_possible_posix(pRExC_state,
17216 TRUE /* checking only */));
17217 /* If it is a posix class, leave the parse pointer at the '['
17218 * to fool regclass() into thinking it is part of a
17219 * '[[:posix:]]'. */
17220 if (! is_posix_class) {
17221 RExC_parse_inc_by(1);
17224 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
17225 * multi-char folds are allowed. */
17226 if (!regclass(pRExC_state, flagp, depth+1,
17227 is_posix_class, /* parse the whole char
17228 class only if not a
17230 FALSE, /* don't allow multi-char folds */
17231 TRUE, /* silence non-portable warnings. */
17233 FALSE, /* Require return to be an ANYOF */
17236 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17237 goto regclass_failed;
17242 /* function call leaves parse pointing to the ']', except if we
17244 if (is_posix_class) {
17248 goto handle_operand;
17252 if (top_index >= 1) {
17253 goto join_operators;
17256 /* Only a single operand on the stack: are done */
17260 if (av_tindex_skip_len_mg(fence_stack) < 0) {
17261 if (UCHARAT(RExC_parse - 1) == ']') {
17264 RExC_parse_inc_by(1);
17265 vFAIL("Unexpected ')'");
17268 /* If nothing after the fence, is missing an operand */
17269 if (top_index - fence < 0) {
17270 RExC_parse_inc_by(1);
17273 /* If at least two things on the stack, treat this as an
17275 if (top_index - fence >= 1) {
17276 goto join_operators;
17279 /* Here only a single thing on the fenced stack, and there is a
17280 * fence. Get rid of it */
17281 fence_ptr = av_pop(fence_stack);
17283 fence = SvIV(fence_ptr);
17284 SvREFCNT_dec_NN(fence_ptr);
17291 /* Having gotten rid of the fence, we pop the operand at the
17292 * stack top and process it as a newly encountered operand */
17293 current = av_pop(stack);
17294 if (IS_OPERAND(current)) {
17295 goto handle_operand;
17298 RExC_parse_inc_by(1);
17307 /* These binary operators should have a left operand already
17309 if ( top_index - fence < 0
17310 || top_index - fence == 1
17311 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
17312 || ! IS_OPERAND(*top_ptr))
17314 goto unexpected_binary;
17317 /* If only the one operand is on the part of the stack visible
17318 * to us, we just place this operator in the proper position */
17319 if (top_index - fence < 2) {
17321 /* Place the operator before the operand */
17323 SV* lhs = av_pop(stack);
17324 av_push(stack, newSVuv(curchar));
17325 av_push(stack, lhs);
17329 /* But if there is something else on the stack, we need to
17330 * process it before this new operator if and only if the
17331 * stacked operation has equal or higher precedence than the
17336 /* The operator on the stack is supposed to be below both its
17338 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
17339 || IS_OPERAND(*stacked_ptr))
17341 /* But if not, it's legal and indicates we are completely
17342 * done if and only if we're currently processing a ']',
17343 * which should be the final thing in the expression */
17344 if (curchar == ']') {
17349 RExC_parse_inc_by(1);
17350 vFAIL2("Unexpected binary operator '%c' with no "
17351 "preceding operand", curchar);
17353 stacked_operator = (char) SvUV(*stacked_ptr);
17355 if (regex_set_precedence(curchar)
17356 > regex_set_precedence(stacked_operator))
17358 /* Here, the new operator has higher precedence than the
17359 * stacked one. This means we need to add the new one to
17360 * the stack to await its rhs operand (and maybe more
17361 * stuff). We put it before the lhs operand, leaving
17362 * untouched the stacked operator and everything below it
17364 lhs = av_pop(stack);
17365 assert(IS_OPERAND(lhs));
17367 av_push(stack, newSVuv(curchar));
17368 av_push(stack, lhs);
17372 /* Here, the new operator has equal or lower precedence than
17373 * what's already there. This means the operation already
17374 * there should be performed now, before the new one. */
17376 rhs = av_pop(stack);
17377 if (! IS_OPERAND(rhs)) {
17379 /* This can happen when a ! is not followed by an operand,
17380 * like in /(?[\t &!])/ */
17384 lhs = av_pop(stack);
17386 if (! IS_OPERAND(lhs)) {
17388 /* This can happen when there is an empty (), like in
17389 * /(?[[0]+()+])/ */
17393 switch (stacked_operator) {
17395 _invlist_intersection(lhs, rhs, &rhs);
17400 _invlist_union(lhs, rhs, &rhs);
17404 _invlist_subtract(lhs, rhs, &rhs);
17407 case '^': /* The union minus the intersection */
17412 _invlist_union(lhs, rhs, &u);
17413 _invlist_intersection(lhs, rhs, &i);
17414 _invlist_subtract(u, i, &rhs);
17415 SvREFCNT_dec_NN(i);
17416 SvREFCNT_dec_NN(u);
17422 /* Here, the higher precedence operation has been done, and the
17423 * result is in 'rhs'. We overwrite the stacked operator with
17424 * the result. Then we redo this code to either push the new
17425 * operator onto the stack or perform any higher precedence
17426 * stacked operation */
17427 only_to_avoid_leaks = av_pop(stack);
17428 SvREFCNT_dec(only_to_avoid_leaks);
17429 av_push(stack, rhs);
17432 case '!': /* Highest priority, right associative */
17434 /* If what's already at the top of the stack is another '!",
17435 * they just cancel each other out */
17436 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
17437 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
17439 only_to_avoid_leaks = av_pop(stack);
17440 SvREFCNT_dec(only_to_avoid_leaks);
17442 else { /* Otherwise, since it's right associative, just push
17444 av_push(stack, newSVuv(curchar));
17450 if (RExC_parse >= RExC_end) {
17453 vFAIL("Unexpected character");
17457 /* Here 'current' is the operand. If something is already on the
17458 * stack, we have to check if it is a !. But first, the code above
17459 * may have altered the stack in the time since we earlier set
17462 top_index = av_tindex_skip_len_mg(stack);
17463 if (top_index - fence >= 0) {
17464 /* If the top entry on the stack is an operator, it had better
17465 * be a '!', otherwise the entry below the top operand should
17466 * be an operator */
17467 top_ptr = av_fetch(stack, top_index, FALSE);
17469 if (IS_OPERATOR(*top_ptr)) {
17471 /* The only permissible operator at the top of the stack is
17472 * '!', which is applied immediately to this operand. */
17473 curchar = (char) SvUV(*top_ptr);
17474 if (curchar != '!') {
17475 SvREFCNT_dec(current);
17476 vFAIL2("Unexpected binary operator '%c' with no "
17477 "preceding operand", curchar);
17480 _invlist_invert(current);
17482 only_to_avoid_leaks = av_pop(stack);
17483 SvREFCNT_dec(only_to_avoid_leaks);
17485 /* And we redo with the inverted operand. This allows
17486 * handling multiple ! in a row */
17487 goto handle_operand;
17489 /* Single operand is ok only for the non-binary ')'
17491 else if ((top_index - fence == 0 && curchar != ')')
17492 || (top_index - fence > 0
17493 && (! (stacked_ptr = av_fetch(stack,
17496 || IS_OPERAND(*stacked_ptr))))
17498 SvREFCNT_dec(current);
17499 vFAIL("Operand with no preceding operator");
17503 /* Here there was nothing on the stack or the top element was
17504 * another operand. Just add this new one */
17505 av_push(stack, current);
17507 } /* End of switch on next parse token */
17510 } /* End of loop parsing through the construct */
17512 vFAIL("Syntax error in (?[...])");
17516 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
17517 if (RExC_parse < RExC_end) {
17518 RExC_parse_inc_by(1);
17521 vFAIL("Unexpected ']' with no following ')' in (?[...");
17524 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
17525 vFAIL("Unmatched (");
17528 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
17529 || ((final = av_pop(stack)) == NULL)
17530 || ! IS_OPERAND(final)
17531 || ! is_invlist(final)
17532 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
17535 SvREFCNT_dec(final);
17536 vFAIL("Incomplete expression within '(?[ ])'");
17539 /* Here, 'final' is the resultant inversion list from evaluating the
17540 * expression. Return it if so requested */
17541 if (return_invlist) {
17542 *return_invlist = final;
17546 if (RExC_sets_depth) { /* If within a recursive call, return in a special
17548 RExC_parse_inc_by(1);
17549 node = regpnode(pRExC_state, REGEX_SET, final);
17553 /* Otherwise generate a resultant node, based on 'final'. regclass()
17554 * is expecting a string of ranges and individual code points */
17555 invlist_iterinit(final);
17556 result_string = newSVpvs("");
17557 while (invlist_iternext(final, &start, &end)) {
17558 if (start == end) {
17559 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
17562 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
17563 UVXf "}", start, end);
17567 /* About to generate an ANYOF (or similar) node from the inversion list
17568 * we have calculated */
17569 save_parse = RExC_parse;
17570 RExC_parse_set(SvPV(result_string, len));
17571 save_end = RExC_end;
17572 RExC_end = RExC_parse + len;
17573 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
17575 /* We turn off folding around the call, as the class we have
17576 * constructed already has all folding taken into consideration, and we
17577 * don't want regclass() to add to that */
17578 RExC_flags &= ~RXf_PMf_FOLD;
17579 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
17580 * folds are allowed. */
17581 node = regclass(pRExC_state, flagp, depth+1,
17582 FALSE, /* means parse the whole char class */
17583 FALSE, /* don't allow multi-char folds */
17584 TRUE, /* silence non-portable warnings. The above may
17585 very well have generated non-portable code
17586 points, but they're valid on this machine */
17587 FALSE, /* similarly, no need for strict */
17589 /* We can optimize into something besides an ANYOF,
17590 * except under /l, which needs to be ANYOF because of
17591 * runtime checks for locale sanity, etc */
17597 RExC_parse_set(save_parse + 1);
17598 RExC_end = save_end;
17599 SvREFCNT_dec_NN(final);
17600 SvREFCNT_dec_NN(result_string);
17603 RExC_flags |= RXf_PMf_FOLD;
17607 RETURN_FAIL_ON_RESTART(*flagp, flagp);
17608 goto regclass_failed;
17611 /* Fix up the node type if we are in locale. (We have pretended we are
17612 * under /u for the purposes of regclass(), as this construct will only
17613 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
17614 * (so as to cause any warnings about bad locales to be output in
17615 * regexec.c), and add the flag that indicates to check if not in a
17616 * UTF-8 locale. The reason we above forbid optimization into
17617 * something other than an ANYOF node is simply to minimize the number
17618 * of code changes in regexec.c. Otherwise we would have to create new
17619 * EXACTish node types and deal with them. This decision could be
17620 * revisited should this construct become popular.
17622 * (One might think we could look at the resulting ANYOF node and
17623 * suppress the flag if everything is above 255, as those would be
17624 * UTF-8 only, but this isn't true, as the components that led to that
17625 * result could have been locale-affected, and just happen to cancel
17626 * each other out under UTF-8 locales.) */
17628 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
17630 assert(OP(REGNODE_p(node)) == ANYOF);
17632 OP(REGNODE_p(node)) = ANYOFL;
17633 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
17637 nextchar(pRExC_state);
17641 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17645 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17648 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17649 AV * stack, const IV fence, AV * fence_stack)
17650 { /* Dumps the stacks in handle_regex_sets() */
17652 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17653 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17656 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17658 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17660 if (stack_top < 0) {
17661 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17664 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17665 for (i = stack_top; i >= 0; i--) {
17666 SV ** element_ptr = av_fetch(stack, i, FALSE);
17667 if (! element_ptr) {
17670 if (IS_OPERATOR(*element_ptr)) {
17671 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17672 (int) i, (int) SvIV(*element_ptr));
17675 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17676 sv_dump(*element_ptr);
17681 if (fence_stack_top < 0) {
17682 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17685 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17686 for (i = fence_stack_top; i >= 0; i--) {
17687 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17688 if (! element_ptr) {
17691 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17692 (int) i, (int) SvIV(*element_ptr));
17703 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17705 /* This adds the Latin1/above-Latin1 folding rules.
17707 * This should be called only for a Latin1-range code points, cp, which is
17708 * known to be involved in a simple fold with other code points above
17709 * Latin1. It would give false results if /aa has been specified.
17710 * Multi-char folds are outside the scope of this, and must be handled
17713 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17715 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17717 /* The rules that are valid for all Unicode versions are hard-coded in */
17722 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17726 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17729 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17730 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17732 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17733 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17734 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17736 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17737 *invlist = add_cp_to_invlist(*invlist,
17738 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17741 default: /* Other code points are checked against the data for the
17742 current Unicode version */
17744 Size_t folds_count;
17746 const U32 * remaining_folds;
17750 folded_cp = toFOLD(cp);
17753 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17755 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17758 if (folded_cp > 255) {
17759 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17762 folds_count = _inverse_folds(folded_cp, &first_fold,
17764 if (folds_count == 0) {
17766 /* Use deprecated warning to increase the chances of this being
17768 ckWARN2reg_d(RExC_parse,
17769 "Perl folding rules are not up-to-date for 0x%02X;"
17770 " please use the perlbug utility to report;", cp);
17775 if (first_fold > 255) {
17776 *invlist = add_cp_to_invlist(*invlist, first_fold);
17778 for (i = 0; i < folds_count - 1; i++) {
17779 if (remaining_folds[i] > 255) {
17780 *invlist = add_cp_to_invlist(*invlist,
17781 remaining_folds[i]);
17791 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17793 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17797 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17799 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17801 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17802 CLEAR_POSIX_WARNINGS();
17806 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17807 if (first_is_fatal) { /* Avoid leaking this */
17808 av_undef(posix_warnings); /* This isn't necessary if the
17809 array is mortal, but is a
17811 (void) sv_2mortal(msg);
17814 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17815 SvREFCNT_dec_NN(msg);
17818 UPDATE_WARNINGS_LOC(RExC_parse);
17821 PERL_STATIC_INLINE Size_t
17822 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17824 const U8 * const start = s1;
17825 const U8 * const send = start + max;
17827 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17829 while (s1 < send && *s1 == *s2) {
17837 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17839 /* This adds the string scalar <multi_string> to the array
17840 * <multi_char_matches>. <multi_string> is known to have exactly
17841 * <cp_count> code points in it. This is used when constructing a
17842 * bracketed character class and we find something that needs to match more
17843 * than a single character.
17845 * <multi_char_matches> is actually an array of arrays. Each top-level
17846 * element is an array that contains all the strings known so far that are
17847 * the same length. And that length (in number of code points) is the same
17848 * as the index of the top-level array. Hence, the [2] element is an
17849 * array, each element thereof is a string containing TWO code points;
17850 * while element [3] is for strings of THREE characters, and so on. Since
17851 * this is for multi-char strings there can never be a [0] nor [1] element.
17853 * When we rewrite the character class below, we will do so such that the
17854 * longest strings are written first, so that it prefers the longest
17855 * matching strings first. This is done even if it turns out that any
17856 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17857 * Christiansen has agreed that this is ok. This makes the test for the
17858 * ligature 'ffi' come before the test for 'ff', for example */
17861 AV** this_array_ptr;
17863 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17865 if (! multi_char_matches) {
17866 multi_char_matches = newAV();
17869 if (av_exists(multi_char_matches, cp_count)) {
17870 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17871 this_array = *this_array_ptr;
17874 this_array = newAV();
17875 av_store(multi_char_matches, cp_count,
17878 av_push(this_array, multi_string);
17880 return multi_char_matches;
17883 /* The names of properties whose definitions are not known at compile time are
17884 * stored in this SV, after a constant heading. So if the length has been
17885 * changed since initialization, then there is a run-time definition. */
17886 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17887 (SvCUR(listsv) != initial_listsv_len)
17889 /* There is a restricted set of white space characters that are legal when
17890 * ignoring white space in a bracketed character class. This generates the
17891 * code to skip them.
17893 * There is a line below that uses the same white space criteria but is outside
17894 * this macro. Both here and there must use the same definition */
17895 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
17898 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
17905 STATIC regnode_offset
17906 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17907 const bool stop_at_1, /* Just parse the next thing, don't
17908 look for a full character class */
17909 bool allow_mutiple_chars,
17910 const bool silence_non_portable, /* Don't output warnings
17914 bool optimizable, /* ? Allow a non-ANYOF return
17916 SV** ret_invlist /* Return an inversion list, not a node */
17919 /* parse a bracketed class specification. Most of these will produce an
17920 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17921 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17922 * under /i with multi-character folds: it will be rewritten following the
17923 * paradigm of this example, where the <multi-fold>s are characters which
17924 * fold to multiple character sequences:
17925 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17926 * gets effectively rewritten as:
17927 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17928 * reg() gets called (recursively) on the rewritten version, and this
17929 * function will return what it constructs. (Actually the <multi-fold>s
17930 * aren't physically removed from the [abcdefghi], it's just that they are
17931 * ignored in the recursion by means of a flag:
17932 * <RExC_in_multi_char_class>.)
17934 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17935 * characters, with the corresponding bit set if that character is in the
17936 * list. For characters above this, an inversion list is used. There
17937 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17938 * determinable at compile time
17940 * On success, returns the offset at which any next node should be placed
17941 * into the regex engine program being compiled.
17943 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17944 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17948 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17950 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17951 regnode_offset ret = -1; /* Initialized to an illegal value */
17953 int namedclass = OOB_NAMEDCLASS;
17954 char *rangebegin = NULL;
17955 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17956 aren't available at the time this was called */
17957 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17958 than just initialized. */
17959 SV* properties = NULL; /* Code points that match \p{} \P{} */
17960 SV* posixes = NULL; /* Code points that match classes like [:word:],
17961 extended beyond the Latin1 range. These have to
17962 be kept separate from other code points for much
17963 of this function because their handling is
17964 different under /i, and for most classes under
17966 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17967 separate for a while from the non-complemented
17968 versions because of complications with /d
17970 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17971 treated more simply than the general case,
17972 leading to less compilation and execution
17974 UV element_count = 0; /* Number of distinct elements in the class.
17975 Optimizations may be possible if this is tiny */
17976 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17977 character; used under /i */
17979 char * stop_ptr = RExC_end; /* where to stop parsing */
17981 /* ignore unescaped whitespace? */
17982 const bool skip_white = cBOOL( ret_invlist
17983 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17985 /* inversion list of code points this node matches only when the target
17986 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17988 SV* upper_latin1_only_utf8_matches = NULL;
17990 /* Inversion list of code points this node matches regardless of things
17991 * like locale, folding, utf8ness of the target string */
17992 SV* cp_list = NULL;
17994 /* Like cp_list, but code points on this list need to be checked for things
17995 * that fold to/from them under /i */
17996 SV* cp_foldable_list = NULL;
17998 /* Like cp_list, but code points on this list are valid only when the
17999 * runtime locale is UTF-8 */
18000 SV* only_utf8_locale_list = NULL;
18002 /* In a range, if one of the endpoints is non-character-set portable,
18003 * meaning that it hard-codes a code point that may mean a different
18004 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
18005 * mnemonic '\t' which each mean the same character no matter which
18006 * character set the platform is on. */
18007 unsigned int non_portable_endpoint = 0;
18009 /* Is the range unicode? which means on a platform that isn't 1-1 native
18010 * to Unicode (i.e. non-ASCII), each code point in it should be considered
18011 * to be a Unicode value. */
18012 bool unicode_range = FALSE;
18013 bool invert = FALSE; /* Is this class to be complemented */
18015 bool warn_super = ALWAYS_WARN_SUPER;
18017 const char * orig_parse = RExC_parse;
18019 /* This variable is used to mark where the end in the input is of something
18020 * that looks like a POSIX construct but isn't. During the parse, when
18021 * something looks like it could be such a construct is encountered, it is
18022 * checked for being one, but not if we've already checked this area of the
18023 * input. Only after this position is reached do we check again */
18024 char *not_posix_region_end = RExC_parse - 1;
18026 AV* posix_warnings = NULL;
18027 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
18028 U8 op = ANYOF; /* The returned node-type, initialized to the expected
18030 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
18031 U32 posixl = 0; /* bit field of posix classes matched under /l */
18034 /* Flags as to what things aren't knowable until runtime. (Note that these are
18035 * mutually exclusive.) */
18036 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
18037 haven't been defined as of yet */
18038 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
18040 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
18041 what gets folded */
18042 U32 has_runtime_dependency = 0; /* OR of the above flags */
18044 DECLARE_AND_GET_RE_DEBUG_FLAGS;
18046 PERL_ARGS_ASSERT_REGCLASS;
18048 PERL_UNUSED_ARG(depth);
18051 assert(! (ret_invlist && allow_mutiple_chars));
18053 /* If wants an inversion list returned, we can't optimize to something
18056 optimizable = FALSE;
18059 DEBUG_PARSE("clas");
18061 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
18062 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
18063 && UNICODE_DOT_DOT_VERSION == 0)
18064 allow_mutiple_chars = FALSE;
18067 /* We include the /i status at the beginning of this so that we can
18068 * know it at runtime */
18069 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
18070 initial_listsv_len = SvCUR(listsv);
18071 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
18073 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18075 assert(RExC_parse <= RExC_end);
18077 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
18078 RExC_parse_inc_by(1);
18080 allow_mutiple_chars = FALSE;
18082 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18085 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
18086 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
18087 int maybe_class = handle_possible_posix(pRExC_state,
18089 ¬_posix_region_end,
18091 TRUE /* checking only */);
18092 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
18093 ckWARN4reg(not_posix_region_end,
18094 "POSIX syntax [%c %c] belongs inside character classes%s",
18095 *RExC_parse, *RExC_parse,
18096 (maybe_class == OOB_NAMEDCLASS)
18097 ? ((POSIXCC_NOTYET(*RExC_parse))
18098 ? " (but this one isn't implemented)"
18099 : " (but this one isn't fully valid)")
18105 /* If the caller wants us to just parse a single element, accomplish this
18106 * by faking the loop ending condition */
18107 if (stop_at_1 && RExC_end > RExC_parse) {
18108 stop_ptr = RExC_parse + 1;
18111 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
18112 if (UCHARAT(RExC_parse) == ']')
18113 goto charclassloop;
18117 if ( posix_warnings
18118 && av_tindex_skip_len_mg(posix_warnings) >= 0
18119 && RExC_parse > not_posix_region_end)
18121 /* Warnings about posix class issues are considered tentative until
18122 * we are far enough along in the parse that we can no longer
18123 * change our mind, at which point we output them. This is done
18124 * each time through the loop so that a later class won't zap them
18125 * before they have been dealt with. */
18126 output_posix_warnings(pRExC_state, posix_warnings);
18129 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18131 if (RExC_parse >= stop_ptr) {
18135 if (UCHARAT(RExC_parse) == ']') {
18141 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
18142 save_value = value;
18143 save_prevvalue = prevvalue;
18146 rangebegin = RExC_parse;
18148 non_portable_endpoint = 0;
18150 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
18151 value = utf8n_to_uvchr((U8*)RExC_parse,
18152 RExC_end - RExC_parse,
18153 &numlen, UTF8_ALLOW_DEFAULT);
18154 RExC_parse_inc_by(numlen);
18157 value = UCHARAT(RExC_parse);
18158 RExC_parse_inc_by(1);
18161 if (value == '[') {
18162 char * posix_class_end;
18163 namedclass = handle_possible_posix(pRExC_state,
18166 do_posix_warnings ? &posix_warnings : NULL,
18167 FALSE /* die if error */);
18168 if (namedclass > OOB_NAMEDCLASS) {
18170 /* If there was an earlier attempt to parse this particular
18171 * posix class, and it failed, it was a false alarm, as this
18172 * successful one proves */
18173 if ( posix_warnings
18174 && av_tindex_skip_len_mg(posix_warnings) >= 0
18175 && not_posix_region_end >= RExC_parse
18176 && not_posix_region_end <= posix_class_end)
18178 av_undef(posix_warnings);
18181 RExC_parse_set(posix_class_end);
18183 else if (namedclass == OOB_NAMEDCLASS) {
18184 not_posix_region_end = posix_class_end;
18187 namedclass = OOB_NAMEDCLASS;
18190 else if ( RExC_parse - 1 > not_posix_region_end
18191 && MAYBE_POSIXCC(value))
18193 (void) handle_possible_posix(
18195 RExC_parse - 1, /* -1 because parse has already been
18197 ¬_posix_region_end,
18198 do_posix_warnings ? &posix_warnings : NULL,
18199 TRUE /* checking only */);
18201 else if ( strict && ! skip_white
18202 && ( generic_isCC_(value, CC_VERTSPACE_)
18203 || is_VERTWS_cp_high(value)))
18205 vFAIL("Literal vertical space in [] is illegal except under /x");
18207 else if (value == '\\') {
18208 /* Is a backslash; get the code point of the char after it */
18210 if (RExC_parse >= RExC_end) {
18211 vFAIL("Unmatched [");
18214 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
18215 value = utf8n_to_uvchr((U8*)RExC_parse,
18216 RExC_end - RExC_parse,
18217 &numlen, UTF8_ALLOW_DEFAULT);
18218 RExC_parse_inc_by(numlen);
18221 value = UCHARAT(RExC_parse);
18222 RExC_parse_inc_by(1);
18225 /* Some compilers cannot handle switching on 64-bit integer
18226 * values, therefore value cannot be an UV. Yes, this will
18227 * be a problem later if we want switch on Unicode.
18228 * A similar issue a little bit later when switching on
18229 * namedclass. --jhi */
18231 /* If the \ is escaping white space when white space is being
18232 * skipped, it means that that white space is wanted literally, and
18233 * is already in 'value'. Otherwise, need to translate the escape
18234 * into what it signifies. */
18235 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
18236 const char * message;
18240 case 'w': namedclass = ANYOF_WORDCHAR; break;
18241 case 'W': namedclass = ANYOF_NWORDCHAR; break;
18242 case 's': namedclass = ANYOF_SPACE; break;
18243 case 'S': namedclass = ANYOF_NSPACE; break;
18244 case 'd': namedclass = ANYOF_DIGIT; break;
18245 case 'D': namedclass = ANYOF_NDIGIT; break;
18246 case 'v': namedclass = ANYOF_VERTWS; break;
18247 case 'V': namedclass = ANYOF_NVERTWS; break;
18248 case 'h': namedclass = ANYOF_HORIZWS; break;
18249 case 'H': namedclass = ANYOF_NHORIZWS; break;
18250 case 'N': /* Handle \N{NAME} in class */
18252 const char * const backslash_N_beg = RExC_parse - 2;
18255 if (! grok_bslash_N(pRExC_state,
18256 NULL, /* No regnode */
18257 &value, /* Yes single value */
18258 &cp_count, /* Multiple code pt count */
18264 if (*flagp & NEED_UTF8)
18265 FAIL("panic: grok_bslash_N set NEED_UTF8");
18267 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
18269 if (cp_count < 0) {
18270 vFAIL("\\N in a character class must be a named character: \\N{...}");
18272 else if (cp_count == 0) {
18273 ckWARNreg(RExC_parse,
18274 "Ignoring zero length \\N{} in character class");
18276 else { /* cp_count > 1 */
18277 assert(cp_count > 1);
18278 if (! RExC_in_multi_char_class) {
18279 if ( ! allow_mutiple_chars
18282 || *RExC_parse == '-')
18286 vFAIL("\\N{} here is restricted to one character");
18288 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
18289 break; /* <value> contains the first code
18290 point. Drop out of the switch to
18294 SV * multi_char_N = newSVpvn(backslash_N_beg,
18295 RExC_parse - backslash_N_beg);
18297 = add_multi_match(multi_char_matches,
18302 } /* End of cp_count != 1 */
18304 /* This element should not be processed further in this
18307 value = save_value;
18308 prevvalue = save_prevvalue;
18309 continue; /* Back to top of loop to get next char */
18312 /* Here, is a single code point, and <value> contains it */
18313 unicode_range = TRUE; /* \N{} are Unicode */
18321 if (RExC_pm_flags & PMf_WILDCARD) {
18322 RExC_parse_inc_by(1);
18323 /* diag_listed_as: Use of %s is not allowed in Unicode
18324 property wildcard subpatterns in regex; marked by <--
18326 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
18327 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
18330 /* \p means they want Unicode semantics */
18331 REQUIRE_UNI_RULES(flagp, 0);
18333 if (RExC_parse >= RExC_end)
18334 vFAIL2("Empty \\%c", (U8)value);
18335 if (*RExC_parse == '{') {
18336 const U8 c = (U8)value;
18337 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
18339 RExC_parse_inc_by(1);
18340 vFAIL2("Missing right brace on \\%c{}", c);
18343 RExC_parse_inc_by(1);
18345 /* White space is allowed adjacent to the braces and after
18346 * any '^', even when not under /x */
18347 while (isSPACE(*RExC_parse)) {
18348 RExC_parse_inc_by(1);
18351 if (UCHARAT(RExC_parse) == '^') {
18353 /* toggle. (The rhs xor gets the single bit that
18354 * differs between P and p; the other xor inverts just
18356 value ^= 'P' ^ 'p';
18358 RExC_parse_inc_by(1);
18359 while (isSPACE(*RExC_parse)) {
18360 RExC_parse_inc_by(1);
18364 if (e == RExC_parse)
18365 vFAIL2("Empty \\%c{}", c);
18367 n = e - RExC_parse;
18368 while (isSPACE(*(RExC_parse + n - 1)))
18371 } /* The \p isn't immediately followed by a '{' */
18372 else if (! isALPHA(*RExC_parse)) {
18373 RExC_parse_inc_safe();
18374 vFAIL2("Character following \\%c must be '{' or a "
18375 "single-character Unicode property name",
18383 char* name = RExC_parse;
18385 /* Any message returned about expanding the definition */
18386 SV* msg = newSVpvs_flags("", SVs_TEMP);
18388 /* If set TRUE, the property is user-defined as opposed to
18389 * official Unicode */
18390 bool user_defined = FALSE;
18391 AV * strings = NULL;
18393 SV * prop_definition = parse_uniprop_string(
18394 name, n, UTF, FOLD,
18395 FALSE, /* This is compile-time */
18397 /* We can't defer this defn when
18398 * the full result is required in
18400 ! cBOOL(ret_invlist),
18407 if (SvCUR(msg)) { /* Assumes any error causes a msg */
18408 assert(prop_definition == NULL);
18409 RExC_parse_set(e + 1);
18410 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
18411 thing so, or else the display is
18415 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
18416 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
18417 SvCUR(msg), SvPVX(msg)));
18420 assert(prop_definition || strings);
18424 if (! prop_definition) {
18425 RExC_parse_set(e + 1);
18426 vFAIL("Unicode string properties are not implemented in (?[...])");
18430 "Using just the single character results"
18431 " returned by \\p{} in (?[...])");
18434 else if (! RExC_in_multi_char_class) {
18435 if (invert ^ (value == 'P')) {
18436 RExC_parse_set(e + 1);
18437 vFAIL("Inverting a character class which contains"
18438 " a multi-character sequence is illegal");
18441 /* For each multi-character string ... */
18442 while (av_count(strings) > 0) {
18443 /* ... Each entry is itself an array of code
18445 AV * this_string = (AV *) av_shift( strings);
18446 STRLEN cp_count = av_count(this_string);
18447 SV * final = newSV(cp_count * 4);
18450 /* Create another string of sequences of \x{...} */
18451 while (av_count(this_string) > 0) {
18452 SV * character = av_shift(this_string);
18453 UV cp = SvUV(character);
18456 REQUIRE_UTF8(flagp);
18458 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
18460 SvREFCNT_dec_NN(character);
18462 SvREFCNT_dec_NN(this_string);
18464 /* And add that to the list of such things */
18466 = add_multi_match(multi_char_matches,
18471 SvREFCNT_dec_NN(strings);
18474 if (! prop_definition) { /* If we got only a string,
18475 this iteration didn't really
18476 find a character */
18479 else if (! is_invlist(prop_definition)) {
18481 /* Here, the definition isn't known, so we have gotten
18482 * returned a string that will be evaluated if and when
18483 * encountered at runtime. We add it to the list of
18484 * such properties, along with whether it should be
18485 * complemented or not */
18486 if (value == 'P') {
18487 sv_catpvs(listsv, "!");
18490 sv_catpvs(listsv, "+");
18492 sv_catsv(listsv, prop_definition);
18494 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
18496 /* We don't know yet what this matches, so have to flag
18498 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
18501 assert (prop_definition && is_invlist(prop_definition));
18503 /* Here we do have the complete property definition
18505 * Temporary workaround for [perl #133136]. For this
18506 * precise input that is in the .t that is failing,
18507 * load utf8.pm, which is what the test wants, so that
18508 * that .t passes */
18509 if ( memEQs(RExC_start, e + 1 - RExC_start,
18511 && ! hv_common(GvHVn(PL_incgv),
18513 "utf8.pm", sizeof("utf8.pm") - 1,
18514 0, HV_FETCH_ISEXISTS, NULL, 0))
18516 require_pv("utf8.pm");
18519 if (! user_defined &&
18520 /* We warn on matching an above-Unicode code point
18521 * if the match would return true, except don't
18522 * warn for \p{All}, which has exactly one element
18524 (_invlist_contains_cp(prop_definition, 0x110000)
18525 && (! (_invlist_len(prop_definition) == 1
18526 && *invlist_array(prop_definition) == 0))))
18531 /* Invert if asking for the complement */
18532 if (value == 'P') {
18533 _invlist_union_complement_2nd(properties,
18538 _invlist_union(properties, prop_definition, &properties);
18543 RExC_parse_set(e + 1);
18544 namedclass = ANYOF_UNIPROP; /* no official name, but it's
18548 case 'n': value = '\n'; break;
18549 case 'r': value = '\r'; break;
18550 case 't': value = '\t'; break;
18551 case 'f': value = '\f'; break;
18552 case 'b': value = '\b'; break;
18553 case 'e': value = ESC_NATIVE; break;
18554 case 'a': value = '\a'; break;
18556 RExC_parse--; /* function expects to be pointed at the 'o' */
18557 if (! grok_bslash_o(&RExC_parse,
18563 cBOOL(range), /* MAX_UV allowed for range
18569 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18570 warn_non_literal_string(RExC_parse, packed_warn, message);
18574 non_portable_endpoint++;
18578 RExC_parse--; /* function expects to be pointed at the 'x' */
18579 if (! grok_bslash_x(&RExC_parse,
18585 cBOOL(range), /* MAX_UV allowed for range
18591 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18592 warn_non_literal_string(RExC_parse, packed_warn, message);
18596 non_portable_endpoint++;
18600 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
18603 /* going to die anyway; point to exact spot of
18605 RExC_parse_inc_safe();
18609 value = grok_c_char;
18610 RExC_parse_inc_by(1);
18611 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
18612 warn_non_literal_string(RExC_parse, packed_warn, message);
18615 non_portable_endpoint++;
18617 case '0': case '1': case '2': case '3': case '4':
18618 case '5': case '6': case '7':
18620 /* Take 1-3 octal digits */
18621 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
18622 | PERL_SCAN_NOTIFY_ILLDIGIT;
18623 numlen = (strict) ? 4 : 3;
18624 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
18625 RExC_parse_inc_by(numlen);
18628 RExC_parse_inc_safe();
18629 vFAIL("Need exactly 3 octal digits");
18631 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
18632 && RExC_parse < RExC_end
18633 && isDIGIT(*RExC_parse)
18634 && ckWARN(WARN_REGEXP))
18636 reg_warn_non_literal_string(
18638 form_alien_digit_msg(8, numlen, RExC_parse,
18639 RExC_end, UTF, FALSE));
18643 non_portable_endpoint++;
18648 /* Allow \_ to not give an error */
18649 if (isWORDCHAR(value) && value != '_') {
18651 vFAIL2("Unrecognized escape \\%c in character class",
18655 ckWARN2reg(RExC_parse,
18656 "Unrecognized escape \\%c in character class passed through",
18661 } /* End of switch on char following backslash */
18662 } /* end of handling backslash escape sequences */
18664 /* Here, we have the current token in 'value' */
18666 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18669 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
18670 * literal, as is the character that began the false range, i.e.
18671 * the 'a' in the examples */
18673 const int w = (RExC_parse >= rangebegin)
18674 ? RExC_parse - rangebegin
18678 "False [] range \"%" UTF8f "\"",
18679 UTF8fARG(UTF, w, rangebegin));
18682 ckWARN2reg(RExC_parse,
18683 "False [] range \"%" UTF8f "\"",
18684 UTF8fARG(UTF, w, rangebegin));
18685 cp_list = add_cp_to_invlist(cp_list, '-');
18686 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18690 range = 0; /* this was not a true range */
18691 element_count += 2; /* So counts for three values */
18694 classnum = namedclass_to_classnum(namedclass);
18696 if (LOC && namedclass < ANYOF_POSIXL_MAX
18697 #ifndef HAS_ISASCII
18698 && classnum != CC_ASCII_
18701 SV* scratch_list = NULL;
18703 /* What the Posix classes (like \w, [:space:]) match isn't
18704 * generally knowable under locale until actual match time. A
18705 * special node is used for these which has extra space for a
18706 * bitmap, with a bit reserved for each named class that is to
18707 * be matched against. (This isn't needed for \p{} and
18708 * pseudo-classes, as they are not affected by locale, and
18709 * hence are dealt with separately.) However, if a named class
18710 * and its complement are both present, then it matches
18711 * everything, and there is no runtime dependency. Odd numbers
18712 * are the complements of the next lower number, so xor works.
18713 * (Note that something like [\w\D] should match everything,
18714 * because \d should be a proper subset of \w. But rather than
18715 * trust that the locale is well behaved, we leave this to
18716 * runtime to sort out) */
18717 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18718 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18719 POSIXL_ZERO(posixl);
18720 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18721 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18722 continue; /* We could ignore the rest of the class, but
18723 best to parse it for any errors */
18725 else { /* Here, isn't the complement of any already parsed
18727 POSIXL_SET(posixl, namedclass);
18728 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18729 anyof_flags |= ANYOF_MATCHES_POSIXL;
18731 /* The above-Latin1 characters are not subject to locale
18732 * rules. Just add them to the unconditionally-matched
18735 /* Get the list of the above-Latin1 code points this
18737 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18738 PL_XPosix_ptrs[classnum],
18740 /* Odd numbers are complements,
18741 * like NDIGIT, NASCII, ... */
18742 namedclass % 2 != 0,
18744 /* Checking if 'cp_list' is NULL first saves an extra
18745 * clone. Its reference count will be decremented at the
18746 * next union, etc, or if this is the only instance, at the
18747 * end of the routine */
18749 cp_list = scratch_list;
18752 _invlist_union(cp_list, scratch_list, &cp_list);
18753 SvREFCNT_dec_NN(scratch_list);
18755 continue; /* Go get next character */
18760 /* Here, is not /l, or is a POSIX class for which /l doesn't
18761 * matter (or is a Unicode property, which is skipped here). */
18762 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18763 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18765 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18766 * nor /l make a difference in what these match,
18767 * therefore we just add what they match to cp_list. */
18768 if (classnum != CC_VERTSPACE_) {
18769 assert( namedclass == ANYOF_HORIZWS
18770 || namedclass == ANYOF_NHORIZWS);
18772 /* It turns out that \h is just a synonym for
18774 classnum = CC_BLANK_;
18777 _invlist_union_maybe_complement_2nd(
18779 PL_XPosix_ptrs[classnum],
18780 namedclass % 2 != 0, /* Complement if odd
18781 (NHORIZWS, NVERTWS)
18786 else if ( AT_LEAST_UNI_SEMANTICS
18787 || classnum == CC_ASCII_
18788 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
18789 || classnum == CC_XDIGIT_)))
18791 /* We usually have to worry about /d affecting what POSIX
18792 * classes match, with special code needed because we won't
18793 * know until runtime what all matches. But there is no
18794 * extra work needed under /u and /a; and [:ascii:] is
18795 * unaffected by /d; and :digit: and :xdigit: don't have
18796 * runtime differences under /d. So we can special case
18797 * these, and avoid some extra work below, and at runtime.
18799 _invlist_union_maybe_complement_2nd(
18801 ((AT_LEAST_ASCII_RESTRICTED)
18802 ? PL_Posix_ptrs[classnum]
18803 : PL_XPosix_ptrs[classnum]),
18804 namedclass % 2 != 0,
18807 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18808 complement and use nposixes */
18809 SV** posixes_ptr = namedclass % 2 == 0
18812 _invlist_union_maybe_complement_2nd(
18814 PL_XPosix_ptrs[classnum],
18815 namedclass % 2 != 0,
18819 } /* end of namedclass \blah */
18821 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18823 /* If 'range' is set, 'value' is the ending of a range--check its
18824 * validity. (If value isn't a single code point in the case of a
18825 * range, we should have figured that out above in the code that
18826 * catches false ranges). Later, we will handle each individual code
18827 * point in the range. If 'range' isn't set, this could be the
18828 * beginning of a range, so check for that by looking ahead to see if
18829 * the next real character to be processed is the range indicator--the
18834 /* For unicode ranges, we have to test that the Unicode as opposed
18835 * to the native values are not decreasing. (Above 255, there is
18836 * no difference between native and Unicode) */
18837 if (unicode_range && prevvalue < 255 && value < 255) {
18838 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18839 goto backwards_range;
18844 if (prevvalue > value) /* b-a */ {
18849 w = RExC_parse - rangebegin;
18851 "Invalid [] range \"%" UTF8f "\"",
18852 UTF8fARG(UTF, w, rangebegin));
18853 NOT_REACHED; /* NOTREACHED */
18857 prevvalue = value; /* save the beginning of the potential range */
18858 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18859 && *RExC_parse == '-')
18861 char* next_char_ptr = RExC_parse + 1;
18863 /* Get the next real char after the '-' */
18864 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18866 /* If the '-' is at the end of the class (just before the ']',
18867 * it is a literal minus; otherwise it is a range */
18868 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18869 RExC_parse_set(next_char_ptr);
18871 /* a bad range like \w-, [:word:]- ? */
18872 if (namedclass > OOB_NAMEDCLASS) {
18873 if (strict || ckWARN(WARN_REGEXP)) {
18874 const int w = RExC_parse >= rangebegin
18875 ? RExC_parse - rangebegin
18878 vFAIL4("False [] range \"%*.*s\"",
18883 "False [] range \"%*.*s\"",
18887 cp_list = add_cp_to_invlist(cp_list, '-');
18890 range = 1; /* yeah, it's a range! */
18891 continue; /* but do it the next time */
18896 if (namedclass > OOB_NAMEDCLASS) {
18900 /* Here, we have a single value this time through the loop, and
18901 * <prevvalue> is the beginning of the range, if any; or <value> if
18904 /* non-Latin1 code point implies unicode semantics. */
18906 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18907 || prevvalue > MAX_LEGAL_CP))
18909 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18911 REQUIRE_UNI_RULES(flagp, 0);
18912 if ( ! silence_non_portable
18913 && UNICODE_IS_PERL_EXTENDED(value)
18914 && TO_OUTPUT_WARNINGS(RExC_parse))
18916 ckWARN2_non_literal_string(RExC_parse,
18917 packWARN(WARN_PORTABLE),
18918 PL_extended_cp_format,
18923 /* Ready to process either the single value, or the completed range.
18924 * For single-valued non-inverted ranges, we consider the possibility
18925 * of multi-char folds. (We made a conscious decision to not do this
18926 * for the other cases because it can often lead to non-intuitive
18927 * results. For example, you have the peculiar case that:
18928 * "s s" =~ /^[^\xDF]+$/i => Y
18929 * "ss" =~ /^[^\xDF]+$/i => N
18931 * See [perl #89750] */
18932 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18933 if ( value == LATIN_SMALL_LETTER_SHARP_S
18934 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18937 /* Here <value> is indeed a multi-char fold. Get what it is */
18939 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18942 UV folded = _to_uni_fold_flags(
18946 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18947 ? FOLD_FLAGS_NOMIX_ASCII
18951 /* Here, <folded> should be the first character of the
18952 * multi-char fold of <value>, with <foldbuf> containing the
18953 * whole thing. But, if this fold is not allowed (because of
18954 * the flags), <fold> will be the same as <value>, and should
18955 * be processed like any other character, so skip the special
18957 if (folded != value) {
18959 /* Skip if we are recursed, currently parsing the class
18960 * again. Otherwise add this character to the list of
18961 * multi-char folds. */
18962 if (! RExC_in_multi_char_class) {
18963 STRLEN cp_count = utf8_length(foldbuf,
18964 foldbuf + foldlen);
18965 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
18967 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18970 = add_multi_match(multi_char_matches,
18976 /* This element should not be processed further in this
18979 value = save_value;
18980 prevvalue = save_prevvalue;
18986 if (strict && ckWARN(WARN_REGEXP)) {
18989 /* If the range starts above 255, everything is portable and
18990 * likely to be so for any forseeable character set, so don't
18992 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18993 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18995 else if (prevvalue != value) {
18997 /* Under strict, ranges that stop and/or end in an ASCII
18998 * printable should have each end point be a portable value
18999 * for it (preferably like 'A', but we don't warn if it is
19000 * a (portable) Unicode name or code point), and the range
19001 * must be all digits or all letters of the same case.
19002 * Otherwise, the range is non-portable and unclear as to
19003 * what it contains */
19004 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
19005 && ( non_portable_endpoint
19006 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
19007 || (isLOWER_A(prevvalue) && isLOWER_A(value))
19008 || (isUPPER_A(prevvalue) && isUPPER_A(value))
19010 vWARN(RExC_parse, "Ranges of ASCII printables should"
19011 " be some subset of \"0-9\","
19012 " \"A-Z\", or \"a-z\"");
19014 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
19015 SSize_t index_start;
19016 SSize_t index_final;
19018 /* But the nature of Unicode and languages mean we
19019 * can't do the same checks for above-ASCII ranges,
19020 * except in the case of digit ones. These should
19021 * contain only digits from the same group of 10. The
19022 * ASCII case is handled just above. Hence here, the
19023 * range could be a range of digits. First some
19024 * unlikely special cases. Grandfather in that a range
19025 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
19026 * if its starting value is one of the 10 digits prior
19027 * to it. This is because it is an alternate way of
19028 * writing 19D1, and some people may expect it to be in
19029 * that group. But it is bad, because it won't give
19030 * the expected results. In Unicode 5.2 it was
19031 * considered to be in that group (of 11, hence), but
19032 * this was fixed in the next version */
19034 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
19035 goto warn_bad_digit_range;
19037 else if (UNLIKELY( prevvalue >= 0x1D7CE
19038 && value <= 0x1D7FF))
19040 /* This is the only other case currently in Unicode
19041 * where the algorithm below fails. The code
19042 * points just above are the end points of a single
19043 * range containing only decimal digits. It is 5
19044 * different series of 0-9. All other ranges of
19045 * digits currently in Unicode are just a single
19046 * series. (And mktables will notify us if a later
19047 * Unicode version breaks this.)
19049 * If the range being checked is at most 9 long,
19050 * and the digit values represented are in
19051 * numerical order, they are from the same series.
19053 if ( value - prevvalue > 9
19054 || ((( value - 0x1D7CE) % 10)
19055 <= (prevvalue - 0x1D7CE) % 10))
19057 goto warn_bad_digit_range;
19062 /* For all other ranges of digits in Unicode, the
19063 * algorithm is just to check if both end points
19064 * are in the same series, which is the same range.
19066 index_start = _invlist_search(
19067 PL_XPosix_ptrs[CC_DIGIT_],
19070 /* Warn if the range starts and ends with a digit,
19071 * and they are not in the same group of 10. */
19072 if ( index_start >= 0
19073 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
19075 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
19076 value)) != index_start
19077 && index_final >= 0
19078 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
19080 warn_bad_digit_range:
19081 vWARN(RExC_parse, "Ranges of digits should be"
19082 " from the same group of"
19089 if ((! range || prevvalue == value) && non_portable_endpoint) {
19090 if (isPRINT_A(value)) {
19093 if (isBACKSLASHED_PUNCT(value)) {
19094 literal[d++] = '\\';
19096 literal[d++] = (char) value;
19097 literal[d++] = '\0';
19100 "\"%.*s\" is more clearly written simply as \"%s\"",
19101 (int) (RExC_parse - rangebegin),
19106 else if (isMNEMONIC_CNTRL(value)) {
19108 "\"%.*s\" is more clearly written simply as \"%s\"",
19109 (int) (RExC_parse - rangebegin),
19111 cntrl_to_mnemonic((U8) value)
19117 /* Deal with this element of the class */
19120 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19123 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
19124 * that don't require special handling, we can just add the range like
19125 * we do for ASCII platforms */
19126 if ((UNLIKELY(prevvalue == 0) && value >= 255)
19127 || ! (prevvalue < 256
19129 || (! non_portable_endpoint
19130 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
19131 || (isUPPER_A(prevvalue)
19132 && isUPPER_A(value)))))))
19134 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19138 /* Here, requires special handling. This can be because it is a
19139 * range whose code points are considered to be Unicode, and so
19140 * must be individually translated into native, or because its a
19141 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
19142 * EBCDIC, but we have defined them to include only the "expected"
19143 * upper or lower case ASCII alphabetics. Subranges above 255 are
19144 * the same in native and Unicode, so can be added as a range */
19145 U8 start = NATIVE_TO_LATIN1(prevvalue);
19147 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
19148 for (j = start; j <= end; j++) {
19149 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
19152 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
19158 range = 0; /* this range (if it was one) is done now */
19159 } /* End of loop through all the text within the brackets */
19161 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
19162 output_posix_warnings(pRExC_state, posix_warnings);
19165 /* If anything in the class expands to more than one character, we have to
19166 * deal with them by building up a substitute parse string, and recursively
19167 * calling reg() on it, instead of proceeding */
19168 if (multi_char_matches) {
19169 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
19172 char *save_end = RExC_end;
19173 char *save_parse = RExC_parse;
19174 char *save_start = RExC_start;
19175 Size_t constructed_prefix_len = 0; /* This gives the length of the
19176 constructed portion of the
19177 substitute parse. */
19178 bool first_time = TRUE; /* First multi-char occurrence doesn't get
19183 /* Only one level of recursion allowed */
19184 assert(RExC_copy_start_in_constructed == RExC_precomp);
19186 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
19187 because too confusing */
19189 sv_catpvs(substitute_parse, "(?:");
19193 /* Look at the longest strings first */
19194 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
19199 if (av_exists(multi_char_matches, cp_count)) {
19200 AV** this_array_ptr;
19203 this_array_ptr = (AV**) av_fetch(multi_char_matches,
19205 while ((this_sequence = av_pop(*this_array_ptr)) !=
19208 if (! first_time) {
19209 sv_catpvs(substitute_parse, "|");
19211 first_time = FALSE;
19213 sv_catpv(substitute_parse, SvPVX(this_sequence));
19218 /* If the character class contains anything else besides these
19219 * multi-character strings, have to include it in recursive parsing */
19220 if (element_count) {
19221 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
19223 sv_catpvs(substitute_parse, "|");
19224 if (has_l_bracket) { /* Add an [ if the original had one */
19225 sv_catpvs(substitute_parse, "[");
19227 constructed_prefix_len = SvCUR(substitute_parse);
19228 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
19230 /* Put in a closing ']' to match any opening one, but not if going
19231 * off the end, as otherwise we are adding something that really
19233 if (has_l_bracket && RExC_parse < RExC_end) {
19234 sv_catpvs(substitute_parse, "]");
19238 sv_catpvs(substitute_parse, ")");
19241 /* This is a way to get the parse to skip forward a whole named
19242 * sequence instead of matching the 2nd character when it fails the
19244 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
19248 /* Set up the data structure so that any errors will be properly
19249 * reported. See the comments at the definition of
19250 * REPORT_LOCATION_ARGS for details */
19251 RExC_copy_start_in_input = (char *) orig_parse;
19252 RExC_start = SvPV(substitute_parse, len);
19253 RExC_parse_set( RExC_start );
19254 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
19255 RExC_end = RExC_parse + len;
19256 RExC_in_multi_char_class = 1;
19258 ret = reg(pRExC_state, 1, ®_flags, depth+1);
19260 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
19262 /* And restore so can parse the rest of the pattern */
19263 RExC_parse_set(save_parse);
19264 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
19265 RExC_end = save_end;
19266 RExC_in_multi_char_class = 0;
19267 SvREFCNT_dec_NN(multi_char_matches);
19268 SvREFCNT_dec(properties);
19269 SvREFCNT_dec(cp_list);
19270 SvREFCNT_dec(simple_posixes);
19271 SvREFCNT_dec(posixes);
19272 SvREFCNT_dec(nposixes);
19273 SvREFCNT_dec(cp_foldable_list);
19277 /* If folding, we calculate all characters that could fold to or from the
19278 * ones already on the list */
19279 if (cp_foldable_list) {
19281 UV start, end; /* End points of code point ranges */
19283 SV* fold_intersection = NULL;
19286 /* Our calculated list will be for Unicode rules. For locale
19287 * matching, we have to keep a separate list that is consulted at
19288 * runtime only when the locale indicates Unicode rules (and we
19289 * don't include potential matches in the ASCII/Latin1 range, as
19290 * any code point could fold to any other, based on the run-time
19291 * locale). For non-locale, we just use the general list */
19293 use_list = &only_utf8_locale_list;
19296 use_list = &cp_list;
19299 /* Only the characters in this class that participate in folds need
19300 * be checked. Get the intersection of this class and all the
19301 * possible characters that are foldable. This can quickly narrow
19302 * down a large class */
19303 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
19304 &fold_intersection);
19306 /* Now look at the foldable characters in this class individually */
19307 invlist_iterinit(fold_intersection);
19308 while (invlist_iternext(fold_intersection, &start, &end)) {
19312 /* Look at every character in the range */
19313 for (j = start; j <= end; j++) {
19314 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
19317 Size_t folds_count;
19319 const U32 * remaining_folds;
19323 /* Under /l, we don't know what code points below 256
19324 * fold to, except we do know the MICRO SIGN folds to
19325 * an above-255 character if the locale is UTF-8, so we
19326 * add it to the special list (in *use_list) Otherwise
19327 * we know now what things can match, though some folds
19328 * are valid under /d only if the target is UTF-8.
19329 * Those go in a separate list */
19330 if ( IS_IN_SOME_FOLD_L1(j)
19331 && ! (LOC && j != MICRO_SIGN))
19334 /* ASCII is always matched; non-ASCII is matched
19335 * only under Unicode rules (which could happen
19336 * under /l if the locale is a UTF-8 one */
19337 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
19338 *use_list = add_cp_to_invlist(*use_list,
19339 PL_fold_latin1[j]);
19341 else if (j != PL_fold_latin1[j]) {
19342 upper_latin1_only_utf8_matches
19343 = add_cp_to_invlist(
19344 upper_latin1_only_utf8_matches,
19345 PL_fold_latin1[j]);
19349 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
19350 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
19352 add_above_Latin1_folds(pRExC_state,
19359 /* Here is an above Latin1 character. We don't have the
19360 * rules hard-coded for it. First, get its fold. This is
19361 * the simple fold, as the multi-character folds have been
19362 * handled earlier and separated out */
19363 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
19364 (ASCII_FOLD_RESTRICTED)
19365 ? FOLD_FLAGS_NOMIX_ASCII
19368 /* Single character fold of above Latin1. Add everything
19369 * in its fold closure to the list that this node should
19371 folds_count = _inverse_folds(folded, &first_fold,
19373 for (k = 0; k <= folds_count; k++) {
19374 UV c = (k == 0) /* First time through use itself */
19376 : (k == 1) /* 2nd time use, the first fold */
19379 /* Then the remaining ones */
19380 : remaining_folds[k-2];
19382 /* /aa doesn't allow folds between ASCII and non- */
19383 if (( ASCII_FOLD_RESTRICTED
19384 && (isASCII(c) != isASCII(j))))
19389 /* Folds under /l which cross the 255/256 boundary are
19390 * added to a separate list. (These are valid only
19391 * when the locale is UTF-8.) */
19392 if (c < 256 && LOC) {
19393 *use_list = add_cp_to_invlist(*use_list, c);
19397 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
19399 cp_list = add_cp_to_invlist(cp_list, c);
19402 /* Similarly folds involving non-ascii Latin1
19403 * characters under /d are added to their list */
19404 upper_latin1_only_utf8_matches
19405 = add_cp_to_invlist(
19406 upper_latin1_only_utf8_matches,
19412 SvREFCNT_dec_NN(fold_intersection);
19415 /* Now that we have finished adding all the folds, there is no reason
19416 * to keep the foldable list separate */
19417 _invlist_union(cp_list, cp_foldable_list, &cp_list);
19418 SvREFCNT_dec_NN(cp_foldable_list);
19421 /* And combine the result (if any) with any inversion lists from posix
19422 * classes. The lists are kept separate up to now because we don't want to
19423 * fold the classes */
19424 if (simple_posixes) { /* These are the classes known to be unaffected by
19427 _invlist_union(cp_list, simple_posixes, &cp_list);
19428 SvREFCNT_dec_NN(simple_posixes);
19431 cp_list = simple_posixes;
19434 if (posixes || nposixes) {
19435 if (! DEPENDS_SEMANTICS) {
19437 /* For everything but /d, we can just add the current 'posixes' and
19438 * 'nposixes' to the main list */
19441 _invlist_union(cp_list, posixes, &cp_list);
19442 SvREFCNT_dec_NN(posixes);
19450 _invlist_union(cp_list, nposixes, &cp_list);
19451 SvREFCNT_dec_NN(nposixes);
19454 cp_list = nposixes;
19459 /* Under /d, things like \w match upper Latin1 characters only if
19460 * the target string is in UTF-8. But things like \W match all the
19461 * upper Latin1 characters if the target string is not in UTF-8.
19463 * Handle the case with something like \W separately */
19465 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
19467 /* A complemented posix class matches all upper Latin1
19468 * characters if not in UTF-8. And it matches just certain
19469 * ones when in UTF-8. That means those certain ones are
19470 * matched regardless, so can just be added to the
19471 * unconditional list */
19473 _invlist_union(cp_list, nposixes, &cp_list);
19474 SvREFCNT_dec_NN(nposixes);
19478 cp_list = nposixes;
19481 /* Likewise for 'posixes' */
19482 _invlist_union(posixes, cp_list, &cp_list);
19483 SvREFCNT_dec(posixes);
19485 /* Likewise for anything else in the range that matched only
19487 if (upper_latin1_only_utf8_matches) {
19488 _invlist_union(cp_list,
19489 upper_latin1_only_utf8_matches,
19491 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19492 upper_latin1_only_utf8_matches = NULL;
19495 /* If we don't match all the upper Latin1 characters regardless
19496 * of UTF-8ness, we have to set a flag to match the rest when
19498 _invlist_subtract(only_non_utf8_list, cp_list,
19499 &only_non_utf8_list);
19500 if (_invlist_len(only_non_utf8_list) != 0) {
19501 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
19503 SvREFCNT_dec_NN(only_non_utf8_list);
19506 /* Here there were no complemented posix classes. That means
19507 * the upper Latin1 characters in 'posixes' match only when the
19508 * target string is in UTF-8. So we have to add them to the
19509 * list of those types of code points, while adding the
19510 * remainder to the unconditional list.
19512 * First calculate what they are */
19513 SV* nonascii_but_latin1_properties = NULL;
19514 _invlist_intersection(posixes, PL_UpperLatin1,
19515 &nonascii_but_latin1_properties);
19517 /* And add them to the final list of such characters. */
19518 _invlist_union(upper_latin1_only_utf8_matches,
19519 nonascii_but_latin1_properties,
19520 &upper_latin1_only_utf8_matches);
19522 /* Remove them from what now becomes the unconditional list */
19523 _invlist_subtract(posixes, nonascii_but_latin1_properties,
19526 /* And add those unconditional ones to the final list */
19528 _invlist_union(cp_list, posixes, &cp_list);
19529 SvREFCNT_dec_NN(posixes);
19536 SvREFCNT_dec(nonascii_but_latin1_properties);
19538 /* Get rid of any characters from the conditional list that we
19539 * now know are matched unconditionally, which may make that
19541 _invlist_subtract(upper_latin1_only_utf8_matches,
19543 &upper_latin1_only_utf8_matches);
19544 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
19545 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19546 upper_latin1_only_utf8_matches = NULL;
19552 /* And combine the result (if any) with any inversion list from properties.
19553 * The lists are kept separate up to now so that we can distinguish the two
19554 * in regards to matching above-Unicode. A run-time warning is generated
19555 * if a Unicode property is matched against a non-Unicode code point. But,
19556 * we allow user-defined properties to match anything, without any warning,
19557 * and we also suppress the warning if there is a portion of the character
19558 * class that isn't a Unicode property, and which matches above Unicode, \W
19559 * or [\x{110000}] for example.
19560 * (Note that in this case, unlike the Posix one above, there is no
19561 * <upper_latin1_only_utf8_matches>, because having a Unicode property
19562 * forces Unicode semantics */
19566 /* If it matters to the final outcome, see if a non-property
19567 * component of the class matches above Unicode. If so, the
19568 * warning gets suppressed. This is true even if just a single
19569 * such code point is specified, as, though not strictly correct if
19570 * another such code point is matched against, the fact that they
19571 * are using above-Unicode code points indicates they should know
19572 * the issues involved */
19574 warn_super = ! (invert
19575 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
19578 _invlist_union(properties, cp_list, &cp_list);
19579 SvREFCNT_dec_NN(properties);
19582 cp_list = properties;
19586 anyof_flags |= ANYOF_WARN_SUPER__shared;
19588 /* Because an ANYOF node is the only one that warns, this node
19589 * can't be optimized into something else */
19590 optimizable = FALSE;
19594 /* Here, we have calculated what code points should be in the character
19597 * Now we can see about various optimizations. Fold calculation (which we
19598 * did above) needs to take place before inversion. Otherwise /[^k]/i
19599 * would invert to include K, which under /i would match k, which it
19600 * shouldn't. Therefore we can't invert folded locale now, as it won't be
19601 * folded until runtime */
19603 /* If we didn't do folding, it's because some information isn't available
19604 * until runtime; set the run-time fold flag for these We know to set the
19605 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
19606 * at least one 0-255 range code point */
19609 /* Some things on the list might be unconditionally included because of
19610 * other components. Remove them, and clean up the list if it goes to
19612 if (only_utf8_locale_list && cp_list) {
19613 _invlist_subtract(only_utf8_locale_list, cp_list,
19614 &only_utf8_locale_list);
19616 if (_invlist_len(only_utf8_locale_list) == 0) {
19617 SvREFCNT_dec_NN(only_utf8_locale_list);
19618 only_utf8_locale_list = NULL;
19621 if ( only_utf8_locale_list
19623 && ( _invlist_contains_cp(cp_list,
19624 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
19625 || _invlist_contains_cp(cp_list,
19626 LATIN_SMALL_LETTER_DOTLESS_I))))
19628 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19629 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19631 else if (cp_list && invlist_lowest(cp_list) < 256) {
19632 /* If nothing is below 256, has no locale dependency; otherwise it
19634 anyof_flags |= ANYOFL_FOLD;
19635 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
19637 /* In a Turkish locale these could match, notify the run-time code
19638 * to check for that */
19639 if ( _invlist_contains_cp(cp_list, 'I')
19640 || _invlist_contains_cp(cp_list, 'i'))
19642 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19646 else if ( DEPENDS_SEMANTICS
19647 && ( upper_latin1_only_utf8_matches
19649 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
19651 RExC_seen_d_op = TRUE;
19652 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
19655 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19659 && ! has_runtime_dependency)
19661 _invlist_invert(cp_list);
19663 /* Clear the invert flag since have just done it here */
19667 /* All possible optimizations below still have these characteristics.
19668 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19670 *flagp |= HASWIDTH|SIMPLE;
19673 *ret_invlist = cp_list;
19675 return (cp_list) ? RExC_emit : 0;
19678 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19679 RExC_contains_locale = 1;
19684 /* Some character classes are equivalent to other nodes. Such nodes
19685 * take up less room, and some nodes require fewer operations to
19686 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
19687 * adjacent nodes to improve efficiency. */
19688 op = optimize_regclass(pRExC_state, cp_list,
19689 only_utf8_locale_list,
19690 upper_latin1_only_utf8_matches,
19691 has_runtime_dependency,
19693 &anyof_flags, &invert, &ret, flagp);
19694 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
19696 /* If optimized to something else and emitted, clean up and return */
19698 SvREFCNT_dec(cp_list);;
19699 SvREFCNT_dec(only_utf8_locale_list);
19700 SvREFCNT_dec(upper_latin1_only_utf8_matches);
19704 /* If no optimization was found, an END was returned and we will now
19711 /* Here are going to emit an ANYOF; set the particular type */
19713 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
19724 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
19725 FILL_NODE(ret, op); /* We set the argument later */
19726 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
19727 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19729 /* Here, <cp_list> contains all the code points we can determine at
19730 * compile time that match under all conditions. Go through it, and
19731 * for things that belong in the bitmap, put them there, and delete from
19732 * <cp_list>. While we are at it, see if everything above 255 is in the
19733 * list, and if so, set a flag to speed up execution */
19735 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
19738 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19742 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19745 /* Here, the bitmap has been populated with all the Latin1 code points that
19746 * always match. Can now add to the overall list those that match only
19747 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19749 if (upper_latin1_only_utf8_matches) {
19751 _invlist_union(cp_list,
19752 upper_latin1_only_utf8_matches,
19754 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19757 cp_list = upper_latin1_only_utf8_matches;
19759 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
19762 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19763 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19766 only_utf8_locale_list);
19768 SvREFCNT_dec(cp_list);;
19769 SvREFCNT_dec(only_utf8_locale_list);
19774 S_optimize_regclass(pTHX_
19775 RExC_state_t *pRExC_state,
19777 SV* only_utf8_locale_list,
19778 SV* upper_latin1_only_utf8_matches,
19779 const U32 has_runtime_dependency,
19783 regnode_offset * ret,
19787 /* This function exists just to make S_regclass() smaller. It extracts out
19788 * the code that looks for potential optimizations away from a full generic
19789 * ANYOF node. The parameter names are the same as the corresponding
19790 * variables in S_regclass.
19792 * It returns the new op (the impossible END one if no optimization found)
19793 * and sets *ret to any created regnode. If the new op is sufficiently
19794 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
19796 * Certain of the parameters may be updated as a result of the changes
19799 U8 op = END; /* The returned node-type, initialized to an impossible
19802 PERL_UINT_FAST8_T i;
19803 UV partial_cp_count = 0;
19804 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19805 UV end[MAX_FOLD_FROMS+1] = { 0 };
19806 bool single_range = FALSE;
19807 UV lowest_cp = 0, highest_cp = 0;
19809 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
19811 if (cp_list) { /* Count the code points in enough ranges that we would see
19812 all the ones possible in any fold in this version of
19815 invlist_iterinit(cp_list);
19816 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19817 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19820 partial_cp_count += end[i] - start[i] + 1;
19824 single_range = TRUE;
19826 invlist_iterfinish(cp_list);
19828 /* If we know at compile time that this matches every possible code
19829 * point, any run-time dependencies don't matter */
19830 if (start[0] == 0 && end[0] == UV_MAX) {
19832 goto return_OPFAIL;
19839 /* Use a clearer mnemonic for below */
19840 lowest_cp = start[0];
19842 highest_cp = invlist_highest(cp_list);
19845 /* Similarly, for /l posix classes, if both a class and its complement
19846 * match, any run-time dependencies don't matter */
19849 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
19850 if ( POSIXL_TEST(posixl, namedclass) /* class */
19851 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19854 goto return_OPFAIL;
19863 /* For well-behaved locales, some classes are subsets of others, so
19864 * complementing the subset and including the non-complemented superset
19865 * should match everything, like [\D[:alnum:]], and
19866 * [[:^alpha:][:alnum:]], but some implementations of locales are
19867 * buggy, and khw thinks its a bad idea to have optimization change
19868 * behavior, even if it avoids an OS bug in a given case */
19870 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19872 /* If is a single posix /l class, can optimize to just that op. Such a
19873 * node will not match anything in the Latin1 range, as that is not
19874 * determinable until runtime, but will match whatever the class does
19875 * outside that range. (Note that some classes won't match anything
19876 * outside the range, like [:ascii:]) */
19877 if ( isSINGLE_BIT_SET(posixl)
19878 && (partial_cp_count == 0 || lowest_cp > 255))
19881 SV * class_above_latin1 = NULL;
19882 bool already_inverted;
19883 bool are_equivalent;
19886 namedclass = single_1bit_pos32(posixl);
19887 classnum = namedclass_to_classnum(namedclass);
19889 /* The named classes are such that the inverted number is one
19890 * larger than the non-inverted one */
19891 already_inverted = namedclass - classnum_to_namedclass(classnum);
19893 /* Create an inversion list of the official property, inverted if
19894 * the constructed node list is inverted, and restricted to only
19895 * the above latin1 code points, which are the only ones known at
19897 _invlist_intersection_maybe_complement_2nd(
19899 PL_XPosix_ptrs[classnum],
19901 &class_above_latin1);
19902 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
19903 SvREFCNT_dec_NN(class_above_latin1);
19905 if (are_equivalent) {
19907 /* Resolve the run-time inversion flag with this possibly
19908 * inverted class */
19909 *invert = *invert ^ already_inverted;
19911 op = POSIXL + *invert * (NPOSIXL - POSIXL);
19912 *ret = reg_node(pRExC_state, op);
19913 FLAGS(REGNODE_p(*ret)) = classnum;
19919 /* khw can't think of any other possible transformation involving these. */
19920 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19924 if (! has_runtime_dependency) {
19926 /* If the list is empty, nothing matches. This happens, for example,
19927 * when a Unicode property that doesn't match anything is the only
19928 * element in the character class (perluniprops.pod notes such
19930 if (partial_cp_count == 0) {
19935 goto return_OPFAIL;
19939 /* If matches everything but \n */
19940 if ( start[0] == 0 && end[0] == '\n' - 1
19941 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19943 assert (! *invert);
19945 *ret = reg_node(pRExC_state, op);
19951 /* Next see if can optimize classes that contain just a few code points
19952 * into an EXACTish node. The reason to do this is to let the optimizer
19953 * join this node with adjacent EXACTish ones, and ANYOF nodes require
19954 * runtime conversion to code point from UTF-8, which we'd like to avoid.
19956 * An EXACTFish node can be generated even if not under /i, and vice versa.
19957 * But care must be taken. An EXACTFish node has to be such that it only
19958 * matches precisely the code points in the class, but we want to generate
19959 * the least restrictive one that does that, to increase the odds of being
19960 * able to join with an adjacent node. For example, if the class contains
19961 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
19962 * from matching. Whether we are under /i or not is irrelevant in this
19963 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
19964 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
19965 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
19966 * is no simple fold that includes \X{02BC}, there is a multi-char fold
19967 * that does, and so the node generated for it must be an EXACTFish one.
19968 * On the other hand qr/:/i should generate a plain EXACT node since the
19969 * colon participates in no fold whatsoever, and having it be EXACT tells
19970 * the optimizer the target string cannot match unless it has a colon in
19975 /* Only try if there are no more code points in the class than in
19976 * the max possible fold */
19977 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19979 /* We can always make a single code point class into an EXACTish node.
19981 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
19984 /* Here is /l: Use EXACTL, except if there is a fold not known
19985 * until runtime so shows as only a single code point here.
19986 * For code points above 255, we know which can cause problems
19987 * by having a potential fold to the Latin1 range. */
19989 || ( lowest_cp > 255
19990 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
19998 else if (! FOLD) { /* Not /l and not /i */
19999 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
20001 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
20004 /* Under /i, it gets a little tricky. A code point that
20005 * doesn't participate in a fold should be an EXACT node. We
20006 * know this one isn't the result of a simple fold, or there'd
20007 * be more than one code point in the list, but it could be
20008 * part of a multi-character fold. In that case we better not
20009 * create an EXACT node, as we would wrongly be telling the
20010 * optimizer that this code point must be in the target string,
20011 * and that is wrong. This is because if the sequence around
20012 * this code point forms a multi-char fold, what needs to be in
20013 * the string could be the code point that folds to the
20016 * This handles the case of below-255 code points, as we have
20017 * an easy look up for those. The next clause handles the
20019 op = IS_IN_SOME_FOLD_L1(lowest_cp)
20023 else { /* /i, larger code point. Since we are under /i, and have
20024 just this code point, we know that it can't fold to
20025 something else, so PL_InMultiCharFold applies to it */
20026 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
20033 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
20034 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
20036 /* Here, the only runtime dependency, if any, is from /d, and the
20037 * class matches more than one code point, and the lowest code
20038 * point participates in some fold. It might be that the other
20039 * code points are /i equivalent to this one, and hence they would
20040 * be representable by an EXACTFish node. Above, we eliminated
20041 * classes that contain too many code points to be EXACTFish, with
20042 * the test for MAX_FOLD_FROMS
20044 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
20045 * do this because we have EXACTFAA at our disposal for the ASCII
20047 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
20049 /* The only ASCII characters that participate in folds are
20051 assert(isALPHA(lowest_cp));
20052 if ( end[0] == start[0] /* First range is a single
20053 character, so 2nd exists */
20054 && isALPHA_FOLD_EQ(start[0], start[1]))
20056 /* Here, is part of an ASCII fold pair */
20058 if ( ASCII_FOLD_RESTRICTED
20059 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
20061 /* If the second clause just above was true, it means
20062 * we can't be under /i, or else the list would have
20063 * included more than this fold pair. Therefore we
20064 * have to exclude the possibility of whatever else it
20065 * is that folds to these, by using EXACTFAA */
20068 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
20070 /* Here, there's no simple fold that lowest_cp is part
20071 * of, but there is a multi-character one. If we are
20072 * not under /i, we want to exclude that possibility;
20073 * if under /i, we want to include it */
20074 op = (FOLD) ? EXACTFU : EXACTFAA;
20078 /* Here, the only possible fold lowest_cp particpates in
20079 * is with start[1]. /i or not isn't relevant */
20083 value = toFOLD(lowest_cp);
20086 else if ( ! upper_latin1_only_utf8_matches
20087 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
20089 invlist_highest(upper_latin1_only_utf8_matches)]
20092 /* Here, the smallest character is non-ascii or there are more
20093 * than 2 code points matched by this node. Also, we either
20094 * don't have /d UTF-8 dependent matches, or if we do, they
20095 * look like they could be a single character that is the fold
20096 * of the lowest one is in the always-match list. This test
20097 * quickly excludes most of the false positives when there are
20098 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
20099 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
20100 * iff the target string is UTF-8. (We don't have to worry
20101 * above about exceeding the array bounds of PL_fold_latin1[]
20102 * because any code point in 'upper_latin1_only_utf8_matches'
20105 * EXACTFAA would apply only to pairs (hence exactly 2 code
20106 * points) in the ASCII range, so we can't use it here to
20107 * artificially restrict the fold domain, so we check if the
20108 * class does or does not match some EXACTFish node. Further,
20109 * if we aren't under /i, and and the folded-to character is
20110 * part of a multi-character fold, we can't do this
20111 * optimization, as the sequence around it could be that
20112 * multi-character fold, and we don't here know the context, so
20113 * we have to assume it is that multi-char fold, to prevent
20116 * To do the general case, we first find the fold of the lowest
20117 * code point (which may be higher than that lowest unfolded
20118 * one), then find everything that folds to it. (The data
20119 * structure we have only maps from the folded code points, so
20120 * we have to do the earlier step.) */
20123 U8 foldbuf[UTF8_MAXBYTES_CASE];
20124 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
20126 const U32 * remaining_folds;
20127 Size_t folds_to_this_cp_count = _inverse_folds(
20131 Size_t folds_count = folds_to_this_cp_count + 1;
20132 SV * fold_list = _new_invlist(folds_count);
20135 /* If there are UTF-8 dependent matches, create a temporary
20136 * list of what this node matches, including them. */
20137 SV * all_cp_list = NULL;
20138 SV ** use_this_list = &cp_list;
20140 if (upper_latin1_only_utf8_matches) {
20141 all_cp_list = _new_invlist(0);
20142 use_this_list = &all_cp_list;
20143 _invlist_union(cp_list,
20144 upper_latin1_only_utf8_matches,
20148 /* Having gotten everything that participates in the fold
20149 * containing the lowest code point, we turn that into an
20150 * inversion list, making sure everything is included. */
20151 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
20152 fold_list = add_cp_to_invlist(fold_list, folded);
20153 if (folds_to_this_cp_count > 0) {
20154 fold_list = add_cp_to_invlist(fold_list, first_fold);
20155 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
20156 fold_list = add_cp_to_invlist(fold_list,
20157 remaining_folds[i]);
20161 /* If the fold list is identical to what's in this ANYOF node,
20162 * the node can be represented by an EXACTFish one instead */
20163 if (_invlistEQ(*use_this_list, fold_list,
20164 0 /* Don't complement */ )
20167 /* But, we have to be careful, as mentioned above. Just
20168 * the right sequence of characters could match this if it
20169 * is part of a multi-character fold. That IS what we want
20170 * if we are under /i. But it ISN'T what we want if not
20171 * under /i, as it could match when it shouldn't. So, when
20172 * we aren't under /i and this character participates in a
20173 * multi-char fold, we don't optimize into an EXACTFish
20174 * node. So, for each case below we have to check if we
20175 * are folding, and if not, if it is not part of a
20176 * multi-char fold. */
20177 if (lowest_cp > 255) { /* Highish code point */
20178 if (FOLD || ! _invlist_contains_cp(
20179 PL_InMultiCharFold, folded))
20183 : (ASCII_FOLD_RESTRICTED)
20188 } /* Below, the lowest code point < 256 */
20191 && DEPENDS_SEMANTICS)
20192 { /* An EXACTF node containing a single character 's',
20193 can be an EXACTFU if it doesn't get joined with an
20195 op = EXACTFU_S_EDGE;
20199 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
20201 if (upper_latin1_only_utf8_matches) {
20204 /* We can't use the fold, as that only matches
20208 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
20210 { /* EXACTFUP is a special node for this character */
20211 op = (ASCII_FOLD_RESTRICTED)
20214 value = MICRO_SIGN;
20216 else if ( ASCII_FOLD_RESTRICTED
20217 && ! isASCII(lowest_cp))
20218 { /* For ASCII under /iaa, we can use EXACTFU below
20230 SvREFCNT_dec_NN(fold_list);
20231 SvREFCNT_dec(all_cp_list);
20238 /* Here, we have calculated what EXACTish node to use. Have to
20239 * convert to UTF-8 if not already there */
20242 SvREFCNT_dec(cp_list);;
20243 REQUIRE_UTF8(flagp);
20246 /* This is a kludge to the special casing issues with this
20247 * ligature under /aa. FB05 should fold to FB06, but the call
20248 * above to _to_uni_fold_flags() didn't find this, as it didn't
20249 * use the /aa restriction in order to not miss other folds
20250 * that would be affected. This is the only instance likely to
20251 * ever be a problem in all of Unicode. So special case it. */
20252 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
20253 && ASCII_FOLD_RESTRICTED)
20255 value = LATIN_SMALL_LIGATURE_ST;
20259 len = (UTF) ? UVCHR_SKIP(value) : 1;
20261 *ret = REGNODE_GUTS(pRExC_state, op, len);
20262 FILL_NODE(*ret, op);
20263 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
20264 setSTR_LEN(REGNODE_p(*ret), len);
20266 *STRINGs(REGNODE_p(*ret)) = (U8) value;
20269 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
20276 if (! has_runtime_dependency) {
20278 /* See if this can be turned into an ANYOFM node. Think about the bit
20279 * patterns in two different bytes. In some positions, the bits in
20280 * each will be 1; and in other positions both will be 0; and in some
20281 * positions the bit will be 1 in one byte, and 0 in the other. Let
20282 * 'n' be the number of positions where the bits differ. We create a
20283 * mask which has exactly 'n' 0 bits, each in a position where the two
20284 * bytes differ. Now take the set of all bytes that when ANDed with
20285 * the mask yield the same result. That set has 2**n elements, and is
20286 * representable by just two 8 bit numbers: the result and the mask.
20287 * Importantly, matching the set can be vectorized by creating a word
20288 * full of the result bytes, and a word full of the mask bytes,
20289 * yielding a significant speed up. Here, see if this node matches
20290 * such a set. As a concrete example consider [01], and the byte
20291 * representing '0' which is 0x30 on ASCII machines. It has the bits
20292 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
20293 * that mask we get 0x30. Any other bytes ANDed yield something else.
20294 * So [01], which is a common usage, is optimizable into ANYOFM, and
20295 * can benefit from the speed up. We can only do this on UTF-8
20296 * invariant bytes, because they have the same bit patterns under UTF-8
20298 PERL_UINT_FAST8_T inverted = 0;
20300 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
20302 const PERL_UINT_FAST8_T max_permissible
20303 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
20305 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
20306 * that works we will instead later generate an NANYOFM, and invert
20307 * back when through */
20308 if (highest_cp > max_permissible) {
20309 _invlist_invert(cp_list);
20313 if (invlist_highest(cp_list) <= max_permissible) {
20314 UV this_start, this_end;
20315 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
20316 U8 bits_differing = 0;
20317 Size_t full_cp_count = 0;
20318 bool first_time = TRUE;
20320 /* Go through the bytes and find the bit positions that differ */
20321 invlist_iterinit(cp_list);
20322 while (invlist_iternext(cp_list, &this_start, &this_end)) {
20323 unsigned int i = this_start;
20326 if (! UVCHR_IS_INVARIANT(i)) {
20330 first_time = FALSE;
20331 lowest_cp = this_start;
20333 /* We have set up the code point to compare with. Don't
20334 * compare it with itself */
20338 /* Find the bit positions that differ from the lowest code
20339 * point in the node. Keep track of all such positions by
20341 for (; i <= this_end; i++) {
20342 if (! UVCHR_IS_INVARIANT(i)) {
20346 bits_differing |= i ^ lowest_cp;
20349 full_cp_count += this_end - this_start + 1;
20352 /* At the end of the loop, we count how many bits differ from the
20353 * bits in lowest code point, call the count 'd'. If the set we
20354 * found contains 2**d elements, it is the closure of all code
20355 * points that differ only in those bit positions. To convince
20356 * yourself of that, first note that the number in the closure must
20357 * be a power of 2, which we test for. The only way we could have
20358 * that count and it be some differing set, is if we got some code
20359 * points that don't differ from the lowest code point in any
20360 * position, but do differ from each other in some other position.
20361 * That means one code point has a 1 in that position, and another
20362 * has a 0. But that would mean that one of them differs from the
20363 * lowest code point in that position, which possibility we've
20364 * already excluded. */
20365 if ( (inverted || full_cp_count > 1)
20366 && full_cp_count == 1U << PL_bitcount[bits_differing])
20370 op = ANYOFM + inverted;;
20372 /* We need to make the bits that differ be 0's */
20373 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
20375 /* The argument is the lowest code point */
20376 *ret = reganode(pRExC_state, op, lowest_cp);
20377 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
20381 invlist_iterfinish(cp_list);
20385 _invlist_invert(cp_list);
20392 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
20393 * were invariants, it wasn't inverted, and there is a single range.
20394 * This would be faster than some of the posix nodes we create below
20395 * like /\d/a, but would be twice the size. Without having actually
20396 * measured the gain, khw doesn't think the tradeoff is really worth it
20400 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
20401 PERL_UINT_FAST8_T type;
20402 SV * intersection = NULL;
20403 SV* d_invlist = NULL;
20405 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
20406 * ones are about the same speed as ANYOF ops, but take less room; the
20407 * ones that have above-Latin1 code point matches are somewhat faster
20410 for (type = POSIXA; type >= POSIXD; type--) {
20413 if (type == POSIXL) { /* But not /l posix classes */
20417 for (posix_class = 0;
20418 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
20421 SV** our_code_points = &cp_list;
20422 SV** official_code_points;
20425 if (type == POSIXA) {
20426 official_code_points = &PL_Posix_ptrs[posix_class];
20429 official_code_points = &PL_XPosix_ptrs[posix_class];
20432 /* Skip non-existent classes of this type. e.g. \v only has an
20433 * entry in PL_XPosix_ptrs */
20434 if (! *official_code_points) {
20438 /* Try both the regular class, and its inversion */
20439 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
20440 bool this_inverted = *invert ^ try_inverted;
20442 if (type != POSIXD) {
20444 /* This class that isn't /d can't match if we have /d
20446 if (has_runtime_dependency
20447 & HAS_D_RUNTIME_DEPENDENCY)
20452 else /* is /d */ if (! this_inverted) {
20454 /* /d classes don't match anything non-ASCII below 256
20455 * unconditionally (which cp_list contains) */
20456 _invlist_intersection(cp_list, PL_UpperLatin1,
20458 if (_invlist_len(intersection) != 0) {
20462 SvREFCNT_dec(d_invlist);
20463 d_invlist = invlist_clone(cp_list, NULL);
20465 /* But under UTF-8 it turns into using /u rules. Add
20466 * the things it matches under these conditions so that
20467 * we check below that these are identical to what the
20468 * tested class should match */
20469 if (upper_latin1_only_utf8_matches) {
20472 upper_latin1_only_utf8_matches,
20475 our_code_points = &d_invlist;
20477 else { /* POSIXD, inverted. If this doesn't have this
20478 flag set, it isn't /d. */
20479 if (! ( *anyof_flags
20480 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
20485 our_code_points = &cp_list;
20488 /* Here, have weeded out some things. We want to see if
20489 * the list of characters this node contains
20490 * ('*our_code_points') precisely matches those of the
20491 * class we are currently checking against
20492 * ('*official_code_points'). */
20493 if (_invlistEQ(*our_code_points,
20494 *official_code_points,
20497 /* Here, they precisely match. Optimize this ANYOF
20498 * node into its equivalent POSIX one of the correct
20499 * type, possibly inverted.
20501 * Some of these nodes match a single range of
20502 * characters (or [:alpha:] matches two parallel ranges
20503 * on ASCII platforms). The array lookup at execution
20504 * time could be replaced by a range check for such
20505 * nodes. But regnodes are a finite resource, and the
20506 * possible performance boost isn't large, so this
20507 * hasn't been done. An attempt to use just one node
20508 * (and its inverse) to encompass all such cases was
20509 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
20510 * But the shifting/masking it used ended up being
20511 * slower than the array look up, so it was reverted */
20512 op = (try_inverted)
20513 ? type + NPOSIXA - POSIXA
20515 *ret = reg_node(pRExC_state, op);
20516 FLAGS(REGNODE_p(*ret)) = posix_class;
20517 SvREFCNT_dec(d_invlist);
20518 SvREFCNT_dec(intersection);
20524 SvREFCNT_dec(d_invlist);
20525 SvREFCNT_dec(intersection);
20528 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
20529 * in size and speed. Currently, a 20 bit range base (smallest code point
20530 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
20531 * This allows for using it on all of the Unicode code points except for
20532 * the highest plane, which is only for private use code points. khw
20533 * doubts that a bigger delta is likely in real world applications */
20535 && ! has_runtime_dependency
20536 && *anyof_flags == 0
20537 && start[0] < (1 << ANYOFR_BASE_BITS)
20538 && end[0] - start[0]
20539 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
20540 * CHARBITS - ANYOFR_BASE_BITS))))
20543 U8 low_utf8[UTF8_MAXBYTES+1];
20544 U8 high_utf8[UTF8_MAXBYTES+1];
20547 *ret = reganode(pRExC_state, op,
20548 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
20550 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
20551 * efficient ruling out at run time of many possible inputs. */
20552 (void) uvchr_to_utf8(low_utf8, start[0]);
20553 (void) uvchr_to_utf8(high_utf8, end[0]);
20555 /* If all code points share the same first byte, this can be an
20556 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
20557 * quickly rule out many inputs at run-time without having to compute
20558 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
20559 * transformation would not rule out nearly so many things */
20560 if (low_utf8[0] == high_utf8[0]) {
20562 OP(REGNODE_p(*ret)) = op;
20563 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
20566 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
20572 /* If didn't find an optimization and there is no need for a bitmap,
20573 * of the lowest code points, optimize to indicate that */
20574 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
20576 && ! upper_latin1_only_utf8_matches
20577 && *anyof_flags == 0)
20579 U8 low_utf8[UTF8_MAXBYTES+1];
20580 UV highest_cp = invlist_highest(cp_list);
20582 /* Currently the maximum allowed code point by the system is IV_MAX.
20583 * Higher ones are reserved for future internal use. This particular
20584 * regnode can be used for higher ones, but we can't calculate the code
20585 * point of those. IV_MAX suffices though, as it will be a large first
20587 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
20590 /* We store the lowest possible first byte of the UTF-8 representation,
20591 * using the flags field. This allows for quick ruling out of some
20592 * inputs without having to convert from UTF-8 to code point. For
20593 * EBCDIC, we use I8, as not doing that transformation would not rule
20594 * out nearly so many things */
20595 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
20599 /* If the first UTF-8 start byte for the highest code point in the
20600 * range is suitably small, we may be able to get an upper bound as
20602 if (highest_cp <= IV_MAX) {
20603 U8 high_utf8[UTF8_MAXBYTES+1];
20604 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
20606 /* If the lowest and highest are the same, we can get an exact
20607 * first byte instead of a just minimum or even a sequence of exact
20608 * leading bytes. We signal these with different regnodes */
20609 if (low_utf8[0] == high_utf8[0]) {
20610 Size_t len = find_first_differing_byte_pos(low_utf8,
20612 MIN(low_len, high_len));
20615 /* No need to convert to I8 for EBCDIC as this is an exact
20617 *anyof_flags = low_utf8[0];
20619 if (high_len == 2) {
20620 /* If the elements matched all have a 2-byte UTF-8
20621 * representation, with the first byte being the same,
20622 * we can use a compact, fast regnode. capable of
20623 * matching any combination of continuation byte
20626 * (A similar regnode could be created for the Latin1
20627 * range; the complication being that it could match
20628 * non-UTF8 targets. The internal bitmap would serve
20629 * both cases; with some extra code in regexec.c) */
20631 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
20632 FILL_NODE(*ret, op);
20633 ((struct regnode_bbm *) REGNODE_p(*ret))->first_byte = low_utf8[0],
20635 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
20636 * directly based on the continuation byte, without
20637 * needing to convert to code point */
20638 populate_bitmap_from_invlist(
20641 /* The base code point is from the start byte */
20642 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
20643 UTF_CONTINUATION_MARK | 0),
20645 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
20646 REGNODE_BBM_BITMAP_LEN);
20647 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
20656 *ret = REGNODE_GUTS(pRExC_state, op,
20657 REGNODE_ARG_LEN(op) + STR_SZ(len));
20658 FILL_NODE(*ret, op);
20659 ((struct regnode_anyofhs *) REGNODE_p(*ret))->str_len
20661 Copy(low_utf8, /* Add the common bytes */
20662 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
20664 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
20665 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
20666 NULL, only_utf8_locale_list);
20670 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
20672 /* Here, the high byte is not the same as the low, but is small
20673 * enough that its reasonable to have a loose upper bound,
20674 * which is packed in with the strict lower bound. See
20675 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
20676 * platforms, I8 is used. On ASCII platforms I8 is the same
20677 * thing as UTF-8 */
20680 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
20681 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
20684 if (range_diff <= max_range_diff / 8) {
20687 else if (range_diff <= max_range_diff / 4) {
20690 else if (range_diff <= max_range_diff / 2) {
20693 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
20703 *ret = reganode(pRExC_state, op, 0);
20708 *ret = reg_node(pRExC_state, op);
20713 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
20716 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
20717 regnode* const node,
20719 SV* const runtime_defns,
20720 SV* const only_utf8_locale_list)
20722 /* Sets the arg field of an ANYOF-type node 'node', using information about
20723 * the node passed-in. If only the bitmap is needed to determine what
20724 * matches, the arg is set appropriately to either
20725 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
20726 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
20728 * Otherwise, it sets the argument to the count returned by add_data(),
20729 * having allocated and stored an array, av, as follows:
20730 * av[0] stores the inversion list defining this class as far as known at
20731 * this time, or PL_sv_undef if nothing definite is now known.
20732 * av[1] stores the inversion list of code points that match only if the
20733 * current locale is UTF-8, or if none, PL_sv_undef if there is an
20734 * av[2], or no entry otherwise.
20735 * av[2] stores the list of user-defined properties whose subroutine
20736 * definitions aren't known at this time, or no entry if none. */
20740 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20742 /* If this is set, the final disposition won't be known until runtime, so
20743 * we can't do any of the compile time optimizations */
20744 if (! runtime_defns) {
20746 /* On plain ANYOF nodes without the possibility of a runtime locale
20747 * making a difference, maybe there's no information to be gleaned
20748 * except for what's in the bitmap */
20749 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
20751 /* There are two such cases:
20752 * 1) there is no list of code points matched outside the bitmap
20755 ARG_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
20759 /* 2) the list indicates everything outside the bitmap matches */
20760 if ( invlist_highest(cp_list) == UV_MAX
20761 && invlist_highest_range_start(cp_list)
20762 <= NUM_ANYOF_CODE_POINTS)
20764 ARG_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
20768 /* In all other cases there are things outside the bitmap that we
20769 * may need to check at runtime. */
20772 /* Here, we have resolved all the possible run-time matches, and they
20773 * are stored in one or both of two possible lists. (While some match
20774 * only under certain runtime circumstances, we know all the possible
20775 * ones for each such circumstance.)
20777 * It may very well be that the pattern being compiled contains an
20778 * identical class, already encountered. Reusing that class here saves
20779 * space. Look through all classes so far encountered. */
20780 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
20781 for (unsigned int i = 0; i < existing_items; i++) {
20783 /* Only look at auxiliary data of this type */
20784 if (RExC_rxi->data->what[i] != 's') {
20788 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
20789 AV * const av = MUTABLE_AV(SvRV(rv));
20791 /* If the already encountered class has data that won't be known
20792 * until runtime (stored in the final element of the array), we
20794 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
20798 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
20799 false /* no lvalue */);
20801 /* The new and the existing one both have to have or both not
20802 * have this element, for this one to duplicate that one */
20803 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
20807 /* If the inversion lists aren't equivalent, can't share */
20808 if (cp_list && ! _invlistEQ(cp_list,
20809 *stored_cp_list_ptr,
20810 FALSE /* don't complement */))
20815 /* Similarly for the other list */
20816 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
20818 ONLY_LOCALE_MATCHES_INDEX,
20819 false /* no lvalue */);
20820 if ( cBOOL(only_utf8_locale_list)
20821 != cBOOL(stored_only_utf8_locale_list_ptr))
20826 if (only_utf8_locale_list && ! _invlistEQ(
20827 only_utf8_locale_list,
20828 *stored_only_utf8_locale_list_ptr,
20829 FALSE /* don't complement */))
20834 /* Here, the existence and contents of both compile-time lists
20835 * are identical between the new and existing data. Re-use the
20839 } /* end of loop through existing classes */
20842 /* Here, we need to create a new auxiliary data element; either because
20843 * this doesn't duplicate an existing one, or we can't tell at this time if
20844 * it eventually will */
20846 AV * const av = newAV();
20850 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20853 /* (Note that if any of this changes, the size calculations in
20854 * S_optimize_regclass() might need to be updated.) */
20856 if (only_utf8_locale_list) {
20857 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20858 SvREFCNT_inc_NN(only_utf8_locale_list));
20861 if (runtime_defns) {
20862 av_store(av, DEFERRED_USER_DEFINED_INDEX,
20863 SvREFCNT_inc_NN(runtime_defns));
20866 rv = newRV_noinc(MUTABLE_SV(av));
20867 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20868 RExC_rxi->data->data[n] = (void*)rv;
20874 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20875 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20877 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20881 /* For internal core use only.
20882 * Returns the inversion list for the input 'node' in the regex 'prog'.
20883 * If <doinit> is 'true', will attempt to create the inversion list if not
20884 * already done. If it is created, it will add to the normal inversion
20885 * list any that comes from user-defined properties. It croaks if this
20886 * is called before such a list is ready to be generated, that is when a
20887 * user-defined property has been declared, buyt still not yet defined.
20888 * If <listsvp> is non-null, will return the printable contents of the
20889 * property definition. This can be used to get debugging information
20890 * even before the inversion list exists, by calling this function with
20891 * 'doinit' set to false, in which case the components that will be used
20892 * to eventually create the inversion list are returned (in a printable
20894 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20895 * store an inversion list of code points that should match only if the
20896 * execution-time locale is a UTF-8 one.
20897 * If <output_invlist> is not NULL, it is where this routine is to store an
20898 * inversion list of the code points that would be instead returned in
20899 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20900 * when this parameter is used, is just the non-code point data that
20901 * will go into creating the inversion list. This currently should be just
20902 * user-defined properties whose definitions were not known at compile
20903 * time. Using this parameter allows for easier manipulation of the
20904 * inversion list's data by the caller. It is illegal to call this
20905 * function with this parameter set, but not <listsvp>
20907 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20908 * that, in spite of this function's name, the inversion list it returns
20909 * may include the bitmap data as well */
20911 SV *si = NULL; /* Input initialization string */
20912 SV* invlist = NULL;
20914 RXi_GET_DECL_NULL(prog, progi);
20915 const struct reg_data * const data = prog ? progi->data : NULL;
20917 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20918 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
20920 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
20922 assert(! output_invlist || listsvp);
20924 if (data && data->count) {
20925 const U32 n = ARG(node);
20927 if (data->what[n] == 's') {
20928 SV * const rv = MUTABLE_SV(data->data[n]);
20929 AV * const av = MUTABLE_AV(SvRV(rv));
20930 SV **const ary = AvARRAY(av);
20932 invlist = ary[INVLIST_INDEX];
20934 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20935 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20938 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20939 si = ary[DEFERRED_USER_DEFINED_INDEX];
20942 if (doinit && (si || invlist)) {
20945 SV * msg = newSVpvs_flags("", SVs_TEMP);
20947 SV * prop_definition = handle_user_defined_property(
20948 "", 0, FALSE, /* There is no \p{}, \P{} */
20949 SvPVX_const(si)[1] - '0', /* /i or not has been
20950 stored here for just
20952 TRUE, /* run time */
20953 FALSE, /* This call must find the defn */
20954 si, /* The property definition */
20957 0 /* base level call */
20961 assert(prop_definition == NULL);
20963 Perl_croak(aTHX_ "%" UTF8f,
20964 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20968 _invlist_union(invlist, prop_definition, &invlist);
20969 SvREFCNT_dec_NN(prop_definition);
20972 invlist = prop_definition;
20975 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20976 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20978 ary[INVLIST_INDEX] = invlist;
20979 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20980 ? ONLY_LOCALE_MATCHES_INDEX
20988 /* If requested, return a printable version of what this ANYOF node matches
20991 SV* matches_string = NULL;
20993 /* This function can be called at compile-time, before everything gets
20994 * resolved, in which case we return the currently best available
20995 * information, which is the string that will eventually be used to do
20996 * that resolving, 'si' */
20998 /* Here, we only have 'si' (and possibly some passed-in data in
20999 * 'invlist', which is handled below) If the caller only wants
21000 * 'si', use that. */
21001 if (! output_invlist) {
21002 matches_string = newSVsv(si);
21005 /* But if the caller wants an inversion list of the node, we
21006 * need to parse 'si' and place as much as possible in the
21007 * desired output inversion list, making 'matches_string' only
21008 * contain the currently unresolvable things */
21009 const char *si_string = SvPVX(si);
21010 STRLEN remaining = SvCUR(si);
21014 /* Ignore everything before and including the first new-line */
21015 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
21016 assert (si_string != NULL);
21018 remaining = SvPVX(si) + SvCUR(si) - si_string;
21020 while (remaining > 0) {
21022 /* The data consists of just strings defining user-defined
21023 * property names, but in prior incarnations, and perhaps
21024 * somehow from pluggable regex engines, it could still
21025 * hold hex code point definitions, all of which should be
21026 * legal (or it wouldn't have gotten this far). Each
21027 * component of a range would be separated by a tab, and
21028 * each range by a new-line. If these are found, instead
21029 * add them to the inversion list */
21030 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
21031 |PERL_SCAN_SILENT_NON_PORTABLE;
21032 STRLEN len = remaining;
21033 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
21035 /* If the hex decode routine found something, it should go
21036 * up to the next \n */
21037 if ( *(si_string + len) == '\n') {
21038 if (count) { /* 2nd code point on line */
21039 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
21042 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
21045 goto prepare_for_next_iteration;
21048 /* If the hex decode was instead for the lower range limit,
21049 * save it, and go parse the upper range limit */
21050 if (*(si_string + len) == '\t') {
21051 assert(count == 0);
21055 prepare_for_next_iteration:
21056 si_string += len + 1;
21057 remaining -= len + 1;
21061 /* Here, didn't find a legal hex number. Just add the text
21062 * from here up to the next \n, omitting any trailing
21066 len = strcspn(si_string,
21067 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
21069 if (matches_string) {
21070 sv_catpvn(matches_string, si_string, len);
21073 matches_string = newSVpvn(si_string, len);
21075 sv_catpvs(matches_string, " ");
21079 && UCHARAT(si_string)
21080 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
21085 if (remaining && UCHARAT(si_string) == '\n') {
21089 } /* end of loop through the text */
21091 assert(matches_string);
21092 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
21093 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
21095 } /* end of has an 'si' */
21098 /* Add the stuff that's already known */
21101 /* Again, if the caller doesn't want the output inversion list, put
21102 * everything in 'matches-string' */
21103 if (! output_invlist) {
21104 if ( ! matches_string) {
21105 matches_string = newSVpvs("\n");
21107 sv_catsv(matches_string, invlist_contents(invlist,
21108 TRUE /* traditional style */
21111 else if (! *output_invlist) {
21112 *output_invlist = invlist_clone(invlist, NULL);
21115 _invlist_union(*output_invlist, invlist, output_invlist);
21119 *listsvp = matches_string;
21125 /* reg_skipcomment()
21127 Absorbs an /x style # comment from the input stream,
21128 returning a pointer to the first character beyond the comment, or if the
21129 comment terminates the pattern without anything following it, this returns
21130 one past the final character of the pattern (in other words, RExC_end) and
21131 sets the REG_RUN_ON_COMMENT_SEEN flag.
21133 Note it's the callers responsibility to ensure that we are
21134 actually in /x mode
21138 PERL_STATIC_INLINE char*
21139 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
21141 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
21145 while (p < RExC_end) {
21146 if (*(++p) == '\n') {
21151 /* we ran off the end of the pattern without ending the comment, so we have
21152 * to add an \n when wrapping */
21153 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
21158 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
21160 const bool force_to_xmod
21163 /* If the text at the current parse position '*p' is a '(?#...)' comment,
21164 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
21165 * is /x whitespace, advance '*p' so that on exit it points to the first
21166 * byte past all such white space and comments */
21168 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
21170 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
21172 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
21175 if (RExC_end - (*p) >= 3
21177 && *(*p + 1) == '?'
21178 && *(*p + 2) == '#')
21180 while (*(*p) != ')') {
21181 if ((*p) == RExC_end)
21182 FAIL("Sequence (?#... not terminated");
21190 const char * save_p = *p;
21191 while ((*p) < RExC_end) {
21193 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
21196 else if (*(*p) == '#') {
21197 (*p) = reg_skipcomment(pRExC_state, (*p));
21203 if (*p != save_p) {
21216 Advances the parse position by one byte, unless that byte is the beginning
21217 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
21218 those two cases, the parse position is advanced beyond all such comments and
21221 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
21225 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
21227 PERL_ARGS_ASSERT_NEXTCHAR;
21229 if (RExC_parse < RExC_end) {
21231 || UTF8_IS_INVARIANT(*RExC_parse)
21232 || UTF8_IS_START(*RExC_parse));
21234 RExC_parse_inc_safe();
21236 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
21237 FALSE /* Don't force /x */ );
21242 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
21244 /* 'size' is the delta number of smallest regnode equivalents to add or
21245 * subtract from the current memory allocated to the regex engine being
21248 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
21253 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
21254 /* +1 for REG_MAGIC */
21257 if ( RExC_rxi == NULL )
21258 FAIL("Regexp out of space");
21259 RXi_SET(RExC_rx, RExC_rxi);
21261 RExC_emit_start = RExC_rxi->program;
21263 Zero(REGNODE_p(RExC_emit), size, regnode);
21267 STATIC regnode_offset
21268 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
21270 /* Allocate a regnode that is (1 + extra_size) times as big as the
21271 * smallest regnode worth of space, and also aligns and increments
21272 * RExC_size appropriately.
21274 * It returns the regnode's offset into the regex engine program */
21276 const regnode_offset ret = RExC_emit;
21278 PERL_ARGS_ASSERT_REGNODE_GUTS;
21280 SIZE_ALIGN(RExC_size);
21281 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
21282 NODE_ALIGN_FILL(REGNODE_p(ret));
21288 STATIC regnode_offset
21289 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
21290 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
21291 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
21292 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
21300 - reg_node - emit a node
21302 STATIC regnode_offset /* Location. */
21303 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
21305 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21306 regnode_offset ptr = ret;
21308 PERL_ARGS_ASSERT_REG_NODE;
21310 assert(REGNODE_ARG_LEN(op) == 0);
21312 FILL_ADVANCE_NODE(ptr, op);
21318 - reganode - emit a node with an argument
21320 STATIC regnode_offset /* Location. */
21321 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
21323 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21324 regnode_offset ptr = ret;
21326 PERL_ARGS_ASSERT_REGANODE;
21328 /* ANYOF are special cased to allow non-length 1 args */
21329 assert(REGNODE_ARG_LEN(op) == 1);
21331 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
21337 - regpnode - emit a temporary node with a SV* argument
21339 STATIC regnode_offset /* Location. */
21340 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
21342 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21343 regnode_offset ptr = ret;
21345 PERL_ARGS_ASSERT_REGPNODE;
21347 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
21352 STATIC regnode_offset
21353 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
21355 /* emit a node with U32 and I32 arguments */
21357 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
21358 regnode_offset ptr = ret;
21360 PERL_ARGS_ASSERT_REG2LANODE;
21362 assert(REGNODE_ARG_LEN(op) == 2);
21364 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
21370 - reginsert - insert an operator in front of already-emitted operand
21372 * That means that on exit 'operand' is the offset of the newly inserted
21373 * operator, and the original operand has been relocated.
21375 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
21376 * set up NEXT_OFF() of the inserted node if needed. Something like this:
21378 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
21379 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
21381 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
21384 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
21385 const regnode_offset operand, const U32 depth)
21390 const int offset = REGNODE_ARG_LEN((U8)op);
21391 const int size = NODE_STEP_REGNODE + offset;
21392 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21394 PERL_ARGS_ASSERT_REGINSERT;
21395 PERL_UNUSED_CONTEXT;
21396 PERL_UNUSED_ARG(depth);
21397 /* (REGNODE_TYPE((U8)op) == CURLY ? EXTRA_STEP_2ARGS : 0); */
21398 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
21399 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
21400 studying. If this is wrong then we need to adjust RExC_recurse
21401 below like we do with RExC_open_parens/RExC_close_parens. */
21402 change_engine_size(pRExC_state, (Ptrdiff_t) size);
21403 src = REGNODE_p(RExC_emit);
21405 dst = REGNODE_p(RExC_emit);
21407 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
21408 * and [perl #133871] shows this can lead to problems, so skip this
21409 * realignment of parens until a later pass when they are reliable */
21410 if (! IN_PARENS_PASS && RExC_open_parens) {
21412 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
21413 /* remember that RExC_npar is rex->nparens + 1,
21414 * iow it is 1 more than the number of parens seen in
21415 * the pattern so far. */
21416 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
21417 /* note, RExC_open_parens[0] is the start of the
21418 * regex, it can't move. RExC_close_parens[0] is the end
21419 * of the regex, it *can* move. */
21420 if ( paren && RExC_open_parens[paren] >= operand ) {
21421 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
21422 RExC_open_parens[paren] += size;
21424 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
21426 if ( RExC_close_parens[paren] >= operand ) {
21427 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
21428 RExC_close_parens[paren] += size;
21430 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
21435 RExC_end_op += size;
21437 while (src > REGNODE_p(operand)) {
21438 StructCopy(--src, --dst, regnode);
21441 place = REGNODE_p(operand); /* Op node, where operand used to be. */
21442 src = place + 1; /* NOT REGNODE_AFTER! */
21444 FILL_NODE(operand, op);
21446 /* Zero out any arguments in the new node */
21447 Zero(src, offset, regnode);
21451 - regtail - set the next-pointer at the end of a node chain of p to val. If
21452 that value won't fit in the space available, instead returns FALSE.
21453 (Except asserts if we can't fit in the largest space the regex
21454 engine is designed for.)
21455 - SEE ALSO: regtail_study
21458 S_regtail(pTHX_ RExC_state_t * pRExC_state,
21459 const regnode_offset p,
21460 const regnode_offset val,
21463 regnode_offset scan;
21464 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21466 PERL_ARGS_ASSERT_REGTAIL;
21468 PERL_UNUSED_ARG(depth);
21471 /* The final node in the chain is the first one with a nonzero next pointer
21473 scan = (regnode_offset) p;
21475 regnode * const temp = regnext(REGNODE_p(scan));
21477 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
21478 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
21479 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
21480 SvPV_nolen_const(RExC_mysv), scan,
21481 (temp == NULL ? "->" : ""),
21482 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
21487 scan = REGNODE_OFFSET(temp);
21490 /* Populate this node's next pointer */
21491 assert(val >= scan);
21492 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
21493 assert((UV) (val - scan) <= U32_MAX);
21494 ARG_SET(REGNODE_p(scan), val - scan);
21497 if (val - scan > U16_MAX) {
21498 /* Populate this with something that won't loop and will likely
21499 * lead to a crash if the caller ignores the failure return, and
21500 * execution continues */
21501 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
21504 NEXT_OFF(REGNODE_p(scan)) = val - scan;
21512 - regtail_study - set the next-pointer at the end of a node chain of p to val.
21513 - Look for optimizable sequences at the same time.
21514 - currently only looks for EXACT chains.
21516 This is experimental code. The idea is to use this routine to perform
21517 in place optimizations on branches and groups as they are constructed,
21518 with the long term intention of removing optimization from study_chunk so
21519 that it is purely analytical.
21521 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
21522 to control which is which.
21524 This used to return a value that was ignored. It was a problem that it is
21525 #ifdef'd to be another function that didn't return a value. khw has changed it
21526 so both currently return a pass/fail return.
21529 /* TODO: All four parms should be const */
21532 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
21533 const regnode_offset val, U32 depth)
21535 regnode_offset scan;
21537 #ifdef EXPERIMENTAL_INPLACESCAN
21540 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21542 PERL_ARGS_ASSERT_REGTAIL_STUDY;
21545 /* Find last node. */
21549 regnode * const temp = regnext(REGNODE_p(scan));
21550 #ifdef EXPERIMENTAL_INPLACESCAN
21551 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
21552 bool unfolded_multi_char; /* Unexamined in this routine */
21553 if (join_exact(pRExC_state, scan, &min,
21554 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
21555 return TRUE; /* Was return EXACT */
21559 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
21560 if (exact == PSEUDO )
21561 exact= OP(REGNODE_p(scan));
21562 else if (exact != OP(REGNODE_p(scan)) )
21565 else if (OP(REGNODE_p(scan)) != NOTHING) {
21570 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
21571 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
21572 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
21573 SvPV_nolen_const(RExC_mysv),
21575 REGNODE_NAME(exact));
21579 scan = REGNODE_OFFSET(temp);
21582 DEBUG_PARSE_MSG("");
21583 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
21584 Perl_re_printf( aTHX_
21585 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
21586 SvPV_nolen_const(RExC_mysv),
21591 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
21592 assert((UV) (val - scan) <= U32_MAX);
21593 ARG_SET(REGNODE_p(scan), val - scan);
21596 if (val - scan > U16_MAX) {
21597 /* Populate this with something that won't loop and will likely
21598 * lead to a crash if the caller ignores the failure return, and
21599 * execution continues */
21600 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
21603 NEXT_OFF(REGNODE_p(scan)) = val - scan;
21606 return TRUE; /* Was 'return exact' */
21611 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
21613 /* Returns an inversion list of all the code points matched by the
21614 * ANYOFM/NANYOFM node 'n' */
21616 SV * cp_list = _new_invlist(-1);
21617 const U8 lowest = (U8) ARG(n);
21620 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
21622 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
21624 /* Starting with the lowest code point, any code point that ANDed with the
21625 * mask yields the lowest code point is in the set */
21626 for (i = lowest; i <= 0xFF; i++) {
21627 if ((i & FLAGS(n)) == ARG(n)) {
21628 cp_list = add_cp_to_invlist(cp_list, i);
21631 /* We know how many code points (a power of two) that are in the
21632 * set. No use looking once we've got that number */
21633 if (count >= needed) break;
21637 if (OP(n) == NANYOFM) {
21638 _invlist_invert(cp_list);
21644 S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
21645 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
21647 SV * cp_list = NULL;
21648 populate_invlist_from_bitmap(
21649 ((struct regnode_bbm *) n)->bitmap,
21650 REGNODE_BBM_BITMAP_LEN * CHARBITS,
21653 /* The base cp is from the start byte plus a zero continuation */
21654 TWO_BYTE_UTF8_TO_NATIVE(((struct regnode_bbm *) n)->first_byte,
21655 UTF_CONTINUATION_MARK | 0));
21660 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
21665 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
21670 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21672 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
21673 if (flags & (1<<bit)) {
21674 if (!set++ && lead)
21675 Perl_re_printf( aTHX_ "%s", lead);
21676 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
21681 Perl_re_printf( aTHX_ "\n");
21683 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
21688 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
21694 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
21696 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
21697 if (flags & (1U<<bit)) {
21698 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
21701 if (!set++ && lead)
21702 Perl_re_printf( aTHX_ "%s", lead);
21703 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
21706 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
21707 if (!set++ && lead) {
21708 Perl_re_printf( aTHX_ "%s", lead);
21711 case REGEX_UNICODE_CHARSET:
21712 Perl_re_printf( aTHX_ "UNICODE");
21714 case REGEX_LOCALE_CHARSET:
21715 Perl_re_printf( aTHX_ "LOCALE");
21717 case REGEX_ASCII_RESTRICTED_CHARSET:
21718 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
21720 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
21721 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
21724 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
21730 Perl_re_printf( aTHX_ "\n");
21732 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
21738 Perl_regdump(pTHX_ const regexp *r)
21742 SV * const sv = sv_newmortal();
21743 SV *dsv= sv_newmortal();
21744 RXi_GET_DECL(r, ri);
21745 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21747 PERL_ARGS_ASSERT_REGDUMP;
21749 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
21751 /* Header fields of interest. */
21752 for (i = 0; i < 2; i++) {
21753 if (r->substrs->data[i].substr) {
21754 RE_PV_QUOTED_DECL(s, 0, dsv,
21755 SvPVX_const(r->substrs->data[i].substr),
21756 RE_SV_DUMPLEN(r->substrs->data[i].substr),
21757 PL_dump_re_max_len);
21758 Perl_re_printf( aTHX_
21759 "%s %s%s at %" IVdf "..%" UVuf " ",
21760 i ? "floating" : "anchored",
21762 RE_SV_TAIL(r->substrs->data[i].substr),
21763 (IV)r->substrs->data[i].min_offset,
21764 (UV)r->substrs->data[i].max_offset);
21766 else if (r->substrs->data[i].utf8_substr) {
21767 RE_PV_QUOTED_DECL(s, 1, dsv,
21768 SvPVX_const(r->substrs->data[i].utf8_substr),
21769 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
21771 Perl_re_printf( aTHX_
21772 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
21773 i ? "floating" : "anchored",
21775 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
21776 (IV)r->substrs->data[i].min_offset,
21777 (UV)r->substrs->data[i].max_offset);
21781 if (r->check_substr || r->check_utf8)
21782 Perl_re_printf( aTHX_
21784 ( r->check_substr == r->substrs->data[1].substr
21785 && r->check_utf8 == r->substrs->data[1].utf8_substr
21786 ? "(checking floating" : "(checking anchored"));
21787 if (r->intflags & PREGf_NOSCAN)
21788 Perl_re_printf( aTHX_ " noscan");
21789 if (r->extflags & RXf_CHECK_ALL)
21790 Perl_re_printf( aTHX_ " isall");
21791 if (r->check_substr || r->check_utf8)
21792 Perl_re_printf( aTHX_ ") ");
21794 if (ri->regstclass) {
21795 regprop(r, sv, ri->regstclass, NULL, NULL);
21796 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
21798 if (r->intflags & PREGf_ANCH) {
21799 Perl_re_printf( aTHX_ "anchored");
21800 if (r->intflags & PREGf_ANCH_MBOL)
21801 Perl_re_printf( aTHX_ "(MBOL)");
21802 if (r->intflags & PREGf_ANCH_SBOL)
21803 Perl_re_printf( aTHX_ "(SBOL)");
21804 if (r->intflags & PREGf_ANCH_GPOS)
21805 Perl_re_printf( aTHX_ "(GPOS)");
21806 Perl_re_printf( aTHX_ " ");
21808 if (r->intflags & PREGf_GPOS_SEEN)
21809 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
21810 if (r->intflags & PREGf_SKIP)
21811 Perl_re_printf( aTHX_ "plus ");
21812 if (r->intflags & PREGf_IMPLICIT)
21813 Perl_re_printf( aTHX_ "implicit ");
21814 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
21815 if (r->extflags & RXf_EVAL_SEEN)
21816 Perl_re_printf( aTHX_ "with eval ");
21817 Perl_re_printf( aTHX_ "\n");
21819 regdump_extflags("r->extflags: ", r->extflags);
21820 regdump_intflags("r->intflags: ", r->intflags);
21823 PERL_ARGS_ASSERT_REGDUMP;
21824 PERL_UNUSED_CONTEXT;
21825 PERL_UNUSED_ARG(r);
21826 #endif /* DEBUGGING */
21829 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21832 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
21833 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
21834 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
21835 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
21836 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
21837 || CC_VERTSPACE_ != 15
21838 # error Need to adjust order of anyofs[]
21840 static const char * const anyofs[] = {
21877 - regprop - printable representation of opcode, with run time support
21881 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21885 const U8 op = OP(o);
21886 RXi_GET_DECL(prog, progi);
21887 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21889 PERL_ARGS_ASSERT_REGPROP;
21893 if (op > REGNODE_MAX) { /* regnode.type is unsigned */
21894 if (pRExC_state) { /* This gives more info, if we have it */
21895 FAIL3("panic: corrupted regexp opcode %d > %d",
21896 (int)op, (int)REGNODE_MAX);
21899 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21900 (int)op, (int)REGNODE_MAX);
21903 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
21905 k = REGNODE_TYPE(op);
21908 sv_catpvs(sv, " ");
21909 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21910 * is a crude hack but it may be the best for now since
21911 * we have no flag "this EXACTish node was UTF-8"
21913 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21914 PL_colors[0], PL_colors[1],
21915 PERL_PV_ESCAPE_UNI_DETECT |
21916 PERL_PV_ESCAPE_NONASCII |
21917 PERL_PV_PRETTY_ELLIPSES |
21918 PERL_PV_PRETTY_LTGT |
21919 PERL_PV_PRETTY_NOCLEAR
21921 } else if (k == TRIE) {
21922 /* print the details of the trie in dumpuntil instead, as
21923 * progi->data isn't available here */
21924 const U32 n = ARG(o);
21925 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21926 (reg_ac_data *)progi->data->data[n] :
21928 const reg_trie_data * const trie
21929 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21931 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags));
21932 DEBUG_TRIE_COMPILE_r({
21934 sv_catpvs(sv, "(JUMP)");
21935 Perl_sv_catpvf(aTHX_ sv,
21936 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21937 (UV)trie->startstate,
21938 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21939 (UV)trie->wordcount,
21942 (UV)TRIE_CHARCOUNT(trie),
21943 (UV)trie->uniquecharcount
21946 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21947 sv_catpvs(sv, "[");
21948 (void) put_charclass_bitmap_innards(sv,
21949 ((IS_ANYOF_TRIE(op))
21951 : TRIE_BITMAP(trie)),
21958 sv_catpvs(sv, "]");
21960 } else if (k == CURLY) {
21961 U32 lo = ARG1(o), hi = ARG2(o);
21962 if (op == CURLYM || op == CURLYN || op == CURLYX)
21963 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21964 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21965 if (hi == REG_INFTY)
21966 sv_catpvs(sv, "INFTY");
21968 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21969 sv_catpvs(sv, "}");
21971 else if (k == WHILEM && o->flags) /* Ordinal/of */
21972 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21973 else if (k == REF || k == OPEN || k == CLOSE
21974 || k == GROUPP || op == ACCEPT)
21976 AV *name_list= NULL;
21977 U32 parno= op == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21978 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21979 if ( RXp_PAREN_NAMES(prog) ) {
21980 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21981 } else if ( pRExC_state ) {
21982 name_list= RExC_paren_name_list;
21985 if ( k != REF || (op < REFN)) {
21986 SV **name= av_fetch(name_list, parno, 0 );
21988 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21992 /* parno must always be larger than 0 for this block
21993 * as it represents a slot into the data array, which
21994 * has the 0 slot reserved for a placeholder so any valid
21995 * index into it is always true, eg non-zero
21996 * see the '%' "what" type and the implementation of
21999 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
22000 I32 *nums=(I32*)SvPVX(sv_dat);
22001 SV **name= av_fetch(name_list, nums[0], 0 );
22004 for ( n=0; n<SvIVX(sv_dat); n++ ) {
22005 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
22006 (n ? "," : ""), (IV)nums[n]);
22008 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
22012 if ( k == REF && reginfo) {
22013 U32 n = ARG(o); /* which paren pair */
22014 I32 ln = prog->offs[n].start;
22015 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
22016 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
22017 else if (ln == prog->offs[n].end)
22018 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
22020 const char *s = reginfo->strbeg + ln;
22021 Perl_sv_catpvf(aTHX_ sv, ": ");
22022 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
22023 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
22026 } else if (k == GOSUB) {
22027 AV *name_list= NULL;
22028 if ( RXp_PAREN_NAMES(prog) ) {
22029 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
22030 } else if ( pRExC_state ) {
22031 name_list= RExC_paren_name_list;
22034 /* Paren and offset */
22035 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
22036 (int)((o + (int)ARG2L(o)) - progi->program) );
22038 SV **name= av_fetch(name_list, ARG(o), 0 );
22040 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
22043 else if (k == LOGICAL)
22044 /* 2: embedded, otherwise 1 */
22045 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
22046 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
22049 U8 do_sep = 0; /* Do we need to separate various components of the
22051 /* Set if there is still an unresolved user-defined property */
22052 SV *unresolved = NULL;
22054 /* Things that are ignored except when the runtime locale is UTF-8 */
22055 SV *only_utf8_locale_invlist = NULL;
22057 /* Code points that don't fit in the bitmap */
22058 SV *nonbitmap_invlist = NULL;
22060 /* And things that aren't in the bitmap, but are small enough to be */
22061 SV* bitmap_range_not_in_bitmap = NULL;
22070 flags = ANYOF_FLAGS(o);
22071 bitmap = ANYOF_BITMAP(o);
22074 if (op == ANYOFL || op == ANYOFPOSIXL) {
22075 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
22076 sv_catpvs(sv, "{utf8-locale-reqd}");
22078 if (flags & ANYOFL_FOLD) {
22079 sv_catpvs(sv, "{i}");
22083 inverted = flags & ANYOF_INVERT;
22085 /* If there is stuff outside the bitmap, get it */
22088 /* For a single range, split into the parts inside vs outside the
22090 UV start = ANYOFRbase(o);
22091 UV end = ANYOFRbase(o) + ANYOFRdelta(o);
22093 if (start < NUM_ANYOF_CODE_POINTS) {
22094 if (end < NUM_ANYOF_CODE_POINTS) {
22095 bitmap_range_not_in_bitmap
22096 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
22100 bitmap_range_not_in_bitmap
22101 = _add_range_to_invlist(bitmap_range_not_in_bitmap,
22102 start, NUM_ANYOF_CODE_POINTS);
22103 start = NUM_ANYOF_CODE_POINTS;
22107 if (start >= NUM_ANYOF_CODE_POINTS) {
22108 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
22110 ANYOFRbase(o) + ANYOFRdelta(o));
22113 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
22114 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
22115 NUM_ANYOF_CODE_POINTS,
22118 else if (ANYOF_HAS_AUX(o)) {
22119 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
22121 &only_utf8_locale_invlist,
22122 &nonbitmap_invlist);
22124 /* The aux data may contain stuff that could fit in the bitmap.
22125 * This could come from a user-defined property being finally
22126 * resolved when this call was done; or much more likely because
22127 * there are matches that require UTF-8 to be valid, and so aren't
22128 * in the bitmap (or ANYOFR). This is teased apart later */
22129 _invlist_intersection(nonbitmap_invlist,
22131 &bitmap_range_not_in_bitmap);
22132 /* Leave just the things that don't fit into the bitmap */
22133 _invlist_subtract(nonbitmap_invlist,
22135 &nonbitmap_invlist);
22138 /* Ready to start outputting. First, the initial left bracket */
22139 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22142 || bitmap_range_not_in_bitmap
22143 || only_utf8_locale_invlist
22146 /* Then all the things that could fit in the bitmap */
22147 do_sep = put_charclass_bitmap_innards(
22150 bitmap_range_not_in_bitmap,
22151 only_utf8_locale_invlist,
22155 /* Can't try inverting for a
22156 * better display if there
22157 * are things that haven't
22159 (unresolved != NULL || k == ANYOFR));
22160 SvREFCNT_dec(bitmap_range_not_in_bitmap);
22162 /* If there are user-defined properties which haven't been defined
22163 * yet, output them. If the result is not to be inverted, it is
22164 * clearest to output them in a separate [] from the bitmap range
22165 * stuff. If the result is to be complemented, we have to show
22166 * everything in one [], as the inversion applies to the whole
22167 * thing. Use {braces} to separate them from anything in the
22168 * bitmap and anything above the bitmap. */
22171 if (! do_sep) { /* If didn't output anything in the bitmap
22173 sv_catpvs(sv, "^");
22175 sv_catpvs(sv, "{");
22178 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
22181 sv_catsv(sv, unresolved);
22183 sv_catpvs(sv, "}");
22185 do_sep = ! inverted;
22187 else if ( do_sep == 2
22188 && ! nonbitmap_invlist
22189 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
22191 /* Here, the display shows the class as inverted, and
22192 * everything above the lower display should also match, but
22193 * there is no indication of that. Add this range so the code
22194 * below will add it to the display */
22195 _invlist_union_complement_2nd(nonbitmap_invlist,
22197 &nonbitmap_invlist);
22201 /* And, finally, add the above-the-bitmap stuff */
22202 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
22205 /* See if truncation size is overridden */
22206 const STRLEN dump_len = (PL_dump_re_max_len > 256)
22207 ? PL_dump_re_max_len
22210 /* This is output in a separate [] */
22212 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
22215 /* And, for easy of understanding, it is shown in the
22216 * uncomplemented form if possible. The one exception being if
22217 * there are unresolved items, where the inversion has to be
22218 * delayed until runtime */
22219 if (inverted && ! unresolved) {
22220 _invlist_invert(nonbitmap_invlist);
22221 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
22224 contents = invlist_contents(nonbitmap_invlist,
22225 FALSE /* output suitable for catsv */
22228 /* If the output is shorter than the permissible maximum, just do it. */
22229 if (SvCUR(contents) <= dump_len) {
22230 sv_catsv(sv, contents);
22233 const char * contents_string = SvPVX(contents);
22234 STRLEN i = dump_len;
22236 /* Otherwise, start at the permissible max and work back to the
22237 * first break possibility */
22238 while (i > 0 && contents_string[i] != ' ') {
22241 if (i == 0) { /* Fail-safe. Use the max if we couldn't
22242 find a legal break */
22246 sv_catpvn(sv, contents_string, i);
22247 sv_catpvs(sv, "...");
22250 SvREFCNT_dec_NN(contents);
22251 SvREFCNT_dec_NN(nonbitmap_invlist);
22254 /* And finally the matching, closing ']' */
22255 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22257 if (op == ANYOFHs) {
22258 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
22260 else if (REGNODE_TYPE(op) != ANYOF) {
22261 U8 lowest = (op != ANYOFHr)
22263 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
22264 U8 highest = (op == ANYOFHr)
22265 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
22266 : (op == ANYOFH || op == ANYOFR)
22270 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
22273 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
22274 if (lowest != highest) {
22275 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
22277 Perl_sv_catpvf(aTHX_ sv, ")");
22281 SvREFCNT_dec(unresolved);
22283 else if (k == ANYOFM) {
22284 SV * cp_list = get_ANYOFM_contents(o);
22286 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22287 if (op == NANYOFM) {
22288 _invlist_invert(cp_list);
22291 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
22292 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22294 SvREFCNT_dec(cp_list);
22296 else if (k == ANYOFHbbm) {
22297 SV * cp_list = get_ANYOFHbbm_contents(o);
22298 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
22300 sv_catsv(sv, invlist_contents(cp_list,
22301 FALSE /* output suitable for catsv */
22303 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
22305 SvREFCNT_dec(cp_list);
22307 else if (k == POSIXD || k == NPOSIXD) {
22308 U8 index = FLAGS(o) * 2;
22309 if (index < C_ARRAY_LENGTH(anyofs)) {
22310 if (*anyofs[index] != '[') {
22311 sv_catpvs(sv, "[");
22313 sv_catpv(sv, anyofs[index]);
22314 if (*anyofs[index] != '[') {
22315 sv_catpvs(sv, "]");
22319 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
22322 else if (k == BOUND || k == NBOUND) {
22323 /* Must be synced with order of 'bound_type' in regcomp.h */
22324 const char * const bounds[] = {
22325 "", /* Traditional */
22331 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
22332 sv_catpv(sv, bounds[FLAGS(o)]);
22334 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
22335 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
22337 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
22339 Perl_sv_catpvf(aTHX_ sv, "]");
22341 else if (op == SBOL)
22342 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
22344 /* add on the verb argument if there is one */
22345 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) {
22347 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
22348 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
22350 sv_catpvs(sv, ":NULL");
22353 PERL_UNUSED_CONTEXT;
22354 PERL_UNUSED_ARG(sv);
22355 PERL_UNUSED_ARG(o);
22356 PERL_UNUSED_ARG(prog);
22357 PERL_UNUSED_ARG(reginfo);
22358 PERL_UNUSED_ARG(pRExC_state);
22359 #endif /* DEBUGGING */
22365 Perl_re_intuit_string(pTHX_ REGEXP * const r)
22366 { /* Assume that RE_INTUIT is set */
22367 /* Returns an SV containing a string that must appear in the target for it
22368 * to match, or NULL if nothing is known that must match.
22370 * CAUTION: the SV can be freed during execution of the regex engine */
22372 struct regexp *const prog = ReANY(r);
22373 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22375 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
22376 PERL_UNUSED_CONTEXT;
22380 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
22381 const char * const s = SvPV_nolen_const(RX_UTF8(r)
22382 ? prog->check_utf8 : prog->check_substr);
22384 if (!PL_colorset) reginitcolors();
22385 Perl_re_printf( aTHX_
22386 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
22388 RX_UTF8(r) ? "utf8 " : "",
22389 PL_colors[5], PL_colors[0],
22392 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
22396 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
22397 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
22403 handles refcounting and freeing the perl core regexp structure. When
22404 it is necessary to actually free the structure the first thing it
22405 does is call the 'free' method of the regexp_engine associated to
22406 the regexp, allowing the handling of the void *pprivate; member
22407 first. (This routine is not overridable by extensions, which is why
22408 the extensions free is called first.)
22410 See regdupe and regdupe_internal if you change anything here.
22412 #ifndef PERL_IN_XSUB_RE
22414 Perl_pregfree(pTHX_ REGEXP *r)
22420 Perl_pregfree2(pTHX_ REGEXP *rx)
22422 struct regexp *const r = ReANY(rx);
22423 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22425 PERL_ARGS_ASSERT_PREGFREE2;
22430 if (r->mother_re) {
22431 ReREFCNT_dec(r->mother_re);
22433 CALLREGFREE_PVT(rx); /* free the private data */
22434 SvREFCNT_dec(RXp_PAREN_NAMES(r));
22438 for (i = 0; i < 2; i++) {
22439 SvREFCNT_dec(r->substrs->data[i].substr);
22440 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
22442 Safefree(r->substrs);
22444 RX_MATCH_COPY_FREE(rx);
22445 #ifdef PERL_ANY_COW
22446 SvREFCNT_dec(r->saved_copy);
22449 SvREFCNT_dec(r->qr_anoncv);
22450 if (r->recurse_locinput)
22451 Safefree(r->recurse_locinput);
22457 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
22458 except that dsv will be created if NULL.
22460 This function is used in two main ways. First to implement
22461 $r = qr/....; $s = $$r;
22463 Secondly, it is used as a hacky workaround to the structural issue of
22465 being stored in the regexp structure which is in turn stored in
22466 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
22467 could be PL_curpm in multiple contexts, and could require multiple
22468 result sets being associated with the pattern simultaneously, such
22469 as when doing a recursive match with (??{$qr})
22471 The solution is to make a lightweight copy of the regexp structure
22472 when a qr// is returned from the code executed by (??{$qr}) this
22473 lightweight copy doesn't actually own any of its data except for
22474 the starp/end and the actual regexp structure itself.
22480 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
22482 struct regexp *drx;
22483 struct regexp *const srx = ReANY(ssv);
22484 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
22486 PERL_ARGS_ASSERT_REG_TEMP_COPY;
22489 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
22491 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
22493 /* our only valid caller, sv_setsv_flags(), should have done
22494 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
22495 assert(!SvOOK(dsv));
22496 assert(!SvIsCOW(dsv));
22497 assert(!SvROK(dsv));
22499 if (SvPVX_const(dsv)) {
22501 Safefree(SvPVX(dsv));
22506 SvOK_off((SV *)dsv);
22509 /* For PVLVs, the head (sv_any) points to an XPVLV, while
22510 * the LV's xpvlenu_rx will point to a regexp body, which
22511 * we allocate here */
22512 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
22513 assert(!SvPVX(dsv));
22514 /* We "steal" the body from the newly allocated SV temp, changing
22515 * the pointer in its HEAD to NULL. We then change its type to
22516 * SVt_NULL so that when we immediately release its only reference,
22517 * no memory deallocation happens.
22519 * The body will eventually be freed (from the PVLV) either in
22520 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
22521 * the regexp body needs to be removed)
22522 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
22523 * the PVLV itself is deallocated). */
22524 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
22525 temp->sv_any = NULL;
22526 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
22527 SvREFCNT_dec_NN(temp);
22528 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
22529 ing below will not set it. */
22530 SvCUR_set(dsv, SvCUR(ssv));
22533 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
22534 sv_force_normal(sv) is called. */
22538 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
22539 SvPV_set(dsv, RX_WRAPPED(ssv));
22540 /* We share the same string buffer as the original regexp, on which we
22541 hold a reference count, incremented when mother_re is set below.
22542 The string pointer is copied here, being part of the regexp struct.
22544 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
22545 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
22549 const I32 npar = srx->nparens+1;
22550 Newx(drx->offs, npar, regexp_paren_pair);
22551 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
22553 if (srx->substrs) {
22555 Newx(drx->substrs, 1, struct reg_substr_data);
22556 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
22558 for (i = 0; i < 2; i++) {
22559 SvREFCNT_inc_void(drx->substrs->data[i].substr);
22560 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
22563 /* check_substr and check_utf8, if non-NULL, point to either their
22564 anchored or float namesakes, and don't hold a second reference. */
22566 RX_MATCH_COPIED_off(dsv);
22567 #ifdef PERL_ANY_COW
22568 drx->saved_copy = NULL;
22570 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
22571 SvREFCNT_inc_void(drx->qr_anoncv);
22572 if (srx->recurse_locinput)
22573 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
22580 /* regfree_internal()
22582 Free the private data in a regexp. This is overloadable by
22583 extensions. Perl takes care of the regexp structure in pregfree(),
22584 this covers the *pprivate pointer which technically perl doesn't
22585 know about, however of course we have to handle the
22586 regexp_internal structure when no extension is in use.
22588 Note this is called before freeing anything in the regexp
22593 Perl_regfree_internal(pTHX_ REGEXP * const rx)
22595 struct regexp *const r = ReANY(rx);
22596 RXi_GET_DECL(r, ri);
22597 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22599 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
22609 SV *dsv= sv_newmortal();
22610 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
22611 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
22612 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
22613 PL_colors[4], PL_colors[5], s);
22617 if (ri->code_blocks)
22618 S_free_codeblocks(aTHX_ ri->code_blocks);
22621 int n = ri->data->count;
22624 /* If you add a ->what type here, update the comment in regcomp.h */
22625 switch (ri->data->what[n]) {
22631 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
22634 Safefree(ri->data->data[n]);
22640 { /* Aho Corasick add-on structure for a trie node.
22641 Used in stclass optimization only */
22643 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
22645 refcount = --aho->refcount;
22648 PerlMemShared_free(aho->states);
22649 PerlMemShared_free(aho->fail);
22650 /* do this last!!!! */
22651 PerlMemShared_free(ri->data->data[n]);
22652 /* we should only ever get called once, so
22653 * assert as much, and also guard the free
22654 * which /might/ happen twice. At the least
22655 * it will make code anlyzers happy and it
22656 * doesn't cost much. - Yves */
22657 assert(ri->regstclass);
22658 if (ri->regstclass) {
22659 PerlMemShared_free(ri->regstclass);
22660 ri->regstclass = 0;
22667 /* trie structure. */
22669 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
22671 refcount = --trie->refcount;
22674 PerlMemShared_free(trie->charmap);
22675 PerlMemShared_free(trie->states);
22676 PerlMemShared_free(trie->trans);
22678 PerlMemShared_free(trie->bitmap);
22680 PerlMemShared_free(trie->jump);
22681 PerlMemShared_free(trie->wordinfo);
22682 /* do this last!!!! */
22683 PerlMemShared_free(ri->data->data[n]);
22688 /* NO-OP a '%' data contains a null pointer, so that add_data
22689 * always returns non-zero, this should only ever happen in the
22694 Perl_croak(aTHX_ "panic: regfree data code '%c'",
22695 ri->data->what[n]);
22698 Safefree(ri->data->what);
22699 Safefree(ri->data);
22705 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
22706 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
22707 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
22710 =for apidoc re_dup_guts
22711 Duplicate a regexp.
22713 This routine is expected to clone a given regexp structure. It is only
22714 compiled under USE_ITHREADS.
22716 After all of the core data stored in struct regexp is duplicated
22717 the C<regexp_engine.dupe> method is used to copy any private data
22718 stored in the *pprivate pointer. This allows extensions to handle
22719 any duplication they need to do.
22723 See pregfree() and regfree_internal() if you change anything here.
22725 #if defined(USE_ITHREADS)
22726 #ifndef PERL_IN_XSUB_RE
22728 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
22731 const struct regexp *r = ReANY(sstr);
22732 struct regexp *ret = ReANY(dstr);
22734 PERL_ARGS_ASSERT_RE_DUP_GUTS;
22736 npar = r->nparens+1;
22737 Newx(ret->offs, npar, regexp_paren_pair);
22738 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
22740 if (ret->substrs) {
22741 /* Do it this way to avoid reading from *r after the StructCopy().
22742 That way, if any of the sv_dup_inc()s dislodge *r from the L1
22743 cache, it doesn't matter. */
22745 const bool anchored = r->check_substr
22746 ? r->check_substr == r->substrs->data[0].substr
22747 : r->check_utf8 == r->substrs->data[0].utf8_substr;
22748 Newx(ret->substrs, 1, struct reg_substr_data);
22749 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
22751 for (i = 0; i < 2; i++) {
22752 ret->substrs->data[i].substr =
22753 sv_dup_inc(ret->substrs->data[i].substr, param);
22754 ret->substrs->data[i].utf8_substr =
22755 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
22758 /* check_substr and check_utf8, if non-NULL, point to either their
22759 anchored or float namesakes, and don't hold a second reference. */
22761 if (ret->check_substr) {
22763 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
22765 ret->check_substr = ret->substrs->data[0].substr;
22766 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
22768 assert(r->check_substr == r->substrs->data[1].substr);
22769 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
22771 ret->check_substr = ret->substrs->data[1].substr;
22772 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
22774 } else if (ret->check_utf8) {
22776 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
22778 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
22783 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
22784 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
22785 if (r->recurse_locinput)
22786 Newx(ret->recurse_locinput, r->nparens + 1, char *);
22789 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
22791 if (RX_MATCH_COPIED(dstr))
22792 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
22794 ret->subbeg = NULL;
22795 #ifdef PERL_ANY_COW
22796 ret->saved_copy = NULL;
22799 /* Whether mother_re be set or no, we need to copy the string. We
22800 cannot refrain from copying it when the storage points directly to
22801 our mother regexp, because that's
22802 1: a buffer in a different thread
22803 2: something we no longer hold a reference on
22804 so we need to copy it locally. */
22805 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
22806 /* set malloced length to a non-zero value so it will be freed
22807 * (otherwise in combination with SVf_FAKE it looks like an alien
22808 * buffer). It doesn't have to be the actual malloced size, since it
22809 * should never be grown */
22810 SvLEN_set(dstr, SvCUR(sstr)+1);
22811 ret->mother_re = NULL;
22813 #endif /* PERL_IN_XSUB_RE */
22818 This is the internal complement to regdupe() which is used to copy
22819 the structure pointed to by the *pprivate pointer in the regexp.
22820 This is the core version of the extension overridable cloning hook.
22821 The regexp structure being duplicated will be copied by perl prior
22822 to this and will be provided as the regexp *r argument, however
22823 with the /old/ structures pprivate pointer value. Thus this routine
22824 may override any copying normally done by perl.
22826 It returns a pointer to the new regexp_internal structure.
22830 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
22832 struct regexp *const r = ReANY(rx);
22833 regexp_internal *reti;
22835 RXi_GET_DECL(r, ri);
22837 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
22841 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22842 char, regexp_internal);
22843 Copy(ri->program, reti->program, len+1, regnode);
22846 if (ri->code_blocks) {
22848 Newx(reti->code_blocks, 1, struct reg_code_blocks);
22849 Newx(reti->code_blocks->cb, ri->code_blocks->count,
22850 struct reg_code_block);
22851 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22852 ri->code_blocks->count, struct reg_code_block);
22853 for (n = 0; n < ri->code_blocks->count; n++)
22854 reti->code_blocks->cb[n].src_regex = (REGEXP*)
22855 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22856 reti->code_blocks->count = ri->code_blocks->count;
22857 reti->code_blocks->refcnt = 1;
22860 reti->code_blocks = NULL;
22862 reti->regstclass = NULL;
22865 struct reg_data *d;
22866 const int count = ri->data->count;
22869 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22870 char, struct reg_data);
22871 Newx(d->what, count, U8);
22874 for (i = 0; i < count; i++) {
22875 d->what[i] = ri->data->what[i];
22876 switch (d->what[i]) {
22877 /* see also regcomp.h and regfree_internal() */
22878 case 'a': /* actually an AV, but the dup function is identical.
22879 values seem to be "plain sv's" generally. */
22880 case 'r': /* a compiled regex (but still just another SV) */
22881 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22882 this use case should go away, the code could have used
22883 'a' instead - see S_set_ANYOF_arg() for array contents. */
22884 case 'S': /* actually an SV, but the dup function is identical. */
22885 case 'u': /* actually an HV, but the dup function is identical.
22886 values are "plain sv's" */
22887 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22890 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22891 * patterns which could start with several different things. Pre-TRIE
22892 * this was more important than it is now, however this still helps
22893 * in some places, for instance /x?a+/ might produce a SSC equivalent
22894 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22897 /* This is cheating. */
22898 Newx(d->data[i], 1, regnode_ssc);
22899 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22900 reti->regstclass = (regnode*)d->data[i];
22903 /* AHO-CORASICK fail table */
22904 /* Trie stclasses are readonly and can thus be shared
22905 * without duplication. We free the stclass in pregfree
22906 * when the corresponding reg_ac_data struct is freed.
22908 reti->regstclass= ri->regstclass;
22911 /* TRIE transition table */
22913 ((reg_trie_data*)ri->data->data[i])->refcount++;
22916 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22917 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22918 is not from another regexp */
22919 d->data[i] = ri->data->data[i];
22922 /* this is a placeholder type, it exists purely so that
22923 * add_data always returns a non-zero value, this type of
22924 * entry should ONLY be present in the 0 slot of the array */
22926 d->data[i]= ri->data->data[i];
22929 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22930 ri->data->what[i]);
22939 reti->name_list_idx = ri->name_list_idx;
22941 SetProgLen(reti, len);
22943 return (void*)reti;
22946 #endif /* USE_ITHREADS */
22949 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22952 STRLEN len = strlen(pat);
22955 const char *message;
22957 PERL_ARGS_ASSERT_RE_CROAK;
22961 Copy(pat, buf, len , char);
22963 buf[len + 1] = '\0';
22964 va_start(args, pat);
22965 msv = vmess(buf, &args);
22967 message = SvPV_const(msv, len);
22970 Copy(message, buf, len , char);
22971 /* len-1 to avoid \n */
22972 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22975 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22977 #ifndef PERL_IN_XSUB_RE
22979 Perl_save_re_context(pTHX)
22984 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22987 const REGEXP * const rx = PM_GETRE(PL_curpm);
22989 nparens = RX_NPARENS(rx);
22992 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22993 * that PL_curpm will be null, but that utf8.pm and the modules it
22994 * loads will only use $1..$3.
22995 * The t/porting/re_context.t test file checks this assumption.
23000 for (i = 1; i <= nparens; i++) {
23001 char digits[TYPE_CHARS(long)];
23002 const STRLEN len = my_snprintf(digits, sizeof(digits),
23004 GV *const *const gvp
23005 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
23008 GV * const gv = *gvp;
23009 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
23019 S_put_code_point(pTHX_ SV *sv, UV c)
23021 PERL_ARGS_ASSERT_PUT_CODE_POINT;
23024 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
23026 else if (isPRINT(c)) {
23027 const char string = (char) c;
23029 /* We use {phrase} as metanotation in the class, so also escape literal
23031 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
23032 sv_catpvs(sv, "\\");
23033 sv_catpvn(sv, &string, 1);
23035 else if (isMNEMONIC_CNTRL(c)) {
23036 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
23039 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
23044 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
23046 /* Appends to 'sv' a displayable version of the range of code points from
23047 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
23048 * that have them, when they occur at the beginning or end of the range.
23049 * It uses hex to output the remaining code points, unless 'allow_literals'
23050 * is true, in which case the printable ASCII ones are output as-is (though
23051 * some of these will be escaped by put_code_point()).
23053 * NOTE: This is designed only for printing ranges of code points that fit
23054 * inside an ANYOF bitmap. Higher code points are simply suppressed
23057 const unsigned int min_range_count = 3;
23059 assert(start <= end);
23061 PERL_ARGS_ASSERT_PUT_RANGE;
23063 while (start <= end) {
23065 const char * format;
23067 if ( end - start < min_range_count
23068 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
23070 /* Output a range of 1 or 2 chars individually, or longer ranges
23071 * when printable */
23072 for (; start <= end; start++) {
23073 put_code_point(sv, start);
23078 /* If permitted by the input options, and there is a possibility that
23079 * this range contains a printable literal, look to see if there is
23081 if (allow_literals && start <= MAX_PRINT_A) {
23083 /* If the character at the beginning of the range isn't an ASCII
23084 * printable, effectively split the range into two parts:
23085 * 1) the portion before the first such printable,
23087 * and output them separately. */
23088 if (! isPRINT_A(start)) {
23089 UV temp_end = start + 1;
23091 /* There is no point looking beyond the final possible
23092 * printable, in MAX_PRINT_A */
23093 UV max = MIN(end, MAX_PRINT_A);
23095 while (temp_end <= max && ! isPRINT_A(temp_end)) {
23099 /* Here, temp_end points to one beyond the first printable if
23100 * found, or to one beyond 'max' if not. If none found, make
23101 * sure that we use the entire range */
23102 if (temp_end > MAX_PRINT_A) {
23103 temp_end = end + 1;
23106 /* Output the first part of the split range: the part that
23107 * doesn't have printables, with the parameter set to not look
23108 * for literals (otherwise we would infinitely recurse) */
23109 put_range(sv, start, temp_end - 1, FALSE);
23111 /* The 2nd part of the range (if any) starts here. */
23114 /* We do a continue, instead of dropping down, because even if
23115 * the 2nd part is non-empty, it could be so short that we want
23116 * to output it as individual characters, as tested for at the
23117 * top of this loop. */
23121 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
23122 * output a sub-range of just the digits or letters, then process
23123 * the remaining portion as usual. */
23124 if (isALPHANUMERIC_A(start)) {
23125 UV mask = (isDIGIT_A(start))
23130 UV temp_end = start + 1;
23132 /* Find the end of the sub-range that includes just the
23133 * characters in the same class as the first character in it */
23134 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
23139 /* For short ranges, don't duplicate the code above to output
23140 * them; just call recursively */
23141 if (temp_end - start < min_range_count) {
23142 put_range(sv, start, temp_end, FALSE);
23144 else { /* Output as a range */
23145 put_code_point(sv, start);
23146 sv_catpvs(sv, "-");
23147 put_code_point(sv, temp_end);
23149 start = temp_end + 1;
23153 /* We output any other printables as individual characters */
23154 if (isPUNCT_A(start) || isSPACE_A(start)) {
23155 while (start <= end && (isPUNCT_A(start)
23156 || isSPACE_A(start)))
23158 put_code_point(sv, start);
23163 } /* End of looking for literals */
23165 /* Here is not to output as a literal. Some control characters have
23166 * mnemonic names. Split off any of those at the beginning and end of
23167 * the range to print mnemonically. It isn't possible for many of
23168 * these to be in a row, so this won't overwhelm with output */
23170 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
23172 while (isMNEMONIC_CNTRL(start) && start <= end) {
23173 put_code_point(sv, start);
23177 /* If this didn't take care of the whole range ... */
23178 if (start <= end) {
23180 /* Look backwards from the end to find the final non-mnemonic
23183 while (isMNEMONIC_CNTRL(temp_end)) {
23187 /* And separately output the interior range that doesn't start
23188 * or end with mnemonics */
23189 put_range(sv, start, temp_end, FALSE);
23191 /* Then output the mnemonic trailing controls */
23192 start = temp_end + 1;
23193 while (start <= end) {
23194 put_code_point(sv, start);
23201 /* As a final resort, output the range or subrange as hex. */
23203 if (start >= NUM_ANYOF_CODE_POINTS) {
23206 else { /* Have to split range at the bitmap boundary */
23207 this_end = (end < NUM_ANYOF_CODE_POINTS)
23209 : NUM_ANYOF_CODE_POINTS - 1;
23211 #if NUM_ANYOF_CODE_POINTS > 256
23212 format = (this_end < 256)
23213 ? "\\x%02" UVXf "-\\x%02" UVXf
23214 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
23216 format = "\\x%02" UVXf "-\\x%02" UVXf;
23218 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
23219 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
23220 GCC_DIAG_RESTORE_STMT;
23226 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
23228 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
23232 bool allow_literals = TRUE;
23234 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
23236 /* Generally, it is more readable if printable characters are output as
23237 * literals, but if a range (nearly) spans all of them, it's best to output
23238 * it as a single range. This code will use a single range if all but 2
23239 * ASCII printables are in it */
23240 invlist_iterinit(invlist);
23241 while (invlist_iternext(invlist, &start, &end)) {
23243 /* If the range starts beyond the final printable, it doesn't have any
23245 if (start > MAX_PRINT_A) {
23249 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
23250 * all but two, the range must start and end no later than 2 from
23252 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
23253 if (end > MAX_PRINT_A) {
23259 if (end - start >= MAX_PRINT_A - ' ' - 2) {
23260 allow_literals = FALSE;
23265 invlist_iterfinish(invlist);
23267 /* Here we have figured things out. Output each range */
23268 invlist_iterinit(invlist);
23269 while (invlist_iternext(invlist, &start, &end)) {
23270 if (start >= NUM_ANYOF_CODE_POINTS) {
23273 put_range(sv, start, end, allow_literals);
23275 invlist_iterfinish(invlist);
23281 S_put_charclass_bitmap_innards_common(pTHX_
23282 SV* invlist, /* The bitmap */
23283 SV* posixes, /* Under /l, things like [:word:], \S */
23284 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
23285 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
23286 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
23287 const bool invert /* Is the result to be inverted? */
23290 /* Create and return an SV containing a displayable version of the bitmap
23291 * and associated information determined by the input parameters. If the
23292 * output would have been only the inversion indicator '^', NULL is instead
23297 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
23300 output = newSVpvs("^");
23303 output = newSVpvs("");
23306 /* First, the code points in the bitmap that are unconditionally there */
23307 put_charclass_bitmap_innards_invlist(output, invlist);
23309 /* Traditionally, these have been placed after the main code points */
23311 sv_catsv(output, posixes);
23314 if (only_utf8 && _invlist_len(only_utf8)) {
23315 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
23316 put_charclass_bitmap_innards_invlist(output, only_utf8);
23319 if (not_utf8 && _invlist_len(not_utf8)) {
23320 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
23321 put_charclass_bitmap_innards_invlist(output, not_utf8);
23324 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
23325 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
23326 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
23328 /* This is the only list in this routine that can legally contain code
23329 * points outside the bitmap range. The call just above to
23330 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
23331 * output them here. There's about a half-dozen possible, and none in
23332 * contiguous ranges longer than 2 */
23333 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
23335 SV* above_bitmap = NULL;
23337 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
23339 invlist_iterinit(above_bitmap);
23340 while (invlist_iternext(above_bitmap, &start, &end)) {
23343 for (i = start; i <= end; i++) {
23344 put_code_point(output, i);
23347 invlist_iterfinish(above_bitmap);
23348 SvREFCNT_dec_NN(above_bitmap);
23352 if (invert && SvCUR(output) == 1) {
23360 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
23362 SV *nonbitmap_invlist,
23363 SV *only_utf8_locale_invlist,
23364 const regnode * const node,
23366 const bool force_as_is_display)
23368 /* Appends to 'sv' a displayable version of the innards of the bracketed
23369 * character class defined by the other arguments:
23370 * 'bitmap' points to the bitmap, or NULL if to ignore that.
23371 * 'nonbitmap_invlist' is an inversion list of the code points that are in
23372 * the bitmap range, but for some reason aren't in the bitmap; NULL if
23373 * none. The reasons for this could be that they require some
23374 * condition such as the target string being or not being in UTF-8
23375 * (under /d), or because they came from a user-defined property that
23376 * was not resolved at the time of the regex compilation (under /u)
23377 * 'only_utf8_locale_invlist' is an inversion list of the code points that
23378 * are valid only if the runtime locale is a UTF-8 one; NULL if none
23379 * 'node' is the regex pattern ANYOF node. It is needed only when the
23380 * above two parameters are not null, and is passed so that this
23381 * routine can tease apart the various reasons for them.
23382 * 'flags' is the flags field of 'node'
23383 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
23384 * to invert things to see if that leads to a cleaner display. If
23385 * FALSE, this routine is free to use its judgment about doing this.
23387 * It returns 0 if nothing was actually output. (It may be that
23388 * the bitmap, etc is empty.)
23389 * 1 if the output wasn't inverted (didn't begin with a '^')
23390 * 2 if the output was inverted (did begin with a '^')
23392 * When called for outputting the bitmap of a non-ANYOF node, just pass the
23393 * bitmap, with the succeeding parameters set to NULL, and the final one to
23397 /* In general, it tries to display the 'cleanest' representation of the
23398 * innards, choosing whether to display them inverted or not, regardless of
23399 * whether the class itself is to be inverted. However, there are some
23400 * cases where it can't try inverting, as what actually matches isn't known
23401 * until runtime, and hence the inversion isn't either. */
23403 bool inverting_allowed = ! force_as_is_display;
23406 STRLEN orig_sv_cur = SvCUR(sv);
23408 SV* invlist; /* Inversion list we accumulate of code points that
23409 are unconditionally matched */
23410 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
23412 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
23414 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
23415 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
23418 SV* as_is_display; /* The output string when we take the inputs
23420 SV* inverted_display; /* The output string when we invert the inputs */
23422 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
23424 /* We are biased in favor of displaying things without them being inverted,
23425 * as that is generally easier to understand */
23426 const int bias = 5;
23428 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
23430 /* Start off with whatever code points are passed in. (We clone, so we
23431 * don't change the caller's list) */
23432 if (nonbitmap_invlist) {
23433 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
23434 invlist = invlist_clone(nonbitmap_invlist, NULL);
23436 else { /* Worst case size is every other code point is matched */
23437 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
23441 if (OP(node) == ANYOFD) {
23443 /* This flag indicates that the code points below 0x100 in the
23444 * nonbitmap list are precisely the ones that match only when the
23445 * target is UTF-8 (they should all be non-ASCII). */
23446 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
23447 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
23448 _invlist_subtract(invlist, only_utf8, &invlist);
23451 /* And this flag for matching all non-ASCII 0xFF and below */
23452 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
23453 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
23456 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
23458 /* If either of these flags are set, what matches isn't
23459 * determinable except during execution, so don't know enough here
23461 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
23462 inverting_allowed = FALSE;
23465 /* What the posix classes match also varies at runtime, so these
23466 * will be output symbolically. */
23467 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
23470 posixes = newSVpvs("");
23471 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
23472 if (ANYOF_POSIXL_TEST(node, i)) {
23473 sv_catpv(posixes, anyofs[i]);
23480 /* Accumulate the bit map into the unconditional match list */
23482 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
23483 if (BITMAP_TEST(bitmap, i)) {
23486 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
23489 invlist = _add_range_to_invlist(invlist, start, i-1);
23494 /* Make sure that the conditional match lists don't have anything in them
23495 * that match unconditionally; otherwise the output is quite confusing.
23496 * This could happen if the code that populates these misses some
23499 _invlist_subtract(only_utf8, invlist, &only_utf8);
23502 _invlist_subtract(not_utf8, invlist, ¬_utf8);
23505 if (only_utf8_locale_invlist) {
23507 /* Since this list is passed in, we have to make a copy before
23509 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
23511 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
23513 /* And, it can get really weird for us to try outputting an inverted
23514 * form of this list when it has things above the bitmap, so don't even
23516 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
23517 inverting_allowed = FALSE;
23521 /* Calculate what the output would be if we take the input as-is */
23522 as_is_display = put_charclass_bitmap_innards_common(invlist,
23529 /* If have to take the output as-is, just do that */
23530 if (! inverting_allowed) {
23531 if (as_is_display) {
23532 sv_catsv(sv, as_is_display);
23533 SvREFCNT_dec_NN(as_is_display);
23536 else { /* But otherwise, create the output again on the inverted input, and
23537 use whichever version is shorter */
23539 int inverted_bias, as_is_bias;
23541 /* We will apply our bias to whichever of the results doesn't have
23545 trial_invert = FALSE;
23550 trial_invert = TRUE;
23552 inverted_bias = bias;
23555 /* Now invert each of the lists that contribute to the output,
23556 * excluding from the result things outside the possible range */
23558 /* For the unconditional inversion list, we have to add in all the
23559 * conditional code points, so that when inverted, they will be gone
23561 _invlist_union(only_utf8, invlist, &invlist);
23562 _invlist_union(not_utf8, invlist, &invlist);
23563 _invlist_union(only_utf8_locale, invlist, &invlist);
23564 _invlist_invert(invlist);
23565 _invlist_intersection(invlist, PL_InBitmap, &invlist);
23568 _invlist_invert(only_utf8);
23569 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
23571 else if (not_utf8) {
23573 /* If a code point matches iff the target string is not in UTF-8,
23574 * then complementing the result has it not match iff not in UTF-8,
23575 * which is the same thing as matching iff it is UTF-8. */
23576 only_utf8 = not_utf8;
23580 if (only_utf8_locale) {
23581 _invlist_invert(only_utf8_locale);
23582 _invlist_intersection(only_utf8_locale,
23584 &only_utf8_locale);
23587 inverted_display = put_charclass_bitmap_innards_common(
23592 only_utf8_locale, trial_invert);
23594 /* Use the shortest representation, taking into account our bias
23595 * against showing it inverted */
23596 if ( inverted_display
23597 && ( ! as_is_display
23598 || ( SvCUR(inverted_display) + inverted_bias
23599 < SvCUR(as_is_display) + as_is_bias)))
23601 sv_catsv(sv, inverted_display);
23604 else if (as_is_display) {
23605 sv_catsv(sv, as_is_display);
23608 SvREFCNT_dec(as_is_display);
23609 SvREFCNT_dec(inverted_display);
23612 SvREFCNT_dec_NN(invlist);
23613 SvREFCNT_dec(only_utf8);
23614 SvREFCNT_dec(not_utf8);
23615 SvREFCNT_dec(posixes);
23616 SvREFCNT_dec(only_utf8_locale);
23618 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
23619 if (did_output_something) {
23620 /* Distinguish between non and inverted cases */
23621 did_output_something += invert;
23624 return did_output_something;
23627 #define CLEAR_OPTSTART \
23628 if (optstart) STMT_START { \
23629 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
23630 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
23634 #define DUMPUNTIL(b,e) \
23636 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
23638 STATIC const regnode *
23639 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
23640 const regnode *last, const regnode *plast,
23641 SV* sv, I32 indent, U32 depth)
23643 const regnode *next;
23644 const regnode *optstart= NULL;
23646 RXi_GET_DECL(r, ri);
23647 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23649 PERL_ARGS_ASSERT_DUMPUNTIL;
23651 #ifdef DEBUG_DUMPUNTIL
23652 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
23653 last ? last-start : 0, plast ? plast-start : 0);
23656 if (plast && plast < last)
23659 while (node && (!last || node < last)) {
23660 const U8 op = OP(node);
23662 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
23664 next = regnext((regnode *)node);
23665 const regnode *after = regnode_after((regnode *)node,0);
23668 if (op == OPTIMIZED) {
23669 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
23676 regprop(r, sv, node, NULL, NULL);
23677 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
23678 (int)(2*indent + 1), "", SvPVX_const(sv));
23680 if (op != OPTIMIZED) {
23681 if (next == NULL) /* Next ptr. */
23682 Perl_re_printf( aTHX_ " (0)");
23683 else if (REGNODE_TYPE(op) == BRANCH
23684 && REGNODE_TYPE(OP(next)) != BRANCH )
23685 Perl_re_printf( aTHX_ " (FAIL)");
23687 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
23688 Perl_re_printf( aTHX_ "\n");
23692 if (REGNODE_TYPE(op) == BRANCHJ) {
23694 const regnode *nnode = (OP(next) == LONGJMP
23695 ? regnext((regnode *)next)
23697 if (last && nnode > last)
23699 DUMPUNTIL(after, nnode);
23701 else if (REGNODE_TYPE(op) == BRANCH) {
23703 DUMPUNTIL(after, next);
23705 else if ( REGNODE_TYPE(op) == TRIE ) {
23706 const regnode *this_trie = node;
23707 const U32 n = ARG(node);
23708 const reg_ac_data * const ac = op>=AHOCORASICK ?
23709 (reg_ac_data *)ri->data->data[n] :
23711 const reg_trie_data * const trie =
23712 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
23714 AV *const trie_words
23715 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
23717 const regnode *nextbranch= NULL;
23720 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
23721 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
23723 Perl_re_indentf( aTHX_ "%s ",
23726 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
23727 SvCUR(*elem_ptr), PL_dump_re_max_len,
23728 PL_colors[0], PL_colors[1],
23730 ? PERL_PV_ESCAPE_UNI
23732 | PERL_PV_PRETTY_ELLIPSES
23733 | PERL_PV_PRETTY_LTGT
23738 U16 dist= trie->jump[word_idx+1];
23739 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
23740 (UV)((dist ? this_trie + dist : next) - start));
23743 nextbranch= this_trie + trie->jump[0];
23744 DUMPUNTIL(this_trie + dist, nextbranch);
23746 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
23747 nextbranch= regnext((regnode *)nextbranch);
23749 Perl_re_printf( aTHX_ "\n");
23752 if (last && next > last)
23757 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
23758 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
23760 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
23762 DUMPUNTIL(after, next);
23764 else if ( op == PLUS || op == STAR) {
23765 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
23767 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
23768 /* Literal string, where present. */
23769 node = (const regnode *)REGNODE_AFTER_varies(node);
23772 node = REGNODE_AFTER_opcode(node,op);
23774 if (op == CURLYX || op == OPEN || op == SROPEN)
23776 if (REGNODE_TYPE(op) == END)
23780 #ifdef DEBUG_DUMPUNTIL
23781 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
23786 #endif /* DEBUGGING */
23788 #ifndef PERL_IN_XSUB_RE
23790 # include "uni_keywords.h"
23793 Perl_init_uniprops(pTHX)
23797 char * dump_len_string;
23799 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
23800 if ( ! dump_len_string
23801 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
23803 PL_dump_re_max_len = 60; /* A reasonable default */
23807 PL_user_def_props = newHV();
23809 # ifdef USE_ITHREADS
23811 HvSHAREKEYS_off(PL_user_def_props);
23812 PL_user_def_props_aTHX = aTHX;
23816 /* Set up the inversion list interpreter-level variables */
23818 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23819 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23820 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23821 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23822 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23823 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23824 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23825 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23826 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23827 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23828 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23829 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23830 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23831 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23832 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23833 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23835 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23836 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23837 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23838 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23839 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
23840 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23841 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23842 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23843 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23844 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23845 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23846 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23847 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23848 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
23849 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23850 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23852 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23853 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23854 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23855 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23856 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23858 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23859 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23860 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23861 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23863 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23865 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23866 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23868 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23869 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23871 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23872 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23873 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23874 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23875 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23876 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23877 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23878 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23879 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23880 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23881 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23882 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23883 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23884 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23887 /* The below are used only by deprecated functions. They could be removed */
23888 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23889 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23890 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23894 /* These four functions are compiled only in regcomp.c, where they have access
23895 * to the data they return. They are a way for re_comp.c to get access to that
23896 * data without having to compile the whole data structures. */
23899 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23901 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23903 return match_uniprop((U8 *) key, key_len);
23907 Perl_get_prop_definition(pTHX_ const int table_index)
23909 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23911 /* Create and return the inversion list */
23912 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23915 const char * const *
23916 Perl_get_prop_values(const int table_index)
23918 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23920 return UNI_prop_value_ptrs[table_index];
23924 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23926 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23928 return deprecated_property_msgs[warning_offset];
23933 This code was mainly added for backcompat to give a warning for non-portable
23934 code points in user-defined properties. But experiments showed that the
23935 warning in earlier perls were only omitted on overflow, which should be an
23936 error, so there really isnt a backcompat issue, and actually adding the
23937 warning when none was present before might cause breakage, for little gain. So
23938 khw left this code in, but not enabled. Tests were never added.
23941 Ei |const char *|get_extended_utf8_msg|const UV cp
23943 PERL_STATIC_INLINE const char *
23944 S_get_extended_utf8_msg(pTHX_ const UV cp)
23946 U8 dummy[UTF8_MAXBYTES + 1];
23950 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23953 msg = hv_fetchs(msgs, "text", 0);
23956 (void) sv_2mortal((SV *) msgs);
23958 return SvPVX(*msg);
23962 #endif /* end of ! PERL_IN_XSUB_RE */
23965 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23966 const bool ignore_case)
23968 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23969 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23970 * because nothing outside of ASCII will match. Use /m because the input
23971 * string may be a bunch of lines strung together.
23973 * Also sets up the debugging info */
23975 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23977 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
23978 REGEXP * subpattern_re;
23979 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23981 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23986 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23988 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23989 rx_flags = flags & RXf_PMf_COMPILETIME;
23991 #ifndef PERL_IN_XSUB_RE
23992 /* Use the core engine if this file is regcomp.c. That means no
23993 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23994 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23995 &PL_core_reg_engine,
23999 if (isDEBUG_WILDCARD) {
24000 /* Use the special debugging engine if this file is re_comp.c and wants
24001 * to output the wildcard matching. This uses whatever
24002 * 'use re "Debug ..." is in effect */
24003 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
24009 /* Use the special wildcard engine if this file is re_comp.c and
24010 * doesn't want to output the wildcard matching. This uses whatever
24011 * 'use re "Debug ..." is in effect for compilation, but this engine
24012 * structure has been set up so that it uses the core engine for
24013 * execution, so no execution debugging as a result of re.pm will be
24015 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
24019 /* XXX The above has the effect that any user-supplied regex engine
24020 * won't be called for matching wildcards. That might be good, or bad.
24021 * It could be changed in several ways. The reason it is done the
24022 * current way is to avoid having to save and restore
24023 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
24024 * could be used. Another suggestion is to keep the authoritative
24025 * value of the debug flags in a thread-local variable and add set/get
24026 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
24027 * Still another is to pass a flag, say in the engine's intflags that
24028 * would be checked each time before doing the debug output */
24032 assert(subpattern_re); /* Should have died if didn't compile successfully */
24033 return subpattern_re;
24037 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
24038 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
24041 DECLARE_AND_GET_RE_DEBUG_FLAGS;
24043 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
24047 /* The compilation has set things up so that if the program doesn't want to
24048 * see the wildcard matching procedure, it will get the core execution
24049 * engine, which is subject only to -Dr. So we have to turn that off
24050 * around this procedure */
24051 if (! isDEBUG_WILDCARD) {
24052 /* Note! Casts away 'volatile' */
24054 PL_debug &= ~ DEBUG_r_FLAG;
24057 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
24065 S_handle_user_defined_property(pTHX_
24067 /* Parses the contents of a user-defined property definition; returning the
24068 * expanded definition if possible. If so, the return is an inversion
24071 * If there are subroutines that are part of the expansion and which aren't
24072 * known at the time of the call to this function, this returns what
24073 * parse_uniprop_string() returned for the first one encountered.
24075 * If an error was found, NULL is returned, and 'msg' gets a suitable
24076 * message appended to it. (Appending allows the back trace of how we got
24077 * to the faulty definition to be displayed through nested calls of
24078 * user-defined subs.)
24080 * The caller IS responsible for freeing any returned SV.
24082 * The syntax of the contents is pretty much described in perlunicode.pod,
24083 * but we also allow comments on each line */
24085 const char * name, /* Name of property */
24086 const STRLEN name_len, /* The name's length in bytes */
24087 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24088 const bool to_fold, /* ? Is this under /i */
24089 const bool runtime, /* ? Are we in compile- or run-time */
24090 const bool deferrable, /* Is it ok for this property's full definition
24091 to be deferred until later? */
24092 SV* contents, /* The property's definition */
24093 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
24094 getting called unless this is thought to be
24095 a user-defined property */
24096 SV * msg, /* Any error or warning msg(s) are appended to
24098 const STRLEN level) /* Recursion level of this call */
24101 const char * string = SvPV_const(contents, len);
24102 const char * const e = string + len;
24103 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
24104 const STRLEN msgs_length_on_entry = SvCUR(msg);
24106 const char * s0 = string; /* Points to first byte in the current line
24107 being parsed in 'string' */
24108 const char overflow_msg[] = "Code point too large in \"";
24109 SV* running_definition = NULL;
24111 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
24113 *user_defined_ptr = TRUE;
24115 /* Look at each line */
24117 const char * s; /* Current byte */
24118 char op = '+'; /* Default operation is 'union' */
24119 IV min = 0; /* range begin code point */
24120 IV max = -1; /* and range end */
24121 SV* this_definition;
24123 /* Skip comment lines */
24125 s0 = strchr(s0, '\n');
24133 /* For backcompat, allow an empty first line */
24139 /* First character in the line may optionally be the operation */
24148 /* If the line is one or two hex digits separated by blank space, its
24149 * a range; otherwise it is either another user-defined property or an
24154 if (! isXDIGIT(*s)) {
24155 goto check_if_property;
24158 do { /* Each new hex digit will add 4 bits. */
24159 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
24160 s = strchr(s, '\n');
24164 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24165 sv_catpv(msg, overflow_msg);
24166 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24167 UTF8fARG(is_contents_utf8, s - s0, s0));
24168 sv_catpvs(msg, "\"");
24169 goto return_failure;
24172 /* Accumulate this digit into the value */
24173 min = (min << 4) + READ_XDIGIT(s);
24174 } while (isXDIGIT(*s));
24176 while (isBLANK(*s)) { s++; }
24178 /* We allow comments at the end of the line */
24180 s = strchr(s, '\n');
24186 else if (s < e && *s != '\n') {
24187 if (! isXDIGIT(*s)) {
24188 goto check_if_property;
24191 /* Look for the high point of the range */
24194 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
24195 s = strchr(s, '\n');
24199 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24200 sv_catpv(msg, overflow_msg);
24201 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24202 UTF8fARG(is_contents_utf8, s - s0, s0));
24203 sv_catpvs(msg, "\"");
24204 goto return_failure;
24207 max = (max << 4) + READ_XDIGIT(s);
24208 } while (isXDIGIT(*s));
24210 while (isBLANK(*s)) { s++; }
24213 s = strchr(s, '\n');
24218 else if (s < e && *s != '\n') {
24219 goto check_if_property;
24223 if (max == -1) { /* The line only had one entry */
24226 else if (max < min) {
24227 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24228 sv_catpvs(msg, "Illegal range in \"");
24229 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24230 UTF8fARG(is_contents_utf8, s - s0, s0));
24231 sv_catpvs(msg, "\"");
24232 goto return_failure;
24235 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
24237 if ( UNICODE_IS_PERL_EXTENDED(min)
24238 || UNICODE_IS_PERL_EXTENDED(max))
24240 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24242 /* If both code points are non-portable, warn only on the lower
24244 sv_catpv(msg, get_extended_utf8_msg(
24245 (UNICODE_IS_PERL_EXTENDED(min))
24247 sv_catpvs(msg, " in \"");
24248 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
24249 UTF8fARG(is_contents_utf8, s - s0, s0));
24250 sv_catpvs(msg, "\"");
24255 /* Here, this line contains a legal range */
24256 this_definition = sv_2mortal(_new_invlist(2));
24257 this_definition = _add_range_to_invlist(this_definition, min, max);
24262 /* Here it isn't a legal range line. See if it is a legal property
24263 * line. First find the end of the meat of the line */
24264 s = strpbrk(s, "#\n");
24269 /* Ignore trailing blanks in keeping with the requirements of
24270 * parse_uniprop_string() */
24272 while (s > s0 && isBLANK_A(*s)) {
24277 this_definition = parse_uniprop_string(s0, s - s0,
24278 is_utf8, to_fold, runtime,
24281 user_defined_ptr, msg,
24283 ? level /* Don't increase level
24284 if input is empty */
24287 if (this_definition == NULL) {
24288 goto return_failure; /* 'msg' should have had the reason
24289 appended to it by the above call */
24292 if (! is_invlist(this_definition)) { /* Unknown at this time */
24293 return newSVsv(this_definition);
24297 s = strchr(s, '\n');
24307 _invlist_union(running_definition, this_definition,
24308 &running_definition);
24311 _invlist_subtract(running_definition, this_definition,
24312 &running_definition);
24315 _invlist_intersection(running_definition, this_definition,
24316 &running_definition);
24319 _invlist_union_complement_2nd(running_definition,
24320 this_definition, &running_definition);
24323 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
24324 __FILE__, __LINE__, op);
24328 /* Position past the '\n' */
24330 } /* End of loop through the lines of 'contents' */
24332 /* Here, we processed all the lines in 'contents' without error. If we
24333 * didn't add any warnings, simply return success */
24334 if (msgs_length_on_entry == SvCUR(msg)) {
24336 /* If the expansion was empty, the answer isn't nothing: its an empty
24337 * inversion list */
24338 if (running_definition == NULL) {
24339 running_definition = _new_invlist(1);
24342 return running_definition;
24345 /* Otherwise, add some explanatory text, but we will return success */
24349 running_definition = NULL;
24353 if (name_len > 0) {
24354 sv_catpvs(msg, " in expansion of ");
24355 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24358 return running_definition;
24361 /* As explained below, certain operations need to take place in the first
24362 * thread created. These macros switch contexts */
24363 # ifdef USE_ITHREADS
24364 # define DECLARATION_FOR_GLOBAL_CONTEXT \
24365 PerlInterpreter * save_aTHX = aTHX;
24366 # define SWITCH_TO_GLOBAL_CONTEXT \
24367 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
24368 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
24369 # define CUR_CONTEXT aTHX
24370 # define ORIGINAL_CONTEXT save_aTHX
24372 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
24373 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
24374 # define RESTORE_CONTEXT NOOP
24375 # define CUR_CONTEXT NULL
24376 # define ORIGINAL_CONTEXT NULL
24380 S_delete_recursion_entry(pTHX_ void *key)
24382 /* Deletes the entry used to detect recursion when expanding user-defined
24383 * properties. This is a function so it can be set up to be called even if
24384 * the program unexpectedly quits */
24386 SV ** current_entry;
24387 const STRLEN key_len = strlen((const char *) key);
24388 DECLARATION_FOR_GLOBAL_CONTEXT;
24390 SWITCH_TO_GLOBAL_CONTEXT;
24392 /* If the entry is one of these types, it is a permanent entry, and not the
24393 * one used to detect recursions. This function should delete only the
24394 * recursion entry */
24395 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
24397 && ! is_invlist(*current_entry)
24398 && ! SvPOK(*current_entry))
24400 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
24408 S_get_fq_name(pTHX_
24409 const char * const name, /* The first non-blank in the \p{}, \P{} */
24410 const Size_t name_len, /* Its length in bytes, not including any trailing space */
24411 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24412 const bool has_colon_colon
24415 /* Returns a mortal SV containing the fully qualified version of the input
24420 fq_name = newSVpvs_flags("", SVs_TEMP);
24422 /* Use the current package if it wasn't included in our input */
24423 if (! has_colon_colon) {
24424 const HV * pkg = (IN_PERL_COMPILETIME)
24426 : CopSTASH(PL_curcop);
24427 const char* pkgname = HvNAME(pkg);
24429 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
24430 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
24431 sv_catpvs(fq_name, "::");
24434 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
24435 UTF8fARG(is_utf8, name_len, name));
24440 S_parse_uniprop_string(pTHX_
24442 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
24443 * now. If so, the return is an inversion list.
24445 * If the property is user-defined, it is a subroutine, which in turn
24446 * may call other subroutines. This function will call the whole nest of
24447 * them to get the definition they return; if some aren't known at the time
24448 * of the call to this function, the fully qualified name of the highest
24449 * level sub is returned. It is an error to call this function at runtime
24450 * without every sub defined.
24452 * If an error was found, NULL is returned, and 'msg' gets a suitable
24453 * message appended to it. (Appending allows the back trace of how we got
24454 * to the faulty definition to be displayed through nested calls of
24455 * user-defined subs.)
24457 * The caller should NOT try to free any returned inversion list.
24459 * Other parameters will be set on return as described below */
24461 const char * const name, /* The first non-blank in the \p{}, \P{} */
24462 Size_t name_len, /* Its length in bytes, not including any
24464 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
24465 const bool to_fold, /* ? Is this under /i */
24466 const bool runtime, /* TRUE if this is being called at run time */
24467 const bool deferrable, /* TRUE if it's ok for the definition to not be
24468 known at this call */
24469 AV ** strings, /* To return string property values, like named
24471 bool *user_defined_ptr, /* Upon return from this function it will be
24472 set to TRUE if any component is a
24473 user-defined property */
24474 SV * msg, /* Any error or warning msg(s) are appended to
24476 const STRLEN level) /* Recursion level of this call */
24478 char* lookup_name; /* normalized name for lookup in our tables */
24479 unsigned lookup_len; /* Its length */
24480 enum { Not_Strict = 0, /* Some properties have stricter name */
24481 Strict, /* normalization rules, which we decide */
24482 As_Is /* upon based on parsing */
24483 } stricter = Not_Strict;
24485 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
24486 * (though it requires extra effort to download them from Unicode and
24487 * compile perl to know about them) */
24488 bool is_nv_type = FALSE;
24490 unsigned int i, j = 0;
24491 int equals_pos = -1; /* Where the '=' is found, or negative if none */
24492 int slash_pos = -1; /* Where the '/' is found, or negative if none */
24493 int table_index = 0; /* The entry number for this property in the table
24494 of all Unicode property names */
24495 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
24496 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
24497 the normalized name in certain situations */
24498 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
24499 part of a package name */
24500 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
24501 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
24502 property rather than a Unicode
24504 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
24505 if an error. If it is an inversion list,
24506 it is the definition. Otherwise it is a
24507 string containing the fully qualified sub
24509 SV * fq_name = NULL; /* For user-defined properties, the fully
24511 bool invert_return = FALSE; /* ? Do we need to complement the result before
24513 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
24514 explicit utf8:: package that we strip
24516 /* The expansion of properties that could be either user-defined or
24517 * official unicode ones is deferred until runtime, including a marker for
24518 * those that might be in the latter category. This boolean indicates if
24519 * we've seen that marker. If not, what we're parsing can't be such an
24520 * official Unicode property whose expansion was deferred */
24521 bool could_be_deferred_official = FALSE;
24523 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
24525 /* The input will be normalized into 'lookup_name' */
24526 Newx(lookup_name, name_len, char);
24527 SAVEFREEPV(lookup_name);
24529 /* Parse the input. */
24530 for (i = 0; i < name_len; i++) {
24531 char cur = name[i];
24533 /* Most of the characters in the input will be of this ilk, being parts
24535 if (isIDCONT_A(cur)) {
24537 /* Case differences are ignored. Our lookup routine assumes
24538 * everything is lowercase, so normalize to that */
24539 if (isUPPER_A(cur)) {
24540 lookup_name[j++] = toLOWER_A(cur);
24544 if (cur == '_') { /* Don't include these in the normalized name */
24548 lookup_name[j++] = cur;
24550 /* The first character in a user-defined name must be of this type.
24552 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
24553 could_be_user_defined = FALSE;
24559 /* Here, the character is not something typically in a name, But these
24560 * two types of characters (and the '_' above) can be freely ignored in
24561 * most situations. Later it may turn out we shouldn't have ignored
24562 * them, and we have to reparse, but we don't have enough information
24563 * yet to make that decision */
24564 if (cur == '-' || isSPACE_A(cur)) {
24565 could_be_user_defined = FALSE;
24569 /* An equals sign or single colon mark the end of the first part of
24570 * the property name */
24572 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
24574 lookup_name[j++] = '='; /* Treat the colon as an '=' */
24575 equals_pos = j; /* Note where it occurred in the input */
24576 could_be_user_defined = FALSE;
24580 /* If this looks like it is a marker we inserted at compile time,
24581 * set a flag and otherwise ignore it. If it isn't in the final
24582 * position, keep it as it would have been user input. */
24583 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
24585 && could_be_user_defined
24586 && i == name_len - 1)
24589 could_be_deferred_official = TRUE;
24593 /* Otherwise, this character is part of the name. */
24594 lookup_name[j++] = cur;
24596 /* Here it isn't a single colon, so if it is a colon, it must be a
24600 /* A double colon should be a package qualifier. We note its
24601 * position and continue. Note that one could have
24602 * pkg1::pkg2::...::foo
24603 * so that the position at the end of the loop will be just after
24604 * the final qualifier */
24607 non_pkg_begin = i + 1;
24608 lookup_name[j++] = ':';
24609 lun_non_pkg_begin = j;
24611 else { /* Only word chars (and '::') can be in a user-defined name */
24612 could_be_user_defined = FALSE;
24614 } /* End of parsing through the lhs of the property name (or all of it if
24617 /* If there is a single package name 'utf8::', it is ambiguous. It could
24618 * be for a user-defined property, or it could be a Unicode property, as
24619 * all of them are considered to be for that package. For the purposes of
24620 * parsing the rest of the property, strip it off */
24621 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
24622 lookup_name += STRLENs("utf8::");
24623 j -= STRLENs("utf8::");
24624 equals_pos -= STRLENs("utf8::");
24625 stripped_utf8_pkg = TRUE;
24628 /* Here, we are either done with the whole property name, if it was simple;
24629 * or are positioned just after the '=' if it is compound. */
24631 if (equals_pos >= 0) {
24632 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
24634 /* Space immediately after the '=' is ignored */
24636 for (; i < name_len; i++) {
24637 if (! isSPACE_A(name[i])) {
24642 /* Most punctuation after the equals indicates a subpattern, like
24644 if ( isPUNCT_A(name[i])
24649 /* A backslash means the real delimitter is the next character,
24650 * but it must be punctuation */
24651 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
24653 bool special_property = memEQs(lookup_name, j - 1, "name")
24654 || memEQs(lookup_name, j - 1, "na");
24655 if (! special_property) {
24656 /* Find the property. The table includes the equals sign, so
24657 * we use 'j' as-is */
24658 table_index = do_uniprop_match(lookup_name, j);
24660 if (special_property || table_index) {
24661 REGEXP * subpattern_re;
24662 char open = name[i++];
24664 const char * pos_in_brackets;
24665 const char * const * prop_values;
24668 /* Backslash => delimitter is the character following. We
24669 * already checked that it is punctuation */
24670 if (open == '\\') {
24675 /* This data structure is constructed so that the matching
24676 * closing bracket is 3 past its matching opening. The second
24677 * set of closing is so that if the opening is something like
24678 * ']', the closing will be that as well. Something similar is
24679 * done in toke.c */
24680 pos_in_brackets = memCHRs("([<)]>)]>", open);
24681 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
24684 || name[name_len-1] != close
24685 || (escaped && name[name_len-2] != '\\')
24686 /* Also make sure that there are enough characters.
24687 * e.g., '\\\' would show up incorrectly as legal even
24688 * though it is too short */
24689 || (SSize_t) (name_len - i - 1 - escaped) < 0)
24691 sv_catpvs(msg, "Unicode property wildcard not terminated");
24692 goto append_name_to_msg;
24695 Perl_ck_warner_d(aTHX_
24696 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
24697 "The Unicode property wildcards feature is experimental");
24699 if (special_property) {
24700 const char * error_msg;
24701 const char * revised_name = name + i;
24702 Size_t revised_name_len = name_len - (i + 1 + escaped);
24704 /* Currently, the only 'special_property' is name, which we
24705 * lookup in _charnames.pm */
24707 if (! load_charnames(newSVpvs("placeholder"),
24708 revised_name, revised_name_len,
24711 sv_catpv(msg, error_msg);
24712 goto append_name_to_msg;
24715 /* Farm this out to a function just to make the current
24716 * function less unwieldy */
24717 if (handle_names_wildcard(revised_name, revised_name_len,
24721 return prop_definition;
24727 prop_values = get_prop_values(table_index);
24729 /* Now create and compile the wildcard subpattern. Use /i
24730 * because the property values are supposed to match with case
24732 subpattern_re = compile_wildcard(name + i,
24733 name_len - i - 1 - escaped,
24737 /* For each legal property value, see if the supplied pattern
24739 while (*prop_values) {
24740 const char * const entry = *prop_values;
24741 const Size_t len = strlen(entry);
24742 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
24744 if (execute_wildcard(subpattern_re,
24746 (char *) entry + len,
24750 { /* Here, matched. Add to the returned list */
24751 Size_t total_len = j + len;
24752 SV * sub_invlist = NULL;
24753 char * this_string;
24755 /* We know this is a legal \p{property=value}. Call
24756 * the function to return the list of code points that
24758 Newxz(this_string, total_len + 1, char);
24759 Copy(lookup_name, this_string, j, char);
24760 my_strlcat(this_string, entry, total_len + 1);
24761 SAVEFREEPV(this_string);
24762 sub_invlist = parse_uniprop_string(this_string,
24772 _invlist_union(prop_definition, sub_invlist,
24776 prop_values++; /* Next iteration, look at next propvalue */
24777 } /* End of looking through property values; (the data
24778 structure is terminated by a NULL ptr) */
24780 SvREFCNT_dec_NN(subpattern_re);
24782 if (prop_definition) {
24783 return prop_definition;
24786 sv_catpvs(msg, "No Unicode property value wildcard matches:");
24787 goto append_name_to_msg;
24790 /* Here's how khw thinks we should proceed to handle the properties
24791 * not yet done: Bidi Mirroring Glyph can map to ""
24792 Bidi Paired Bracket can map to ""
24793 Case Folding (both full and simple)
24794 Shouldn't /i be good enough for Full
24795 Decomposition Mapping
24796 Equivalent Unified Ideograph can map to ""
24797 Lowercase Mapping (both full and simple)
24798 NFKC Case Fold can map to ""
24799 Titlecase Mapping (both full and simple)
24800 Uppercase Mapping (both full and simple)
24801 * Handle these the same way Name is done, using say, _wild.pm, but
24802 * having both loose and full, like in charclass_invlists.h.
24803 * Perhaps move block and script to that as they are somewhat large
24804 * in charclass_invlists.h.
24805 * For properties where the default is the code point itself, such
24806 * as any of the case changing mappings, the string would otherwise
24807 * consist of all Unicode code points in UTF-8 strung together.
24808 * This would be impractical. So instead, examine their compiled
24809 * pattern, looking at the ssc. If none, reject the pattern as an
24810 * error. Otherwise run the pattern against every code point in
24811 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
24812 * And it might be good to create an API to return the ssc.
24813 * Or handle them like the algorithmic names are done
24815 } /* End of is a wildcard subppattern */
24817 /* \p{name=...} is handled specially. Instead of using the normal
24818 * mechanism involving charclass_invlists.h, it uses _charnames.pm
24819 * which has the necessary (huge) data accessible to it, and which
24820 * doesn't get loaded unless necessary. The legal syntax for names is
24821 * somewhat different than other properties due both to the vagaries of
24822 * a few outlier official names, and the fact that only a few ASCII
24823 * characters are permitted in them */
24824 if ( memEQs(lookup_name, j - 1, "name")
24825 || memEQs(lookup_name, j - 1, "na"))
24830 const char * error_msg;
24832 SV * character_name;
24833 STRLEN character_len;
24838 /* Since the RHS (after skipping initial space) is passed unchanged
24839 * to charnames, and there are different criteria for what are
24840 * legal characters in the name, just parse it here. A character
24841 * name must begin with an ASCII alphabetic */
24842 if (! isALPHA(name[i])) {
24845 lookup_name[j++] = name[i];
24847 for (++i; i < name_len; i++) {
24848 /* Official names can only be in the ASCII range, and only
24849 * certain characters */
24850 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24853 lookup_name[j++] = name[i];
24856 /* Finished parsing, save the name into an SV */
24857 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24859 /* Make sure _charnames is loaded. (The parameters give context
24860 * for any errors generated */
24861 table = load_charnames(character_name, name, name_len, &error_msg);
24862 if (table == NULL) {
24863 sv_catpv(msg, error_msg);
24864 goto append_name_to_msg;
24867 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24868 if (! lookup_loose) {
24870 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24873 PUSHSTACKi(PERLSI_REGCOMP);
24879 XPUSHs(character_name);
24881 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24886 SvREFCNT_inc_simple_void_NN(character);
24893 if (! SvOK(character)) {
24897 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24898 if (character_len == SvCUR(character)) {
24899 prop_definition = add_cp_to_invlist(NULL, cp);
24904 /* First of the remaining characters in the string. */
24905 char * remaining = SvPVX(character) + character_len;
24907 if (strings == NULL) {
24908 goto failed; /* XXX Perhaps a specific msg instead, like
24909 'not available here' */
24912 if (*strings == NULL) {
24913 *strings = newAV();
24916 this_string = newAV();
24917 av_push(this_string, newSVuv(cp));
24920 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24921 av_push(this_string, newSVuv(cp));
24922 remaining += character_len;
24923 } while (remaining < SvEND(character));
24925 av_push(*strings, (SV *) this_string);
24928 return prop_definition;
24931 /* Certain properties whose values are numeric need special handling.
24932 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24933 * purposes of checking if this is one of those properties */
24934 if (memBEGINPs(lookup_name, j, "is")) {
24938 /* Then check if it is one of these specially-handled properties. The
24939 * possibilities are hard-coded because easier this way, and the list
24940 * is unlikely to change.
24942 * All numeric value type properties are of this ilk, and are also
24943 * special in a different way later on. So find those first. There
24944 * are several numeric value type properties in the Unihan DB (which is
24945 * unlikely to be compiled with perl, but we handle it here in case it
24946 * does get compiled). They all end with 'numeric'. The interiors
24947 * aren't checked for the precise property. This would stop working if
24948 * a cjk property were to be created that ended with 'numeric' and
24949 * wasn't a numeric type */
24950 is_nv_type = memEQs(lookup_name + lookup_offset,
24951 j - 1 - lookup_offset, "numericvalue")
24952 || memEQs(lookup_name + lookup_offset,
24953 j - 1 - lookup_offset, "nv")
24954 || ( memENDPs(lookup_name + lookup_offset,
24955 j - 1 - lookup_offset, "numeric")
24956 && ( memBEGINPs(lookup_name + lookup_offset,
24957 j - 1 - lookup_offset, "cjk")
24958 || memBEGINPs(lookup_name + lookup_offset,
24959 j - 1 - lookup_offset, "k")));
24961 || memEQs(lookup_name + lookup_offset,
24962 j - 1 - lookup_offset, "canonicalcombiningclass")
24963 || memEQs(lookup_name + lookup_offset,
24964 j - 1 - lookup_offset, "ccc")
24965 || memEQs(lookup_name + lookup_offset,
24966 j - 1 - lookup_offset, "age")
24967 || memEQs(lookup_name + lookup_offset,
24968 j - 1 - lookup_offset, "in")
24969 || memEQs(lookup_name + lookup_offset,
24970 j - 1 - lookup_offset, "presentin"))
24974 /* Since the stuff after the '=' is a number, we can't throw away
24975 * '-' willy-nilly, as those could be a minus sign. Other stricter
24976 * rules also apply. However, these properties all can have the
24977 * rhs not be a number, in which case they contain at least one
24978 * alphabetic. In those cases, the stricter rules don't apply.
24979 * But the numeric type properties can have the alphas [Ee] to
24980 * signify an exponent, and it is still a number with stricter
24981 * rules. So look for an alpha that signifies not-strict */
24983 for (k = i; k < name_len; k++) {
24984 if ( isALPHA_A(name[k])
24985 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24987 stricter = Not_Strict;
24995 /* A number may have a leading '+' or '-'. The latter is retained
24997 if (name[i] == '+') {
25000 else if (name[i] == '-') {
25001 lookup_name[j++] = '-';
25005 /* Skip leading zeros including single underscores separating the
25006 * zeros, or between the final leading zero and the first other
25008 for (; i < name_len - 1; i++) {
25009 if ( name[i] != '0'
25010 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
25016 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
25017 * underling libc implementation. */
25018 if ( i == name_len - 1
25019 && name[name_len-1] == '0'
25020 && lookup_name[j-1] == '-')
25026 else { /* No '=' */
25028 /* Only a few properties without an '=' should be parsed with stricter
25029 * rules. The list is unlikely to change. */
25030 if ( memBEGINPs(lookup_name, j, "perl")
25031 && memNEs(lookup_name + 4, j - 4, "space")
25032 && memNEs(lookup_name + 4, j - 4, "word"))
25036 /* We set the inputs back to 0 and the code below will reparse,
25042 /* Here, we have either finished the property, or are positioned to parse
25043 * the remainder, and we know if stricter rules apply. Finish out, if not
25045 for (; i < name_len; i++) {
25046 char cur = name[i];
25048 /* In all instances, case differences are ignored, and we normalize to
25050 if (isUPPER_A(cur)) {
25051 lookup_name[j++] = toLOWER(cur);
25055 /* An underscore is skipped, but not under strict rules unless it
25056 * separates two digits */
25059 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
25060 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
25062 lookup_name[j++] = '_';
25067 /* Hyphens are skipped except under strict */
25068 if (cur == '-' && ! stricter) {
25072 /* XXX Bug in documentation. It says white space skipped adjacent to
25073 * non-word char. Maybe we should, but shouldn't skip it next to a dot
25075 if (isSPACE_A(cur) && ! stricter) {
25079 lookup_name[j++] = cur;
25081 /* Unless this is a non-trailing slash, we are done with it */
25082 if (i >= name_len - 1 || cur != '/') {
25088 /* A slash in the 'numeric value' property indicates that what follows
25089 * is a denominator. It can have a leading '+' and '0's that should be
25090 * skipped. But we have never allowed a negative denominator, so treat
25091 * a minus like every other character. (No need to rule out a second
25092 * '/', as that won't match anything anyway */
25095 if (i < name_len && name[i] == '+') {
25099 /* Skip leading zeros including underscores separating digits */
25100 for (; i < name_len - 1; i++) {
25101 if ( name[i] != '0'
25102 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
25108 /* Store the first real character in the denominator */
25109 if (i < name_len) {
25110 lookup_name[j++] = name[i];
25115 /* Here are completely done parsing the input 'name', and 'lookup_name'
25116 * contains a copy, normalized.
25118 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
25119 * different from without the underscores. */
25120 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
25121 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
25122 && UNLIKELY(name[name_len-1] == '_'))
25124 lookup_name[j++] = '&';
25127 /* If the original input began with 'In' or 'Is', it could be a subroutine
25128 * call to a user-defined property instead of a Unicode property name. */
25129 if ( name_len - non_pkg_begin > 2
25130 && name[non_pkg_begin+0] == 'I'
25131 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
25133 /* Names that start with In have different characterstics than those
25134 * that start with Is */
25135 if (name[non_pkg_begin+1] == 's') {
25136 starts_with_Is = TRUE;
25140 could_be_user_defined = FALSE;
25143 if (could_be_user_defined) {
25146 /* If the user defined property returns the empty string, it could
25147 * easily be because the pattern is being compiled before the data it
25148 * actually needs to compile is available. This could be argued to be
25149 * a bug in the perl code, but this is a change of behavior for Perl,
25150 * so we handle it. This means that intentionally returning nothing
25151 * will not be resolved until runtime */
25152 bool empty_return = FALSE;
25154 /* Here, the name could be for a user defined property, which are
25155 * implemented as subs. */
25156 user_sub = get_cvn_flags(name, name_len, 0);
25159 /* Here, the property name could be a user-defined one, but there
25160 * is no subroutine to handle it (as of now). Defer handling it
25161 * until runtime. Otherwise, a block defined by Unicode in a later
25162 * release would get the synonym InFoo added for it, and existing
25163 * code that used that name would suddenly break if it referred to
25164 * the property before the sub was declared. See [perl #134146] */
25166 goto definition_deferred;
25169 /* Here, we are at runtime, and didn't find the user property. It
25170 * could be an official property, but only if no package was
25171 * specified, or just the utf8:: package. */
25172 if (could_be_deferred_official) {
25173 lookup_name += lun_non_pkg_begin;
25174 j -= lun_non_pkg_begin;
25176 else if (! stripped_utf8_pkg) {
25177 goto unknown_user_defined;
25180 /* Drop down to look up in the official properties */
25183 const char insecure[] = "Insecure user-defined property";
25185 /* Here, there is a sub by the correct name. Normally we call it
25186 * to get the property definition */
25188 SV * user_sub_sv = MUTABLE_SV(user_sub);
25189 SV * error; /* Any error returned by calling 'user_sub' */
25190 SV * key; /* The key into the hash of user defined sub names
25193 SV ** saved_user_prop_ptr; /* Hash entry for this property */
25195 /* How many times to retry when another thread is in the middle of
25196 * expanding the same definition we want */
25197 PERL_INT_FAST8_T retry_countdown = 10;
25199 DECLARATION_FOR_GLOBAL_CONTEXT;
25201 /* If we get here, we know this property is user-defined */
25202 *user_defined_ptr = TRUE;
25204 /* We refuse to call a potentially tainted subroutine; returning an
25207 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25208 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
25209 goto append_name_to_msg;
25212 /* In principal, we only call each subroutine property definition
25213 * once during the life of the program. This guarantees that the
25214 * property definition never changes. The results of the single
25215 * sub call are stored in a hash, which is used instead for future
25216 * references to this property. The property definition is thus
25217 * immutable. But, to allow the user to have a /i-dependent
25218 * definition, we call the sub once for non-/i, and once for /i,
25219 * should the need arise, passing the /i status as a parameter.
25221 * We start by constructing the hash key name, consisting of the
25222 * fully qualified subroutine name, preceded by the /i status, so
25223 * that there is a key for /i and a different key for non-/i */
25224 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
25225 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25226 non_pkg_begin != 0);
25227 sv_catsv(key, fq_name);
25229 /* We only call the sub once throughout the life of the program
25230 * (with the /i, non-/i exception noted above). That means the
25231 * hash must be global and accessible to all threads. It is
25232 * created at program start-up, before any threads are created, so
25233 * is accessible to all children. But this creates some
25236 * 1) The keys can't be shared, or else problems arise; sharing is
25237 * turned off at hash creation time
25238 * 2) All SVs in it are there for the remainder of the life of the
25239 * program, and must be created in the same interpreter context
25240 * as the hash, or else they will be freed from the wrong pool
25241 * at global destruction time. This is handled by switching to
25242 * the hash's context to create each SV going into it, and then
25243 * immediately switching back
25244 * 3) All accesses to the hash must be controlled by a mutex, to
25245 * prevent two threads from getting an unstable state should
25246 * they simultaneously be accessing it. The code below is
25247 * crafted so that the mutex is locked whenever there is an
25248 * access and unlocked only when the next stable state is
25251 * The hash stores either the definition of the property if it was
25252 * valid, or, if invalid, the error message that was raised. We
25253 * use the type of SV to distinguish.
25255 * There's also the need to guard against the definition expansion
25256 * from infinitely recursing. This is handled by storing the aTHX
25257 * of the expanding thread during the expansion. Again the SV type
25258 * is used to distinguish this from the other two cases. If we
25259 * come to here and the hash entry for this property is our aTHX,
25260 * it means we have recursed, and the code assumes that we would
25261 * infinitely recurse, so instead stops and raises an error.
25262 * (Any recursion has always been treated as infinite recursion in
25265 * If instead, the entry is for a different aTHX, it means that
25266 * that thread has gotten here first, and hasn't finished expanding
25267 * the definition yet. We just have to wait until it is done. We
25268 * sleep and retry a few times, returning an error if the other
25269 * thread doesn't complete. */
25272 USER_PROP_MUTEX_LOCK;
25274 /* If we have an entry for this key, the subroutine has already
25275 * been called once with this /i status. */
25276 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
25277 SvPVX(key), SvCUR(key), 0);
25278 if (saved_user_prop_ptr) {
25280 /* If the saved result is an inversion list, it is the valid
25281 * definition of this property */
25282 if (is_invlist(*saved_user_prop_ptr)) {
25283 prop_definition = *saved_user_prop_ptr;
25285 /* The SV in the hash won't be removed until global
25286 * destruction, so it is stable and we can unlock */
25287 USER_PROP_MUTEX_UNLOCK;
25289 /* The caller shouldn't try to free this SV */
25290 return prop_definition;
25293 /* Otherwise, if it is a string, it is the error message
25294 * that was returned when we first tried to evaluate this
25295 * property. Fail, and append the message */
25296 if (SvPOK(*saved_user_prop_ptr)) {
25297 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25298 sv_catsv(msg, *saved_user_prop_ptr);
25300 /* The SV in the hash won't be removed until global
25301 * destruction, so it is stable and we can unlock */
25302 USER_PROP_MUTEX_UNLOCK;
25307 assert(SvIOK(*saved_user_prop_ptr));
25309 /* Here, we have an unstable entry in the hash. Either another
25310 * thread is in the middle of expanding the property's
25311 * definition, or we are ourselves recursing. We use the aTHX
25312 * in it to distinguish */
25313 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
25315 /* Here, it's another thread doing the expanding. We've
25316 * looked as much as we are going to at the contents of the
25317 * hash entry. It's safe to unlock. */
25318 USER_PROP_MUTEX_UNLOCK;
25320 /* Retry a few times */
25321 if (retry_countdown-- > 0) {
25326 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25327 sv_catpvs(msg, "Timeout waiting for another thread to "
25329 goto append_name_to_msg;
25332 /* Here, we are recursing; don't dig any deeper */
25333 USER_PROP_MUTEX_UNLOCK;
25335 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25337 "Infinite recursion in user-defined property");
25338 goto append_name_to_msg;
25341 /* Here, this thread has exclusive control, and there is no entry
25342 * for this property in the hash. So we have the go ahead to
25343 * expand the definition ourselves. */
25345 PUSHSTACKi(PERLSI_REGCOMP);
25348 /* Create a temporary placeholder in the hash to detect recursion
25350 SWITCH_TO_GLOBAL_CONTEXT;
25351 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
25352 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
25355 /* Now that we have a placeholder, we can let other threads
25357 USER_PROP_MUTEX_UNLOCK;
25359 /* Make sure the placeholder always gets destroyed */
25360 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
25365 /* Call the user's function, with the /i status as a parameter.
25366 * Note that we have gone to a lot of trouble to keep this call
25367 * from being within the locked mutex region. */
25368 XPUSHs(boolSV(to_fold));
25371 /* The following block was taken from swash_init(). Presumably
25372 * they apply to here as well, though we no longer use a swash --
25376 /* We might get here via a subroutine signature which uses a utf8
25377 * parameter name, at which point PL_subname will have been set
25378 * but not yet used. */
25379 save_item(PL_subname);
25381 /* G_SCALAR guarantees a single return value */
25382 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
25387 if (TAINT_get || SvTRUE(error)) {
25388 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25389 if (SvTRUE(error)) {
25390 sv_catpvs(msg, "Error \"");
25391 sv_catsv(msg, error);
25392 sv_catpvs(msg, "\"");
25395 if (SvTRUE(error)) sv_catpvs(msg, "; ");
25396 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
25399 if (name_len > 0) {
25400 sv_catpvs(msg, " in expansion of ");
25401 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
25407 prop_definition = NULL;
25410 SV * contents = POPs;
25412 /* The contents is supposed to be the expansion of the property
25413 * definition. If the definition is deferrable, and we got an
25414 * empty string back, set a flag to later defer it (after clean
25417 && (! SvPOK(contents) || SvCUR(contents) == 0))
25419 empty_return = TRUE;
25421 else { /* Otherwise, call a function to check for valid syntax,
25424 prop_definition = handle_user_defined_property(
25426 is_utf8, to_fold, runtime,
25428 contents, user_defined_ptr,
25434 /* Here, we have the results of the expansion. Delete the
25435 * placeholder, and if the definition is now known, replace it with
25436 * that definition. We need exclusive access to the hash, and we
25437 * can't let anyone else in, between when we delete the placeholder
25438 * and add the permanent entry */
25439 USER_PROP_MUTEX_LOCK;
25441 S_delete_recursion_entry(aTHX_ SvPVX(key));
25443 if ( ! empty_return
25444 && (! prop_definition || is_invlist(prop_definition)))
25446 /* If we got success we use the inversion list defining the
25447 * property; otherwise use the error message */
25448 SWITCH_TO_GLOBAL_CONTEXT;
25449 (void) hv_store_ent(PL_user_def_props,
25452 ? newSVsv(prop_definition)
25458 /* All done, and the hash now has a permanent entry for this
25459 * property. Give up exclusive control */
25460 USER_PROP_MUTEX_UNLOCK;
25466 if (empty_return) {
25467 goto definition_deferred;
25470 if (prop_definition) {
25472 /* If the definition is for something not known at this time,
25473 * we toss it, and go return the main property name, as that's
25474 * the one the user will be aware of */
25475 if (! is_invlist(prop_definition)) {
25476 SvREFCNT_dec_NN(prop_definition);
25477 goto definition_deferred;
25480 sv_2mortal(prop_definition);
25484 return prop_definition;
25486 } /* End of calling the subroutine for the user-defined property */
25487 } /* End of it could be a user-defined property */
25489 /* Here it wasn't a user-defined property that is known at this time. See
25490 * if it is a Unicode property */
25492 lookup_len = j; /* This is a more mnemonic name than 'j' */
25494 /* Get the index into our pointer table of the inversion list corresponding
25495 * to the property */
25496 table_index = do_uniprop_match(lookup_name, lookup_len);
25498 /* If it didn't find the property ... */
25499 if (table_index == 0) {
25501 /* Try again stripping off any initial 'Is'. This is because we
25502 * promise that an initial Is is optional. The same isn't true of
25503 * names that start with 'In'. Those can match only blocks, and the
25504 * lookup table already has those accounted for. The lookup table also
25505 * has already accounted for Perl extensions (without and = sign)
25506 * starting with 'i's'. */
25507 if (starts_with_Is && equals_pos >= 0) {
25513 table_index = do_uniprop_match(lookup_name, lookup_len);
25516 if (table_index == 0) {
25519 /* Here, we didn't find it. If not a numeric type property, and
25520 * can't be a user-defined one, it isn't a legal property */
25521 if (! is_nv_type) {
25522 if (! could_be_user_defined) {
25526 /* Here, the property name is legal as a user-defined one. At
25527 * compile time, it might just be that the subroutine for that
25528 * property hasn't been encountered yet, but at runtime, it's
25529 * an error to try to use an undefined one */
25530 if (! deferrable) {
25531 goto unknown_user_defined;;
25534 goto definition_deferred;
25535 } /* End of isn't a numeric type property */
25537 /* The numeric type properties need more work to decide. What we
25538 * do is make sure we have the number in canonical form and look
25541 if (slash_pos < 0) { /* No slash */
25543 /* When it isn't a rational, take the input, convert it to a
25544 * NV, then create a canonical string representation of that
25548 SSize_t value_len = lookup_len - equals_pos;
25550 /* Get the value */
25551 if ( value_len <= 0
25552 || my_atof3(lookup_name + equals_pos, &value,
25554 != lookup_name + lookup_len)
25559 /* If the value is an integer, the canonical value is integral
25561 if (Perl_ceil(value) == value) {
25562 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
25563 equals_pos, lookup_name, value);
25565 else { /* Otherwise, it is %e with a known precision */
25568 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
25569 equals_pos, lookup_name,
25570 PL_E_FORMAT_PRECISION, value);
25572 /* The exponent generated is expecting two digits, whereas
25573 * %e on some systems will generate three. Remove leading
25574 * zeros in excess of 2 from the exponent. We start
25575 * looking for them after the '=' */
25576 exp_ptr = strchr(canonical + equals_pos, 'e');
25578 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
25579 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
25581 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
25583 if (excess_exponent_len > 0) {
25584 SSize_t leading_zeros = strspn(cur_ptr, "0");
25585 SSize_t excess_leading_zeros
25586 = MIN(leading_zeros, excess_exponent_len);
25587 if (excess_leading_zeros > 0) {
25588 Move(cur_ptr + excess_leading_zeros,
25590 strlen(cur_ptr) - excess_leading_zeros
25591 + 1, /* Copy the NUL as well */
25598 else { /* Has a slash. Create a rational in canonical form */
25599 UV numerator, denominator, gcd, trial;
25600 const char * end_ptr;
25601 const char * sign = "";
25603 /* We can't just find the numerator, denominator, and do the
25604 * division, then use the method above, because that is
25605 * inexact. And the input could be a rational that is within
25606 * epsilon (given our precision) of a valid rational, and would
25607 * then incorrectly compare valid.
25609 * We're only interested in the part after the '=' */
25610 const char * this_lookup_name = lookup_name + equals_pos;
25611 lookup_len -= equals_pos;
25612 slash_pos -= equals_pos;
25614 /* Handle any leading minus */
25615 if (this_lookup_name[0] == '-') {
25617 this_lookup_name++;
25622 /* Convert the numerator to numeric */
25623 end_ptr = this_lookup_name + slash_pos;
25624 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
25628 /* It better have included all characters before the slash */
25629 if (*end_ptr != '/') {
25633 /* Set to look at just the denominator */
25634 this_lookup_name += slash_pos;
25635 lookup_len -= slash_pos;
25636 end_ptr = this_lookup_name + lookup_len;
25638 /* Convert the denominator to numeric */
25639 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
25643 /* It better be the rest of the characters, and don't divide by
25645 if ( end_ptr != this_lookup_name + lookup_len
25646 || denominator == 0)
25651 /* Get the greatest common denominator using
25652 http://en.wikipedia.org/wiki/Euclidean_algorithm */
25654 trial = denominator;
25655 while (trial != 0) {
25657 trial = gcd % trial;
25661 /* If already in lowest possible terms, we have already tried
25662 * looking this up */
25667 /* Reduce the rational, which should put it in canonical form
25670 denominator /= gcd;
25672 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
25673 equals_pos, lookup_name, sign, numerator, denominator);
25676 /* Here, we have the number in canonical form. Try that */
25677 table_index = do_uniprop_match(canonical, strlen(canonical));
25678 if (table_index == 0) {
25681 } /* End of still didn't find the property in our table */
25682 } /* End of didn't find the property in our table */
25684 /* Here, we have a non-zero return, which is an index into a table of ptrs.
25685 * A negative return signifies that the real index is the absolute value,
25686 * but the result needs to be inverted */
25687 if (table_index < 0) {
25688 invert_return = TRUE;
25689 table_index = -table_index;
25692 /* Out-of band indices indicate a deprecated property. The proper index is
25693 * modulo it with the table size. And dividing by the table size yields
25694 * an offset into a table constructed by regen/mk_invlists.pl to contain
25695 * the corresponding warning message */
25696 if (table_index > MAX_UNI_KEYWORD_INDEX) {
25697 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
25698 table_index %= MAX_UNI_KEYWORD_INDEX;
25699 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
25700 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
25701 (int) name_len, name,
25702 get_deprecated_property_msg(warning_offset));
25705 /* In a few properties, a different property is used under /i. These are
25706 * unlikely to change, so are hard-coded here. */
25708 if ( table_index == UNI_XPOSIXUPPER
25709 || table_index == UNI_XPOSIXLOWER
25710 || table_index == UNI_TITLE)
25712 table_index = UNI_CASED;
25714 else if ( table_index == UNI_UPPERCASELETTER
25715 || table_index == UNI_LOWERCASELETTER
25716 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
25717 || table_index == UNI_TITLECASELETTER
25720 table_index = UNI_CASEDLETTER;
25722 else if ( table_index == UNI_POSIXUPPER
25723 || table_index == UNI_POSIXLOWER)
25725 table_index = UNI_POSIXALPHA;
25729 /* Create and return the inversion list */
25730 prop_definition = get_prop_definition(table_index);
25731 sv_2mortal(prop_definition);
25733 /* See if there is a private use override to add to this definition */
25735 COPHH * hinthash = (IN_PERL_COMPILETIME)
25736 ? CopHINTHASH_get(&PL_compiling)
25737 : CopHINTHASH_get(PL_curcop);
25738 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
25740 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
25742 /* See if there is an element in the hints hash for this table */
25743 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
25744 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
25748 SV * pu_definition;
25750 SV * expanded_prop_definition =
25751 sv_2mortal(invlist_clone(prop_definition, NULL));
25753 /* If so, it's definition is the string from here to the next
25754 * \a character. And its format is the same as a user-defined
25756 pos += SvCUR(pu_lookup);
25757 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
25758 pu_invlist = handle_user_defined_property(lookup_name,
25761 0, /* Not folded */
25769 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25770 sv_catpvs(msg, "Insecure private-use override");
25771 goto append_name_to_msg;
25774 /* For now, as a safety measure, make sure that it doesn't
25775 * override non-private use code points */
25776 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
25778 /* Add it to the list to be returned */
25779 _invlist_union(prop_definition, pu_invlist,
25780 &expanded_prop_definition);
25781 prop_definition = expanded_prop_definition;
25782 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
25787 if (invert_return) {
25788 _invlist_invert(prop_definition);
25790 return prop_definition;
25792 unknown_user_defined:
25793 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25794 sv_catpvs(msg, "Unknown user-defined property name");
25795 goto append_name_to_msg;
25798 if (non_pkg_begin != 0) {
25799 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25800 sv_catpvs(msg, "Illegal user-defined property name");
25803 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
25804 sv_catpvs(msg, "Can't find Unicode property definition");
25808 append_name_to_msg:
25810 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
25811 const char * suffix = (runtime && level == 0) ? "}" : "\"";
25813 sv_catpv(msg, prefix);
25814 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
25815 sv_catpv(msg, suffix);
25820 definition_deferred:
25823 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
25825 /* Here it could yet to be defined, so defer evaluation of this until
25826 * its needed at runtime. We need the fully qualified property name to
25827 * avoid ambiguity */
25829 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25833 /* If it didn't come with a package, or the package is utf8::, this
25834 * actually could be an official Unicode property whose inclusion we
25835 * are deferring until runtime to make sure that it isn't overridden by
25836 * a user-defined property of the same name (which we haven't
25837 * encountered yet). Add a marker to indicate this possibility, for
25838 * use at such time when we first need the definition during pattern
25839 * matching execution */
25840 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25841 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25844 /* We also need a trailing newline */
25845 sv_catpvs(fq_name, "\n");
25847 *user_defined_ptr = TRUE;
25853 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25854 const STRLEN wname_len, /* Its length */
25855 SV ** prop_definition,
25858 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25859 * any matches, adding them to prop_definition */
25863 CV * get_names_info; /* entry to charnames.pm to get info we need */
25864 SV * names_string; /* Contains all character names, except algo */
25865 SV * algorithmic_names; /* Contains info about algorithmically
25866 generated character names */
25867 REGEXP * subpattern_re; /* The user's pattern to match with */
25868 struct regexp * prog; /* The compiled pattern */
25869 char * all_names_start; /* lib/unicore/Name.pl string of every
25870 (non-algorithmic) character name */
25871 char * cur_pos; /* We match, effectively using /gc; this is
25872 where we are now */
25873 bool found_matches = FALSE; /* Did any name match so far? */
25874 SV * empty; /* For matching zero length names */
25875 SV * must_sv; /* Contains the substring, if any, that must be
25876 in a name for the subpattern to match */
25877 const char * must; /* The PV of 'must' */
25878 STRLEN must_len; /* And its length */
25879 SV * syllable_name = NULL; /* For Hangul syllables */
25880 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25881 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25883 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25884 * syllable name, and these are immutable and guaranteed by the Unicode
25885 * standard to never be extended */
25886 const STRLEN syl_max_len = hangul_prefix_len + 7;
25890 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25892 /* Make sure _charnames is loaded. (The parameters give context
25893 * for any errors generated */
25894 get_names_info = get_cv("_charnames::_get_names_info", 0);
25895 if (! get_names_info) {
25896 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25899 /* Get the charnames data */
25900 PUSHSTACKi(PERLSI_REGCOMP);
25908 /* Special _charnames entry point that returns the info this routine
25910 call_sv(MUTABLE_SV(get_names_info), G_LIST);
25914 /* Data structure for names which end in their very own code points */
25915 algorithmic_names = POPs;
25916 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25918 /* The lib/unicore/Name.pl string */
25919 names_string = POPs;
25920 SvREFCNT_inc_simple_void_NN(names_string);
25927 if ( ! SvROK(names_string)
25928 || ! SvROK(algorithmic_names))
25929 { /* Perhaps should panic instead XXX */
25930 SvREFCNT_dec(names_string);
25931 SvREFCNT_dec(algorithmic_names);
25935 names_string = sv_2mortal(SvRV(names_string));
25936 all_names_start = SvPVX(names_string);
25937 cur_pos = all_names_start;
25939 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25941 /* Compile the subpattern consisting of the name being looked for */
25942 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25944 must_sv = re_intuit_string(subpattern_re);
25946 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25947 must_sv = sv_2mortal(newSVsv(must_sv));
25948 must = SvPV(must_sv, must_len);
25955 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25956 * This works because the NUL causes the function to return early, thus
25957 * showing that there are characters in it other than the acceptable ones,
25958 * which is our desired result.) */
25960 prog = ReANY(subpattern_re);
25962 /* If only nothing is matched, skip to where empty names are looked for */
25963 if (prog->maxlen == 0) {
25967 /* And match against the string of all names /gc. Don't even try if it
25968 * must match a character not found in any name. */
25969 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25971 while (execute_wildcard(subpattern_re,
25973 SvEND(names_string),
25974 all_names_start, 0,
25977 { /* Here, matched. */
25979 /* Note the string entries look like
25980 * 00001\nSTART OF HEADING\n\n
25981 * so we could match anywhere in that string. We have to rule out
25982 * matching a code point line */
25983 char * this_name_start = all_names_start
25984 + RX_OFFS(subpattern_re)->start;
25985 char * this_name_end = all_names_start
25986 + RX_OFFS(subpattern_re)->end;
25989 UV cp = 0; /* Silences some compilers */
25990 AV * this_string = NULL;
25991 bool is_multi = FALSE;
25993 /* If matched nothing, advance to next possible match */
25994 if (this_name_start == this_name_end) {
25995 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25996 SvEND(names_string) - this_name_end);
25997 if (cur_pos == NULL) {
26002 /* Position the next match to start beyond the current returned
26004 cur_pos = (char *) memchr(this_name_end, '\n',
26005 SvEND(names_string) - this_name_end);
26008 /* Back up to the \n just before the beginning of the character. */
26009 cp_end = (char *) my_memrchr(all_names_start,
26011 this_name_start - all_names_start);
26013 /* If we didn't find a \n, it means it matched somewhere in the
26014 * initial '00000' in the string, so isn't a real match */
26015 if (cp_end == NULL) {
26019 this_name_start = cp_end + 1; /* The name starts just after */
26020 cp_end--; /* the \n, and the code point */
26021 /* ends just before it */
26023 /* All code points are 5 digits long */
26024 cp_start = cp_end - 4;
26026 /* This shouldn't happen, as we found a \n, and the first \n is
26027 * further along than what we subtracted */
26028 assert(cp_start >= all_names_start);
26030 if (cp_start == all_names_start) {
26031 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
26035 /* If the character is a blank, we either have a named sequence, or
26036 * something is wrong */
26037 if (*(cp_start - 1) == ' ') {
26038 cp_start = (char *) my_memrchr(all_names_start,
26040 cp_start - all_names_start);
26044 assert(cp_start != NULL && cp_start >= all_names_start + 2);
26046 /* Except for the first line in the string, the sequence before the
26047 * code point is \n\n. If that isn't the case here, we didn't
26048 * match the name of a character. (We could have matched a named
26049 * sequence, not currently handled */
26050 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
26054 /* We matched! Add this to the list */
26055 found_matches = TRUE;
26057 /* Loop through all the code points in the sequence */
26058 while (cp_start < cp_end) {
26060 /* Calculate this code point from its 5 digits */
26061 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
26062 + (XDIGIT_VALUE(cp_start[1]) << 12)
26063 + (XDIGIT_VALUE(cp_start[2]) << 8)
26064 + (XDIGIT_VALUE(cp_start[3]) << 4)
26065 + XDIGIT_VALUE(cp_start[4]);
26067 cp_start += 6; /* Go past any blank */
26069 if (cp_start < cp_end || is_multi) {
26070 if (this_string == NULL) {
26071 this_string = newAV();
26075 av_push(this_string, newSVuv(cp));
26079 if (is_multi) { /* Was more than one code point */
26080 if (*strings == NULL) {
26081 *strings = newAV();
26084 av_push(*strings, (SV *) this_string);
26086 else { /* Only a single code point */
26087 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
26089 } /* End of loop through the non-algorithmic names string */
26092 /* There are also character names not in 'names_string'. These are
26093 * algorithmically generatable. Try this pattern on each possible one.
26094 * (khw originally planned to leave this out given the large number of
26095 * matches attempted; but the speed turned out to be quite acceptable
26097 * There are plenty of opportunities to optimize to skip many of the tests.
26098 * beyond the rudimentary ones already here */
26100 /* First see if the subpattern matches any of the algorithmic generatable
26101 * Hangul syllable names.
26103 * We know none of these syllable names will match if the input pattern
26104 * requires more bytes than any syllable has, or if the input pattern only
26105 * matches an empty name, or if the pattern has something it must match and
26106 * one of the characters in that isn't in any Hangul syllable. */
26107 if ( prog->minlen <= (SSize_t) syl_max_len
26108 && prog->maxlen > 0
26109 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
26111 /* These constants, names, values, and algorithm are adapted from the
26112 * Unicode standard, version 5.1, section 3.12, and should never
26114 const char * JamoL[] = {
26115 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
26116 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
26118 const int LCount = C_ARRAY_LENGTH(JamoL);
26120 const char * JamoV[] = {
26121 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
26122 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
26125 const int VCount = C_ARRAY_LENGTH(JamoV);
26127 const char * JamoT[] = {
26128 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
26129 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
26130 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
26132 const int TCount = C_ARRAY_LENGTH(JamoT);
26136 /* This is the initial Hangul syllable code point; each time through the
26137 * inner loop, it maps to the next higher code point. For more info,
26138 * see the Hangul syllable section of the Unicode standard. */
26141 syllable_name = sv_2mortal(newSV(syl_max_len));
26142 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
26144 for (L = 0; L < LCount; L++) {
26145 for (V = 0; V < VCount; V++) {
26146 for (T = 0; T < TCount; T++) {
26148 /* Truncate back to the prefix, which is unvarying */
26149 SvCUR_set(syllable_name, hangul_prefix_len);
26151 sv_catpv(syllable_name, JamoL[L]);
26152 sv_catpv(syllable_name, JamoV[V]);
26153 sv_catpv(syllable_name, JamoT[T]);
26155 if (execute_wildcard(subpattern_re,
26156 SvPVX(syllable_name),
26157 SvEND(syllable_name),
26158 SvPVX(syllable_name), 0,
26162 *prop_definition = add_cp_to_invlist(*prop_definition,
26164 found_matches = TRUE;
26173 /* The rest of the algorithmically generatable names are of the form
26174 * "PREFIX-code_point". The prefixes and the code point limits of each
26175 * were returned to us in the array 'algorithmic_names' from data in
26176 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
26177 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
26180 /* Each element of the array is a hash, giving the details for the
26181 * series of names it covers. There is the base name of the characters
26182 * in the series, and the low and high code points in the series. And,
26183 * for optimization purposes a string containing all the legal
26184 * characters that could possibly be in a name in this series. */
26185 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
26186 SV * prefix = * hv_fetchs(this_series, "name", 0);
26187 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
26188 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
26189 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
26191 /* Pre-allocate an SV with enough space */
26192 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
26194 if (high >= 0x10000) {
26195 sv_catpvs(algo_name, "0");
26198 /* This series can be skipped entirely if the pattern requires
26199 * something longer than any name in the series, or can only match an
26200 * empty name, or contains a character not found in any name in the
26202 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
26203 && prog->maxlen > 0
26204 && (strspn(must, legal) == must_len))
26206 for (j = low; j <= high; j++) { /* For each code point in the series */
26208 /* Get its name, and see if it matches the subpattern */
26209 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
26212 if (execute_wildcard(subpattern_re,
26215 SvPVX(algo_name), 0,
26219 *prop_definition = add_cp_to_invlist(*prop_definition, j);
26220 found_matches = TRUE;
26227 /* Finally, see if the subpattern matches an empty string */
26228 empty = newSVpvs("");
26229 if (execute_wildcard(subpattern_re,
26236 /* Many code points have empty names. Currently these are the \p{GC=C}
26237 * ones, minus CC and CF */
26239 SV * empty_names_ref = get_prop_definition(UNI_C);
26240 SV * empty_names = invlist_clone(empty_names_ref, NULL);
26242 SV * subtract = get_prop_definition(UNI_CC);
26244 _invlist_subtract(empty_names, subtract, &empty_names);
26245 SvREFCNT_dec_NN(empty_names_ref);
26246 SvREFCNT_dec_NN(subtract);
26248 subtract = get_prop_definition(UNI_CF);
26249 _invlist_subtract(empty_names, subtract, &empty_names);
26250 SvREFCNT_dec_NN(subtract);
26252 _invlist_union(*prop_definition, empty_names, prop_definition);
26253 found_matches = TRUE;
26254 SvREFCNT_dec_NN(empty_names);
26256 SvREFCNT_dec_NN(empty);
26259 /* If we ever were to accept aliases for, say private use names, we would
26260 * need to do something fancier to find empty names. The code below works
26261 * (at the time it was written), and is slower than the above */
26262 const char empties_pat[] = "^.";
26263 if (strNE(name, empties_pat)) {
26264 SV * empty = newSVpvs("");
26265 if (execute_wildcard(subpattern_re,
26272 SV * empties = NULL;
26274 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
26276 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
26277 SvREFCNT_dec_NN(empties);
26279 found_matches = TRUE;
26281 SvREFCNT_dec_NN(empty);
26285 SvREFCNT_dec_NN(subpattern_re);
26286 return found_matches;
26290 * ex: set ts=8 sts=4 sw=4 et: