5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 /* this is a chain of data about sub patterns we are processing that
110 need to be handled separately/specially in study_chunk. Its so
111 we can simulate recursion without losing state. */
113 typedef struct scan_frame {
114 regnode *last_regnode; /* last node to process in this frame */
115 regnode *next_regnode; /* next node to process when last is reached */
116 U32 prev_recursed_depth;
117 I32 stopparen; /* what stopparen do we use */
118 U32 is_top_frame; /* what flags do we use? */
120 struct scan_frame *this_prev_frame; /* this previous frame */
121 struct scan_frame *prev_frame; /* previous frame */
122 struct scan_frame *next_frame; /* next frame */
125 struct RExC_state_t {
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
128 char *precomp; /* uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 SSize_t whilem_seen; /* number of WHILEM in this expr */
137 regnode *emit_start; /* Start of emitted-code area */
138 regnode *emit_bound; /* First regnode outside of the
140 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
141 implies compiling, so don't emit */
142 regnode_ssc emit_dummy; /* placeholder for emit to point to;
143 large enough for the largest
144 non-EXACTish node, so can use it as
146 I32 naughty; /* How bad is this pattern? */
147 I32 sawback; /* Did we see \1, ...? */
149 SSize_t size; /* Code size. */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
153 I32 nestroot; /* root parens we are in - used by
157 regnode **open_parens; /* pointers to open parens */
158 regnode **close_parens; /* pointers to close parens */
159 regnode *opend; /* END node in program */
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
167 HV *paren_names; /* Paren names */
169 regnode **recurse; /* Recurse regops */
170 I32 recurse_count; /* Number of recurse regops */
171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 override_recoding;
178 I32 in_multi_char_class;
179 struct reg_code_block *code_blocks; /* positions of literal (?{})
181 int num_code_blocks; /* size of code_blocks[] */
182 int code_index; /* next code_blocks[] slot */
183 SSize_t maxlen; /* mininum possible number of chars in string to match */
184 scan_frame *frame_head;
185 scan_frame *frame_last;
187 #ifdef ADD_TO_REGEXEC
188 char *starttry; /* -Dr: where regtry was called. */
189 #define RExC_starttry (pRExC_state->starttry)
191 SV *runtime_code_qr; /* qr with the runtime code blocks */
193 const char *lastparse;
195 AV *paren_name_list; /* idx -> name */
196 U32 study_chunk_recursed_count;
199 #define RExC_lastparse (pRExC_state->lastparse)
200 #define RExC_lastnum (pRExC_state->lastnum)
201 #define RExC_paren_name_list (pRExC_state->paren_name_list)
202 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
203 #define RExC_mysv (pRExC_state->mysv1)
204 #define RExC_mysv1 (pRExC_state->mysv1)
205 #define RExC_mysv2 (pRExC_state->mysv2)
210 #define RExC_flags (pRExC_state->flags)
211 #define RExC_pm_flags (pRExC_state->pm_flags)
212 #define RExC_precomp (pRExC_state->precomp)
213 #define RExC_rx_sv (pRExC_state->rx_sv)
214 #define RExC_rx (pRExC_state->rx)
215 #define RExC_rxi (pRExC_state->rxi)
216 #define RExC_start (pRExC_state->start)
217 #define RExC_end (pRExC_state->end)
218 #define RExC_parse (pRExC_state->parse)
219 #define RExC_whilem_seen (pRExC_state->whilem_seen)
220 #ifdef RE_TRACK_PATTERN_OFFSETS
221 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
224 #define RExC_emit (pRExC_state->emit)
225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
226 #define RExC_emit_start (pRExC_state->emit_start)
227 #define RExC_emit_bound (pRExC_state->emit_bound)
228 #define RExC_naughty (pRExC_state->naughty)
229 #define RExC_sawback (pRExC_state->sawback)
230 #define RExC_seen (pRExC_state->seen)
231 #define RExC_size (pRExC_state->size)
232 #define RExC_maxlen (pRExC_state->maxlen)
233 #define RExC_npar (pRExC_state->npar)
234 #define RExC_nestroot (pRExC_state->nestroot)
235 #define RExC_extralen (pRExC_state->extralen)
236 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
237 #define RExC_utf8 (pRExC_state->utf8)
238 #define RExC_uni_semantics (pRExC_state->uni_semantics)
239 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
240 #define RExC_open_parens (pRExC_state->open_parens)
241 #define RExC_close_parens (pRExC_state->close_parens)
242 #define RExC_opend (pRExC_state->opend)
243 #define RExC_paren_names (pRExC_state->paren_names)
244 #define RExC_recurse (pRExC_state->recurse)
245 #define RExC_recurse_count (pRExC_state->recurse_count)
246 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
247 #define RExC_study_chunk_recursed_bytes \
248 (pRExC_state->study_chunk_recursed_bytes)
249 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
250 #define RExC_contains_locale (pRExC_state->contains_locale)
251 #define RExC_contains_i (pRExC_state->contains_i)
252 #define RExC_override_recoding (pRExC_state->override_recoding)
253 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
254 #define RExC_frame_head (pRExC_state->frame_head)
255 #define RExC_frame_last (pRExC_state->frame_last)
256 #define RExC_frame_count (pRExC_state->frame_count)
259 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
260 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
261 ((*s) == '{' && regcurly(s)))
264 * Flags to be passed up and down.
266 #define WORST 0 /* Worst case. */
267 #define HASWIDTH 0x01 /* Known to match non-null strings. */
269 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
270 * character. (There needs to be a case: in the switch statement in regexec.c
271 * for any node marked SIMPLE.) Note that this is not the same thing as
274 #define SPSTART 0x04 /* Starts with * or + */
275 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
276 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
277 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
279 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
281 /* whether trie related optimizations are enabled */
282 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
283 #define TRIE_STUDY_OPT
284 #define FULL_TRIE_STUDY
290 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
291 #define PBITVAL(paren) (1 << ((paren) & 7))
292 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
293 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
294 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
296 #define REQUIRE_UTF8 STMT_START { \
298 *flagp = RESTART_UTF8; \
303 /* This converts the named class defined in regcomp.h to its equivalent class
304 * number defined in handy.h. */
305 #define namedclass_to_classnum(class) ((int) ((class) / 2))
306 #define classnum_to_namedclass(classnum) ((classnum) * 2)
308 #define _invlist_union_complement_2nd(a, b, output) \
309 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
310 #define _invlist_intersection_complement_2nd(a, b, output) \
311 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
313 /* About scan_data_t.
315 During optimisation we recurse through the regexp program performing
316 various inplace (keyhole style) optimisations. In addition study_chunk
317 and scan_commit populate this data structure with information about
318 what strings MUST appear in the pattern. We look for the longest
319 string that must appear at a fixed location, and we look for the
320 longest string that may appear at a floating location. So for instance
325 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
326 strings (because they follow a .* construct). study_chunk will identify
327 both FOO and BAR as being the longest fixed and floating strings respectively.
329 The strings can be composites, for instance
333 will result in a composite fixed substring 'foo'.
335 For each string some basic information is maintained:
337 - offset or min_offset
338 This is the position the string must appear at, or not before.
339 It also implicitly (when combined with minlenp) tells us how many
340 characters must match before the string we are searching for.
341 Likewise when combined with minlenp and the length of the string it
342 tells us how many characters must appear after the string we have
346 Only used for floating strings. This is the rightmost point that
347 the string can appear at. If set to SSize_t_MAX it indicates that the
348 string can occur infinitely far to the right.
351 A pointer to the minimum number of characters of the pattern that the
352 string was found inside. This is important as in the case of positive
353 lookahead or positive lookbehind we can have multiple patterns
358 The minimum length of the pattern overall is 3, the minimum length
359 of the lookahead part is 3, but the minimum length of the part that
360 will actually match is 1. So 'FOO's minimum length is 3, but the
361 minimum length for the F is 1. This is important as the minimum length
362 is used to determine offsets in front of and behind the string being
363 looked for. Since strings can be composites this is the length of the
364 pattern at the time it was committed with a scan_commit. Note that
365 the length is calculated by study_chunk, so that the minimum lengths
366 are not known until the full pattern has been compiled, thus the
367 pointer to the value.
371 In the case of lookbehind the string being searched for can be
372 offset past the start point of the final matching string.
373 If this value was just blithely removed from the min_offset it would
374 invalidate some of the calculations for how many chars must match
375 before or after (as they are derived from min_offset and minlen and
376 the length of the string being searched for).
377 When the final pattern is compiled and the data is moved from the
378 scan_data_t structure into the regexp structure the information
379 about lookbehind is factored in, with the information that would
380 have been lost precalculated in the end_shift field for the
383 The fields pos_min and pos_delta are used to store the minimum offset
384 and the delta to the maximum offset at the current point in the pattern.
388 typedef struct scan_data_t {
389 /*I32 len_min; unused */
390 /*I32 len_delta; unused */
394 SSize_t last_end; /* min value, <0 unless valid. */
395 SSize_t last_start_min;
396 SSize_t last_start_max;
397 SV **longest; /* Either &l_fixed, or &l_float. */
398 SV *longest_fixed; /* longest fixed string found in pattern */
399 SSize_t offset_fixed; /* offset where it starts */
400 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
401 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
402 SV *longest_float; /* longest floating string found in pattern */
403 SSize_t offset_float_min; /* earliest point in string it can appear */
404 SSize_t offset_float_max; /* latest point in string it can appear */
405 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
406 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
409 SSize_t *last_closep;
410 regnode_ssc *start_class;
414 * Forward declarations for pregcomp()'s friends.
417 static const scan_data_t zero_scan_data =
418 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
420 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
421 #define SF_BEFORE_SEOL 0x0001
422 #define SF_BEFORE_MEOL 0x0002
423 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
424 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
426 #define SF_FIX_SHIFT_EOL (+2)
427 #define SF_FL_SHIFT_EOL (+4)
429 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
430 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
432 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
433 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
434 #define SF_IS_INF 0x0040
435 #define SF_HAS_PAR 0x0080
436 #define SF_IN_PAR 0x0100
437 #define SF_HAS_EVAL 0x0200
438 #define SCF_DO_SUBSTR 0x0400
439 #define SCF_DO_STCLASS_AND 0x0800
440 #define SCF_DO_STCLASS_OR 0x1000
441 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
442 #define SCF_WHILEM_VISITED_POS 0x2000
444 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
445 #define SCF_SEEN_ACCEPT 0x8000
446 #define SCF_TRIE_DOING_RESTUDY 0x10000
447 #define SCF_IN_DEFINE 0x20000
452 #define UTF cBOOL(RExC_utf8)
454 /* The enums for all these are ordered so things work out correctly */
455 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
456 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
457 == REGEX_DEPENDS_CHARSET)
458 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
459 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
460 >= REGEX_UNICODE_CHARSET)
461 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
462 == REGEX_ASCII_RESTRICTED_CHARSET)
463 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
464 >= REGEX_ASCII_RESTRICTED_CHARSET)
465 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
466 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
468 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
470 /* For programs that want to be strictly Unicode compatible by dying if any
471 * attempt is made to match a non-Unicode code point against a Unicode
473 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
475 #define OOB_NAMEDCLASS -1
477 /* There is no code point that is out-of-bounds, so this is problematic. But
478 * its only current use is to initialize a variable that is always set before
480 #define OOB_UNICODE 0xDEADBEEF
482 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
483 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
486 /* length of regex to show in messages that don't mark a position within */
487 #define RegexLengthToShowInErrorMessages 127
490 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
491 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
492 * op/pragma/warn/regcomp.
494 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
495 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
497 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
498 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
500 #define REPORT_LOCATION_ARGS(offset) \
501 UTF8fARG(UTF, offset, RExC_precomp), \
502 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
505 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
506 * arg. Show regex, up to a maximum length. If it's too long, chop and add
509 #define _FAIL(code) STMT_START { \
510 const char *ellipses = ""; \
511 IV len = RExC_end - RExC_precomp; \
514 SAVEFREESV(RExC_rx_sv); \
515 if (len > RegexLengthToShowInErrorMessages) { \
516 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
517 len = RegexLengthToShowInErrorMessages - 10; \
523 #define FAIL(msg) _FAIL( \
524 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
525 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
527 #define FAIL2(msg,arg) _FAIL( \
528 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
529 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
532 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
534 #define Simple_vFAIL(m) STMT_START { \
536 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
537 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
538 m, REPORT_LOCATION_ARGS(offset)); \
542 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
544 #define vFAIL(m) STMT_START { \
546 SAVEFREESV(RExC_rx_sv); \
551 * Like Simple_vFAIL(), but accepts two arguments.
553 #define Simple_vFAIL2(m,a1) STMT_START { \
554 const IV offset = RExC_parse - RExC_precomp; \
555 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
556 REPORT_LOCATION_ARGS(offset)); \
560 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
562 #define vFAIL2(m,a1) STMT_START { \
564 SAVEFREESV(RExC_rx_sv); \
565 Simple_vFAIL2(m, a1); \
570 * Like Simple_vFAIL(), but accepts three arguments.
572 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
573 const IV offset = RExC_parse - RExC_precomp; \
574 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
575 REPORT_LOCATION_ARGS(offset)); \
579 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
581 #define vFAIL3(m,a1,a2) STMT_START { \
583 SAVEFREESV(RExC_rx_sv); \
584 Simple_vFAIL3(m, a1, a2); \
588 * Like Simple_vFAIL(), but accepts four arguments.
590 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
591 const IV offset = RExC_parse - RExC_precomp; \
592 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
593 REPORT_LOCATION_ARGS(offset)); \
596 #define vFAIL4(m,a1,a2,a3) STMT_START { \
598 SAVEFREESV(RExC_rx_sv); \
599 Simple_vFAIL4(m, a1, a2, a3); \
602 /* A specialized version of vFAIL2 that works with UTF8f */
603 #define vFAIL2utf8f(m, a1) STMT_START { \
604 const IV offset = RExC_parse - RExC_precomp; \
606 SAVEFREESV(RExC_rx_sv); \
607 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
608 REPORT_LOCATION_ARGS(offset)); \
611 /* These have asserts in them because of [perl #122671] Many warnings in
612 * regcomp.c can occur twice. If they get output in pass1 and later in that
613 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
614 * would get output again. So they should be output in pass2, and these
615 * asserts make sure new warnings follow that paradigm. */
617 /* m is not necessarily a "literal string", in this macro */
618 #define reg_warn_non_literal_string(loc, m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
621 m, REPORT_LOCATION_ARGS(offset)); \
624 #define ckWARNreg(loc,m) STMT_START { \
625 const IV offset = loc - RExC_precomp; \
626 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
627 REPORT_LOCATION_ARGS(offset)); \
630 #define vWARN_dep(loc, m) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
633 REPORT_LOCATION_ARGS(offset)); \
636 #define ckWARNdep(loc,m) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
640 REPORT_LOCATION_ARGS(offset)); \
643 #define ckWARNregdep(loc,m) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
647 REPORT_LOCATION_ARGS(offset)); \
650 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
654 a1, REPORT_LOCATION_ARGS(offset)); \
657 #define ckWARN2reg(loc, m, a1) STMT_START { \
658 const IV offset = loc - RExC_precomp; \
659 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
660 a1, REPORT_LOCATION_ARGS(offset)); \
663 #define vWARN3(loc, m, a1, a2) STMT_START { \
664 const IV offset = loc - RExC_precomp; \
665 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
666 a1, a2, REPORT_LOCATION_ARGS(offset)); \
669 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
670 const IV offset = loc - RExC_precomp; \
671 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
672 a1, a2, REPORT_LOCATION_ARGS(offset)); \
675 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
676 const IV offset = loc - RExC_precomp; \
677 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
678 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
684 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
687 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
688 const IV offset = loc - RExC_precomp; \
689 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
690 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
694 /* Allow for side effects in s */
695 #define REGC(c,s) STMT_START { \
696 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
699 /* Macros for recording node offsets. 20001227 mjd@plover.com
700 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
701 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
702 * Element 0 holds the number n.
703 * Position is 1 indexed.
705 #ifndef RE_TRACK_PATTERN_OFFSETS
706 #define Set_Node_Offset_To_R(node,byte)
707 #define Set_Node_Offset(node,byte)
708 #define Set_Cur_Node_Offset
709 #define Set_Node_Length_To_R(node,len)
710 #define Set_Node_Length(node,len)
711 #define Set_Node_Cur_Length(node,start)
712 #define Node_Offset(n)
713 #define Node_Length(n)
714 #define Set_Node_Offset_Length(node,offset,len)
715 #define ProgLen(ri) ri->u.proglen
716 #define SetProgLen(ri,x) ri->u.proglen = x
718 #define ProgLen(ri) ri->u.offsets[0]
719 #define SetProgLen(ri,x) ri->u.offsets[0] = x
720 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
722 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
723 __LINE__, (int)(node), (int)(byte))); \
725 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
728 RExC_offsets[2*(node)-1] = (byte); \
733 #define Set_Node_Offset(node,byte) \
734 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
735 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
737 #define Set_Node_Length_To_R(node,len) STMT_START { \
739 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
740 __LINE__, (int)(node), (int)(len))); \
742 Perl_croak(aTHX_ "value of node is %d in Length macro", \
745 RExC_offsets[2*(node)] = (len); \
750 #define Set_Node_Length(node,len) \
751 Set_Node_Length_To_R((node)-RExC_emit_start, len)
752 #define Set_Node_Cur_Length(node, start) \
753 Set_Node_Length(node, RExC_parse - start)
755 /* Get offsets and lengths */
756 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
757 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
759 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
760 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
761 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
765 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
766 #define EXPERIMENTAL_INPLACESCAN
767 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
769 #define DEBUG_RExC_seen() \
770 DEBUG_OPTIMISE_MORE_r({ \
771 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
773 if (RExC_seen & REG_ZERO_LEN_SEEN) \
774 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
776 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
777 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
779 if (RExC_seen & REG_GPOS_SEEN) \
780 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
782 if (RExC_seen & REG_CANY_SEEN) \
783 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
785 if (RExC_seen & REG_RECURSE_SEEN) \
786 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
788 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
789 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
791 if (RExC_seen & REG_VERBARG_SEEN) \
792 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
794 if (RExC_seen & REG_CUTGROUP_SEEN) \
795 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
797 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
798 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
800 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
801 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
803 if (RExC_seen & REG_GOSTART_SEEN) \
804 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
806 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
807 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
809 PerlIO_printf(Perl_debug_log,"\n"); \
812 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
813 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
815 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
817 PerlIO_printf(Perl_debug_log, "%s", open_str); \
818 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
819 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
820 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
821 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
822 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
823 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
824 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
825 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
826 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
827 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
828 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
829 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
830 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
831 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
832 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
833 PerlIO_printf(Perl_debug_log, "%s", close_str); \
837 #define DEBUG_STUDYDATA(str,data,depth) \
838 DEBUG_OPTIMISE_MORE_r(if(data){ \
839 PerlIO_printf(Perl_debug_log, \
840 "%*s" str "Pos:%"IVdf"/%"IVdf \
842 (int)(depth)*2, "", \
843 (IV)((data)->pos_min), \
844 (IV)((data)->pos_delta), \
845 (UV)((data)->flags) \
847 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
848 PerlIO_printf(Perl_debug_log, \
849 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
850 (IV)((data)->whilem_c), \
851 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
852 is_inf ? "INF " : "" \
854 if ((data)->last_found) \
855 PerlIO_printf(Perl_debug_log, \
856 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
857 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
858 SvPVX_const((data)->last_found), \
859 (IV)((data)->last_end), \
860 (IV)((data)->last_start_min), \
861 (IV)((data)->last_start_max), \
862 ((data)->longest && \
863 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
864 SvPVX_const((data)->longest_fixed), \
865 (IV)((data)->offset_fixed), \
866 ((data)->longest && \
867 (data)->longest==&((data)->longest_float)) ? "*" : "", \
868 SvPVX_const((data)->longest_float), \
869 (IV)((data)->offset_float_min), \
870 (IV)((data)->offset_float_max) \
872 PerlIO_printf(Perl_debug_log,"\n"); \
877 /* is c a control character for which we have a mnemonic? */
878 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
881 S_cntrl_to_mnemonic(const U8 c)
883 /* Returns the mnemonic string that represents character 'c', if one
884 * exists; NULL otherwise. The only ones that exist for the purposes of
885 * this routine are a few control characters */
888 case '\a': return "\\a";
889 case '\b': return "\\b";
890 case ESC_NATIVE: return "\\e";
891 case '\f': return "\\f";
892 case '\n': return "\\n";
893 case '\r': return "\\r";
894 case '\t': return "\\t";
902 /* Mark that we cannot extend a found fixed substring at this point.
903 Update the longest found anchored substring and the longest found
904 floating substrings if needed. */
907 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
908 SSize_t *minlenp, int is_inf)
910 const STRLEN l = CHR_SVLEN(data->last_found);
911 const STRLEN old_l = CHR_SVLEN(*data->longest);
912 GET_RE_DEBUG_FLAGS_DECL;
914 PERL_ARGS_ASSERT_SCAN_COMMIT;
916 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
917 SvSetMagicSV(*data->longest, data->last_found);
918 if (*data->longest == data->longest_fixed) {
919 data->offset_fixed = l ? data->last_start_min : data->pos_min;
920 if (data->flags & SF_BEFORE_EOL)
922 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
924 data->flags &= ~SF_FIX_BEFORE_EOL;
925 data->minlen_fixed=minlenp;
926 data->lookbehind_fixed=0;
928 else { /* *data->longest == data->longest_float */
929 data->offset_float_min = l ? data->last_start_min : data->pos_min;
930 data->offset_float_max = (l
931 ? data->last_start_max
932 : (data->pos_delta == SSize_t_MAX
934 : data->pos_min + data->pos_delta));
936 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
937 data->offset_float_max = SSize_t_MAX;
938 if (data->flags & SF_BEFORE_EOL)
940 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
942 data->flags &= ~SF_FL_BEFORE_EOL;
943 data->minlen_float=minlenp;
944 data->lookbehind_float=0;
947 SvCUR_set(data->last_found, 0);
949 SV * const sv = data->last_found;
950 if (SvUTF8(sv) && SvMAGICAL(sv)) {
951 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
957 data->flags &= ~SF_BEFORE_EOL;
958 DEBUG_STUDYDATA("commit: ",data,0);
961 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
962 * list that describes which code points it matches */
965 S_ssc_anything(pTHX_ regnode_ssc *ssc)
967 /* Set the SSC 'ssc' to match an empty string or any code point */
969 PERL_ARGS_ASSERT_SSC_ANYTHING;
971 assert(is_ANYOF_SYNTHETIC(ssc));
973 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
974 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
975 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
979 S_ssc_is_anything(const regnode_ssc *ssc)
981 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
982 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
983 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
984 * in any way, so there's no point in using it */
989 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
991 assert(is_ANYOF_SYNTHETIC(ssc));
993 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
997 /* See if the list consists solely of the range 0 - Infinity */
998 invlist_iterinit(ssc->invlist);
999 ret = invlist_iternext(ssc->invlist, &start, &end)
1003 invlist_iterfinish(ssc->invlist);
1009 /* If e.g., both \w and \W are set, matches everything */
1010 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1012 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1013 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1023 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1025 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1026 * string, any code point, or any posix class under locale */
1028 PERL_ARGS_ASSERT_SSC_INIT;
1030 Zero(ssc, 1, regnode_ssc);
1031 set_ANYOF_SYNTHETIC(ssc);
1032 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1035 /* If any portion of the regex is to operate under locale rules,
1036 * initialization includes it. The reason this isn't done for all regexes
1037 * is that the optimizer was written under the assumption that locale was
1038 * all-or-nothing. Given the complexity and lack of documentation in the
1039 * optimizer, and that there are inadequate test cases for locale, many
1040 * parts of it may not work properly, it is safest to avoid locale unless
1042 if (RExC_contains_locale) {
1043 ANYOF_POSIXL_SETALL(ssc);
1046 ANYOF_POSIXL_ZERO(ssc);
1051 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1052 const regnode_ssc *ssc)
1054 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1055 * to the list of code points matched, and locale posix classes; hence does
1056 * not check its flags) */
1061 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1063 assert(is_ANYOF_SYNTHETIC(ssc));
1065 invlist_iterinit(ssc->invlist);
1066 ret = invlist_iternext(ssc->invlist, &start, &end)
1070 invlist_iterfinish(ssc->invlist);
1076 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1084 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1085 const regnode_charclass* const node)
1087 /* Returns a mortal inversion list defining which code points are matched
1088 * by 'node', which is of type ANYOF. Handles complementing the result if
1089 * appropriate. If some code points aren't knowable at this time, the
1090 * returned list must, and will, contain every code point that is a
1093 SV* invlist = sv_2mortal(_new_invlist(0));
1094 SV* only_utf8_locale_invlist = NULL;
1096 const U32 n = ARG(node);
1097 bool new_node_has_latin1 = FALSE;
1099 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1101 /* Look at the data structure created by S_set_ANYOF_arg() */
1102 if (n != ANYOF_ONLY_HAS_BITMAP) {
1103 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1104 AV * const av = MUTABLE_AV(SvRV(rv));
1105 SV **const ary = AvARRAY(av);
1106 assert(RExC_rxi->data->what[n] == 's');
1108 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1109 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1111 else if (ary[0] && ary[0] != &PL_sv_undef) {
1113 /* Here, no compile-time swash, and there are things that won't be
1114 * known until runtime -- we have to assume it could be anything */
1115 return _add_range_to_invlist(invlist, 0, UV_MAX);
1117 else if (ary[3] && ary[3] != &PL_sv_undef) {
1119 /* Here no compile-time swash, and no run-time only data. Use the
1120 * node's inversion list */
1121 invlist = sv_2mortal(invlist_clone(ary[3]));
1124 /* Get the code points valid only under UTF-8 locales */
1125 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1126 && ary[2] && ary[2] != &PL_sv_undef)
1128 only_utf8_locale_invlist = ary[2];
1132 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1133 * code points, and an inversion list for the others, but if there are code
1134 * points that should match only conditionally on the target string being
1135 * UTF-8, those are placed in the inversion list, and not the bitmap.
1136 * Since there are circumstances under which they could match, they are
1137 * included in the SSC. But if the ANYOF node is to be inverted, we have
1138 * to exclude them here, so that when we invert below, the end result
1139 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1140 * have to do this here before we add the unconditionally matched code
1142 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1143 _invlist_intersection_complement_2nd(invlist,
1148 /* Add in the points from the bit map */
1149 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1150 if (ANYOF_BITMAP_TEST(node, i)) {
1151 invlist = add_cp_to_invlist(invlist, i);
1152 new_node_has_latin1 = TRUE;
1156 /* If this can match all upper Latin1 code points, have to add them
1158 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1159 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1162 /* Similarly for these */
1163 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1164 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1167 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168 _invlist_invert(invlist);
1170 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1172 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1173 * locale. We can skip this if there are no 0-255 at all. */
1174 _invlist_union(invlist, PL_Latin1, &invlist);
1177 /* Similarly add the UTF-8 locale possible matches. These have to be
1178 * deferred until after the non-UTF-8 locale ones are taken care of just
1179 * above, or it leads to wrong results under ANYOF_INVERT */
1180 if (only_utf8_locale_invlist) {
1181 _invlist_union_maybe_complement_2nd(invlist,
1182 only_utf8_locale_invlist,
1183 ANYOF_FLAGS(node) & ANYOF_INVERT,
1190 /* These two functions currently do the exact same thing */
1191 #define ssc_init_zero ssc_init
1193 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1194 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1196 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1197 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1198 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1201 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1202 const regnode_charclass *and_with)
1204 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1205 * another SSC or a regular ANYOF class. Can create false positives. */
1210 PERL_ARGS_ASSERT_SSC_AND;
1212 assert(is_ANYOF_SYNTHETIC(ssc));
1214 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1215 * the code point inversion list and just the relevant flags */
1216 if (is_ANYOF_SYNTHETIC(and_with)) {
1217 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1218 anded_flags = ANYOF_FLAGS(and_with);
1220 /* XXX This is a kludge around what appears to be deficiencies in the
1221 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1222 * there are paths through the optimizer where it doesn't get weeded
1223 * out when it should. And if we don't make some extra provision for
1224 * it like the code just below, it doesn't get added when it should.
1225 * This solution is to add it only when AND'ing, which is here, and
1226 * only when what is being AND'ed is the pristine, original node
1227 * matching anything. Thus it is like adding it to ssc_anything() but
1228 * only when the result is to be AND'ed. Probably the same solution
1229 * could be adopted for the same problem we have with /l matching,
1230 * which is solved differently in S_ssc_init(), and that would lead to
1231 * fewer false positives than that solution has. But if this solution
1232 * creates bugs, the consequences are only that a warning isn't raised
1233 * that should be; while the consequences for having /l bugs is
1234 * incorrect matches */
1235 if (ssc_is_anything((regnode_ssc *)and_with)) {
1236 anded_flags |= ANYOF_WARN_SUPER;
1240 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1241 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1244 ANYOF_FLAGS(ssc) &= anded_flags;
1246 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1247 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1248 * 'and_with' may be inverted. When not inverted, we have the situation of
1250 * (C1 | P1) & (C2 | P2)
1251 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1252 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1253 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1254 * <= ((C1 & C2) | P1 | P2)
1255 * Alternatively, the last few steps could be:
1256 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1257 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1258 * <= (C1 | C2 | (P1 & P2))
1259 * We favor the second approach if either P1 or P2 is non-empty. This is
1260 * because these components are a barrier to doing optimizations, as what
1261 * they match cannot be known until the moment of matching as they are
1262 * dependent on the current locale, 'AND"ing them likely will reduce or
1264 * But we can do better if we know that C1,P1 are in their initial state (a
1265 * frequent occurrence), each matching everything:
1266 * (<everything>) & (C2 | P2) = C2 | P2
1267 * Similarly, if C2,P2 are in their initial state (again a frequent
1268 * occurrence), the result is a no-op
1269 * (C1 | P1) & (<everything>) = C1 | P1
1272 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1273 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1274 * <= (C1 & ~C2) | (P1 & ~P2)
1277 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1278 && ! is_ANYOF_SYNTHETIC(and_with))
1282 ssc_intersection(ssc,
1284 FALSE /* Has already been inverted */
1287 /* If either P1 or P2 is empty, the intersection will be also; can skip
1289 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1290 ANYOF_POSIXL_ZERO(ssc);
1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1294 /* Note that the Posix class component P from 'and_with' actually
1296 * P = Pa | Pb | ... | Pn
1297 * where each component is one posix class, such as in [\w\s].
1299 * ~P = ~(Pa | Pb | ... | Pn)
1300 * = ~Pa & ~Pb & ... & ~Pn
1301 * <= ~Pa | ~Pb | ... | ~Pn
1302 * The last is something we can easily calculate, but unfortunately
1303 * is likely to have many false positives. We could do better
1304 * in some (but certainly not all) instances if two classes in
1305 * P have known relationships. For example
1306 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1308 * :lower: & :print: = :lower:
1309 * And similarly for classes that must be disjoint. For example,
1310 * since \s and \w can have no elements in common based on rules in
1311 * the POSIX standard,
1312 * \w & ^\S = nothing
1313 * Unfortunately, some vendor locales do not meet the Posix
1314 * standard, in particular almost everything by Microsoft.
1315 * The loop below just changes e.g., \w into \W and vice versa */
1317 regnode_charclass_posixl temp;
1318 int add = 1; /* To calculate the index of the complement */
1320 ANYOF_POSIXL_ZERO(&temp);
1321 for (i = 0; i < ANYOF_MAX; i++) {
1323 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1324 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1326 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1327 ANYOF_POSIXL_SET(&temp, i + add);
1329 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1331 ANYOF_POSIXL_AND(&temp, ssc);
1333 } /* else ssc already has no posixes */
1334 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1335 in its initial state */
1336 else if (! is_ANYOF_SYNTHETIC(and_with)
1337 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1339 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1340 * copy it over 'ssc' */
1341 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1342 if (is_ANYOF_SYNTHETIC(and_with)) {
1343 StructCopy(and_with, ssc, regnode_ssc);
1346 ssc->invlist = anded_cp_list;
1347 ANYOF_POSIXL_ZERO(ssc);
1348 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1349 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1353 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1354 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1356 /* One or the other of P1, P2 is non-empty. */
1357 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1358 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1360 ssc_union(ssc, anded_cp_list, FALSE);
1362 else { /* P1 = P2 = empty */
1363 ssc_intersection(ssc, anded_cp_list, FALSE);
1369 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1370 const regnode_charclass *or_with)
1372 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1373 * another SSC or a regular ANYOF class. Can create false positives if
1374 * 'or_with' is to be inverted. */
1379 PERL_ARGS_ASSERT_SSC_OR;
1381 assert(is_ANYOF_SYNTHETIC(ssc));
1383 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1384 * the code point inversion list and just the relevant flags */
1385 if (is_ANYOF_SYNTHETIC(or_with)) {
1386 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1387 ored_flags = ANYOF_FLAGS(or_with);
1390 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1391 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1394 ANYOF_FLAGS(ssc) |= ored_flags;
1396 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1397 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1398 * 'or_with' may be inverted. When not inverted, we have the simple
1399 * situation of computing:
1400 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1401 * If P1|P2 yields a situation with both a class and its complement are
1402 * set, like having both \w and \W, this matches all code points, and we
1403 * can delete these from the P component of the ssc going forward. XXX We
1404 * might be able to delete all the P components, but I (khw) am not certain
1405 * about this, and it is better to be safe.
1408 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1409 * <= (C1 | P1) | ~C2
1410 * <= (C1 | ~C2) | P1
1411 * (which results in actually simpler code than the non-inverted case)
1414 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1415 && ! is_ANYOF_SYNTHETIC(or_with))
1417 /* We ignore P2, leaving P1 going forward */
1418 } /* else Not inverted */
1419 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1420 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1421 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1423 for (i = 0; i < ANYOF_MAX; i += 2) {
1424 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1426 ssc_match_all_cp(ssc);
1427 ANYOF_POSIXL_CLEAR(ssc, i);
1428 ANYOF_POSIXL_CLEAR(ssc, i+1);
1436 FALSE /* Already has been inverted */
1440 PERL_STATIC_INLINE void
1441 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1443 PERL_ARGS_ASSERT_SSC_UNION;
1445 assert(is_ANYOF_SYNTHETIC(ssc));
1447 _invlist_union_maybe_complement_2nd(ssc->invlist,
1453 PERL_STATIC_INLINE void
1454 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1456 const bool invert2nd)
1458 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1460 assert(is_ANYOF_SYNTHETIC(ssc));
1462 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1468 PERL_STATIC_INLINE void
1469 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1471 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1473 assert(is_ANYOF_SYNTHETIC(ssc));
1475 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1478 PERL_STATIC_INLINE void
1479 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1481 /* AND just the single code point 'cp' into the SSC 'ssc' */
1483 SV* cp_list = _new_invlist(2);
1485 PERL_ARGS_ASSERT_SSC_CP_AND;
1487 assert(is_ANYOF_SYNTHETIC(ssc));
1489 cp_list = add_cp_to_invlist(cp_list, cp);
1490 ssc_intersection(ssc, cp_list,
1491 FALSE /* Not inverted */
1493 SvREFCNT_dec_NN(cp_list);
1496 PERL_STATIC_INLINE void
1497 S_ssc_clear_locale(regnode_ssc *ssc)
1499 /* Set the SSC 'ssc' to not match any locale things */
1500 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1502 assert(is_ANYOF_SYNTHETIC(ssc));
1504 ANYOF_POSIXL_ZERO(ssc);
1505 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1508 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1511 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1513 /* The synthetic start class is used to hopefully quickly winnow down
1514 * places where a pattern could start a match in the target string. If it
1515 * doesn't really narrow things down that much, there isn't much point to
1516 * having the overhead of using it. This function uses some very crude
1517 * heuristics to decide if to use the ssc or not.
1519 * It returns TRUE if 'ssc' rules out more than half what it considers to
1520 * be the "likely" possible matches, but of course it doesn't know what the
1521 * actual things being matched are going to be; these are only guesses
1523 * For /l matches, it assumes that the only likely matches are going to be
1524 * in the 0-255 range, uniformly distributed, so half of that is 127
1525 * For /a and /d matches, it assumes that the likely matches will be just
1526 * the ASCII range, so half of that is 63
1527 * For /u and there isn't anything matching above the Latin1 range, it
1528 * assumes that that is the only range likely to be matched, and uses
1529 * half that as the cut-off: 127. If anything matches above Latin1,
1530 * it assumes that all of Unicode could match (uniformly), except for
1531 * non-Unicode code points and things in the General Category "Other"
1532 * (unassigned, private use, surrogates, controls and formats). This
1533 * is a much large number. */
1535 const U32 max_match = (LOC)
1539 : (invlist_highest(ssc->invlist) < 256)
1541 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1542 U32 count = 0; /* Running total of number of code points matched by
1544 UV start, end; /* Start and end points of current range in inversion
1547 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1549 invlist_iterinit(ssc->invlist);
1550 while (invlist_iternext(ssc->invlist, &start, &end)) {
1552 /* /u is the only thing that we expect to match above 255; so if not /u
1553 * and even if there are matches above 255, ignore them. This catches
1554 * things like \d under /d which does match the digits above 255, but
1555 * since the pattern is /d, it is not likely to be expecting them */
1556 if (! UNI_SEMANTICS) {
1560 end = MIN(end, 255);
1562 count += end - start + 1;
1563 if (count > max_match) {
1564 invlist_iterfinish(ssc->invlist);
1574 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1576 /* The inversion list in the SSC is marked mortal; now we need a more
1577 * permanent copy, which is stored the same way that is done in a regular
1578 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1581 SV* invlist = invlist_clone(ssc->invlist);
1583 PERL_ARGS_ASSERT_SSC_FINALIZE;
1585 assert(is_ANYOF_SYNTHETIC(ssc));
1587 /* The code in this file assumes that all but these flags aren't relevant
1588 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1589 * by the time we reach here */
1590 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1592 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1594 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1595 NULL, NULL, NULL, FALSE);
1597 /* Make sure is clone-safe */
1598 ssc->invlist = NULL;
1600 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1601 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1604 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1607 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1608 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1609 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1610 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1611 ? (TRIE_LIST_CUR( idx ) - 1) \
1617 dump_trie(trie,widecharmap,revcharmap)
1618 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1619 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1621 These routines dump out a trie in a somewhat readable format.
1622 The _interim_ variants are used for debugging the interim
1623 tables that are used to generate the final compressed
1624 representation which is what dump_trie expects.
1626 Part of the reason for their existence is to provide a form
1627 of documentation as to how the different representations function.
1632 Dumps the final compressed table form of the trie to Perl_debug_log.
1633 Used for debugging make_trie().
1637 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1638 AV *revcharmap, U32 depth)
1641 SV *sv=sv_newmortal();
1642 int colwidth= widecharmap ? 6 : 4;
1644 GET_RE_DEBUG_FLAGS_DECL;
1646 PERL_ARGS_ASSERT_DUMP_TRIE;
1648 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1649 (int)depth * 2 + 2,"",
1650 "Match","Base","Ofs" );
1652 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1653 SV ** const tmp = av_fetch( revcharmap, state, 0);
1655 PerlIO_printf( Perl_debug_log, "%*s",
1657 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1658 PL_colors[0], PL_colors[1],
1659 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1660 PERL_PV_ESCAPE_FIRSTCHAR
1665 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1666 (int)depth * 2 + 2,"");
1668 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1669 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1670 PerlIO_printf( Perl_debug_log, "\n");
1672 for( state = 1 ; state < trie->statecount ; state++ ) {
1673 const U32 base = trie->states[ state ].trans.base;
1675 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1676 (int)depth * 2 + 2,"", (UV)state);
1678 if ( trie->states[ state ].wordnum ) {
1679 PerlIO_printf( Perl_debug_log, " W%4X",
1680 trie->states[ state ].wordnum );
1682 PerlIO_printf( Perl_debug_log, "%6s", "" );
1685 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1690 while( ( base + ofs < trie->uniquecharcount ) ||
1691 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1692 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1696 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1698 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1699 if ( ( base + ofs >= trie->uniquecharcount )
1700 && ( base + ofs - trie->uniquecharcount
1702 && trie->trans[ base + ofs
1703 - trie->uniquecharcount ].check == state )
1705 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1707 (UV)trie->trans[ base + ofs
1708 - trie->uniquecharcount ].next );
1710 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1714 PerlIO_printf( Perl_debug_log, "]");
1717 PerlIO_printf( Perl_debug_log, "\n" );
1719 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1721 for (word=1; word <= trie->wordcount; word++) {
1722 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1723 (int)word, (int)(trie->wordinfo[word].prev),
1724 (int)(trie->wordinfo[word].len));
1726 PerlIO_printf(Perl_debug_log, "\n" );
1729 Dumps a fully constructed but uncompressed trie in list form.
1730 List tries normally only are used for construction when the number of
1731 possible chars (trie->uniquecharcount) is very high.
1732 Used for debugging make_trie().
1735 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1736 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1740 SV *sv=sv_newmortal();
1741 int colwidth= widecharmap ? 6 : 4;
1742 GET_RE_DEBUG_FLAGS_DECL;
1744 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1746 /* print out the table precompression. */
1747 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1748 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1749 "------:-----+-----------------\n" );
1751 for( state=1 ; state < next_alloc ; state ++ ) {
1754 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1755 (int)depth * 2 + 2,"", (UV)state );
1756 if ( ! trie->states[ state ].wordnum ) {
1757 PerlIO_printf( Perl_debug_log, "%5s| ","");
1759 PerlIO_printf( Perl_debug_log, "W%4x| ",
1760 trie->states[ state ].wordnum
1763 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1764 SV ** const tmp = av_fetch( revcharmap,
1765 TRIE_LIST_ITEM(state,charid).forid, 0);
1767 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1769 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1771 PL_colors[0], PL_colors[1],
1772 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1773 | PERL_PV_ESCAPE_FIRSTCHAR
1775 TRIE_LIST_ITEM(state,charid).forid,
1776 (UV)TRIE_LIST_ITEM(state,charid).newstate
1779 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1780 (int)((depth * 2) + 14), "");
1783 PerlIO_printf( Perl_debug_log, "\n");
1788 Dumps a fully constructed but uncompressed trie in table form.
1789 This is the normal DFA style state transition table, with a few
1790 twists to facilitate compression later.
1791 Used for debugging make_trie().
1794 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1795 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1800 SV *sv=sv_newmortal();
1801 int colwidth= widecharmap ? 6 : 4;
1802 GET_RE_DEBUG_FLAGS_DECL;
1804 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1807 print out the table precompression so that we can do a visual check
1808 that they are identical.
1811 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1813 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1814 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1816 PerlIO_printf( Perl_debug_log, "%*s",
1818 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1819 PL_colors[0], PL_colors[1],
1820 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1821 PERL_PV_ESCAPE_FIRSTCHAR
1827 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1829 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1830 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1833 PerlIO_printf( Perl_debug_log, "\n" );
1835 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1837 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1838 (int)depth * 2 + 2,"",
1839 (UV)TRIE_NODENUM( state ) );
1841 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1844 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1846 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1848 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1849 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1850 (UV)trie->trans[ state ].check );
1852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1853 (UV)trie->trans[ state ].check,
1854 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1862 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1863 startbranch: the first branch in the whole branch sequence
1864 first : start branch of sequence of branch-exact nodes.
1865 May be the same as startbranch
1866 last : Thing following the last branch.
1867 May be the same as tail.
1868 tail : item following the branch sequence
1869 count : words in the sequence
1870 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1871 depth : indent depth
1873 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1875 A trie is an N'ary tree where the branches are determined by digital
1876 decomposition of the key. IE, at the root node you look up the 1st character and
1877 follow that branch repeat until you find the end of the branches. Nodes can be
1878 marked as "accepting" meaning they represent a complete word. Eg:
1882 would convert into the following structure. Numbers represent states, letters
1883 following numbers represent valid transitions on the letter from that state, if
1884 the number is in square brackets it represents an accepting state, otherwise it
1885 will be in parenthesis.
1887 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1891 (1) +-i->(6)-+-s->[7]
1893 +-s->(3)-+-h->(4)-+-e->[5]
1895 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1897 This shows that when matching against the string 'hers' we will begin at state 1
1898 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1899 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1900 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1901 single traverse. We store a mapping from accepting to state to which word was
1902 matched, and then when we have multiple possibilities we try to complete the
1903 rest of the regex in the order in which they occured in the alternation.
1905 The only prior NFA like behaviour that would be changed by the TRIE support is
1906 the silent ignoring of duplicate alternations which are of the form:
1908 / (DUPE|DUPE) X? (?{ ... }) Y /x
1910 Thus EVAL blocks following a trie may be called a different number of times with
1911 and without the optimisation. With the optimisations dupes will be silently
1912 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1913 the following demonstrates:
1915 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1917 which prints out 'word' three times, but
1919 'words'=~/(word|word|word)(?{ print $1 })S/
1921 which doesnt print it out at all. This is due to other optimisations kicking in.
1923 Example of what happens on a structural level:
1925 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1927 1: CURLYM[1] {1,32767}(18)
1938 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1939 and should turn into:
1941 1: CURLYM[1] {1,32767}(18)
1943 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1951 Cases where tail != last would be like /(?foo|bar)baz/:
1961 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1962 and would end up looking like:
1965 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1972 d = uvchr_to_utf8_flags(d, uv, 0);
1974 is the recommended Unicode-aware way of saying
1979 #define TRIE_STORE_REVCHAR(val) \
1982 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1983 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1984 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1985 SvCUR_set(zlopp, kapow - flrbbbbb); \
1988 av_push(revcharmap, zlopp); \
1990 char ooooff = (char)val; \
1991 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1995 /* This gets the next character from the input, folding it if not already
1997 #define TRIE_READ_CHAR STMT_START { \
2000 /* if it is UTF then it is either already folded, or does not need \
2002 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2004 else if (folder == PL_fold_latin1) { \
2005 /* This folder implies Unicode rules, which in the range expressible \
2006 * by not UTF is the lower case, with the two exceptions, one of \
2007 * which should have been taken care of before calling this */ \
2008 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2009 uvc = toLOWER_L1(*uc); \
2010 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2013 /* raw data, will be folded later if needed */ \
2021 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2022 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2023 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2024 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2026 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2027 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2028 TRIE_LIST_CUR( state )++; \
2031 #define TRIE_LIST_NEW(state) STMT_START { \
2032 Newxz( trie->states[ state ].trans.list, \
2033 4, reg_trie_trans_le ); \
2034 TRIE_LIST_CUR( state ) = 1; \
2035 TRIE_LIST_LEN( state ) = 4; \
2038 #define TRIE_HANDLE_WORD(state) STMT_START { \
2039 U16 dupe= trie->states[ state ].wordnum; \
2040 regnode * const noper_next = regnext( noper ); \
2043 /* store the word for dumping */ \
2045 if (OP(noper) != NOTHING) \
2046 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2048 tmp = newSVpvn_utf8( "", 0, UTF ); \
2049 av_push( trie_words, tmp ); \
2053 trie->wordinfo[curword].prev = 0; \
2054 trie->wordinfo[curword].len = wordlen; \
2055 trie->wordinfo[curword].accept = state; \
2057 if ( noper_next < tail ) { \
2059 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2061 trie->jump[curword] = (U16)(noper_next - convert); \
2063 jumper = noper_next; \
2065 nextbranch= regnext(cur); \
2069 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2070 /* chain, so that when the bits of chain are later */\
2071 /* linked together, the dups appear in the chain */\
2072 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2073 trie->wordinfo[dupe].prev = curword; \
2075 /* we haven't inserted this word yet. */ \
2076 trie->states[ state ].wordnum = curword; \
2081 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2082 ( ( base + charid >= ucharcount \
2083 && base + charid < ubound \
2084 && state == trie->trans[ base - ucharcount + charid ].check \
2085 && trie->trans[ base - ucharcount + charid ].next ) \
2086 ? trie->trans[ base - ucharcount + charid ].next \
2087 : ( state==1 ? special : 0 ) \
2091 #define MADE_JUMP_TRIE 2
2092 #define MADE_EXACT_TRIE 4
2095 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2096 regnode *first, regnode *last, regnode *tail,
2097 U32 word_count, U32 flags, U32 depth)
2099 /* first pass, loop through and scan words */
2100 reg_trie_data *trie;
2101 HV *widecharmap = NULL;
2102 AV *revcharmap = newAV();
2108 regnode *jumper = NULL;
2109 regnode *nextbranch = NULL;
2110 regnode *convert = NULL;
2111 U32 *prev_states; /* temp array mapping each state to previous one */
2112 /* we just use folder as a flag in utf8 */
2113 const U8 * folder = NULL;
2116 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2117 AV *trie_words = NULL;
2118 /* along with revcharmap, this only used during construction but both are
2119 * useful during debugging so we store them in the struct when debugging.
2122 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2123 STRLEN trie_charcount=0;
2125 SV *re_trie_maxbuff;
2126 GET_RE_DEBUG_FLAGS_DECL;
2128 PERL_ARGS_ASSERT_MAKE_TRIE;
2130 PERL_UNUSED_ARG(depth);
2137 case EXACTFU: folder = PL_fold_latin1; break;
2138 case EXACTF: folder = PL_fold; break;
2139 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2142 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2144 trie->startstate = 1;
2145 trie->wordcount = word_count;
2146 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2147 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2149 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2150 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2151 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2154 trie_words = newAV();
2157 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2158 assert(re_trie_maxbuff);
2159 if (!SvIOK(re_trie_maxbuff)) {
2160 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2162 DEBUG_TRIE_COMPILE_r({
2163 PerlIO_printf( Perl_debug_log,
2164 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2165 (int)depth * 2 + 2, "",
2166 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2167 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2170 /* Find the node we are going to overwrite */
2171 if ( first == startbranch && OP( last ) != BRANCH ) {
2172 /* whole branch chain */
2175 /* branch sub-chain */
2176 convert = NEXTOPER( first );
2179 /* -- First loop and Setup --
2181 We first traverse the branches and scan each word to determine if it
2182 contains widechars, and how many unique chars there are, this is
2183 important as we have to build a table with at least as many columns as we
2186 We use an array of integers to represent the character codes 0..255
2187 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2188 the native representation of the character value as the key and IV's for
2191 *TODO* If we keep track of how many times each character is used we can
2192 remap the columns so that the table compression later on is more
2193 efficient in terms of memory by ensuring the most common value is in the
2194 middle and the least common are on the outside. IMO this would be better
2195 than a most to least common mapping as theres a decent chance the most
2196 common letter will share a node with the least common, meaning the node
2197 will not be compressible. With a middle is most common approach the worst
2198 case is when we have the least common nodes twice.
2202 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2203 regnode *noper = NEXTOPER( cur );
2204 const U8 *uc = (U8*)STRING( noper );
2205 const U8 *e = uc + STR_LEN( noper );
2207 U32 wordlen = 0; /* required init */
2208 STRLEN minchars = 0;
2209 STRLEN maxchars = 0;
2210 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2213 if (OP(noper) == NOTHING) {
2214 regnode *noper_next= regnext(noper);
2215 if (noper_next != tail && OP(noper_next) == flags) {
2217 uc= (U8*)STRING(noper);
2218 e= uc + STR_LEN(noper);
2219 trie->minlen= STR_LEN(noper);
2226 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2227 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2228 regardless of encoding */
2229 if (OP( noper ) == EXACTFU_SS) {
2230 /* false positives are ok, so just set this */
2231 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2234 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2236 TRIE_CHARCOUNT(trie)++;
2239 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2240 * is in effect. Under /i, this character can match itself, or
2241 * anything that folds to it. If not under /i, it can match just
2242 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2243 * all fold to k, and all are single characters. But some folds
2244 * expand to more than one character, so for example LATIN SMALL
2245 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2246 * the string beginning at 'uc' is 'ffi', it could be matched by
2247 * three characters, or just by the one ligature character. (It
2248 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2249 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2250 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2251 * match.) The trie needs to know the minimum and maximum number
2252 * of characters that could match so that it can use size alone to
2253 * quickly reject many match attempts. The max is simple: it is
2254 * the number of folded characters in this branch (since a fold is
2255 * never shorter than what folds to it. */
2259 /* And the min is equal to the max if not under /i (indicated by
2260 * 'folder' being NULL), or there are no multi-character folds. If
2261 * there is a multi-character fold, the min is incremented just
2262 * once, for the character that folds to the sequence. Each
2263 * character in the sequence needs to be added to the list below of
2264 * characters in the trie, but we count only the first towards the
2265 * min number of characters needed. This is done through the
2266 * variable 'foldlen', which is returned by the macros that look
2267 * for these sequences as the number of bytes the sequence
2268 * occupies. Each time through the loop, we decrement 'foldlen' by
2269 * how many bytes the current char occupies. Only when it reaches
2270 * 0 do we increment 'minchars' or look for another multi-character
2272 if (folder == NULL) {
2275 else if (foldlen > 0) {
2276 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2281 /* See if *uc is the beginning of a multi-character fold. If
2282 * so, we decrement the length remaining to look at, to account
2283 * for the current character this iteration. (We can use 'uc'
2284 * instead of the fold returned by TRIE_READ_CHAR because for
2285 * non-UTF, the latin1_safe macro is smart enough to account
2286 * for all the unfolded characters, and because for UTF, the
2287 * string will already have been folded earlier in the
2288 * compilation process */
2290 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2291 foldlen -= UTF8SKIP(uc);
2294 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2299 /* The current character (and any potential folds) should be added
2300 * to the possible matching characters for this position in this
2304 U8 folded= folder[ (U8) uvc ];
2305 if ( !trie->charmap[ folded ] ) {
2306 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2307 TRIE_STORE_REVCHAR( folded );
2310 if ( !trie->charmap[ uvc ] ) {
2311 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2312 TRIE_STORE_REVCHAR( uvc );
2315 /* store the codepoint in the bitmap, and its folded
2317 TRIE_BITMAP_SET(trie, uvc);
2319 /* store the folded codepoint */
2320 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2323 /* store first byte of utf8 representation of
2324 variant codepoints */
2325 if (! UVCHR_IS_INVARIANT(uvc)) {
2326 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2329 set_bit = 0; /* We've done our bit :-) */
2333 /* XXX We could come up with the list of code points that fold
2334 * to this using PL_utf8_foldclosures, except not for
2335 * multi-char folds, as there may be multiple combinations
2336 * there that could work, which needs to wait until runtime to
2337 * resolve (The comment about LIGATURE FFI above is such an
2342 widecharmap = newHV();
2344 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2347 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2349 if ( !SvTRUE( *svpp ) ) {
2350 sv_setiv( *svpp, ++trie->uniquecharcount );
2351 TRIE_STORE_REVCHAR(uvc);
2354 } /* end loop through characters in this branch of the trie */
2356 /* We take the min and max for this branch and combine to find the min
2357 * and max for all branches processed so far */
2358 if( cur == first ) {
2359 trie->minlen = minchars;
2360 trie->maxlen = maxchars;
2361 } else if (minchars < trie->minlen) {
2362 trie->minlen = minchars;
2363 } else if (maxchars > trie->maxlen) {
2364 trie->maxlen = maxchars;
2366 } /* end first pass */
2367 DEBUG_TRIE_COMPILE_r(
2368 PerlIO_printf( Perl_debug_log,
2369 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2370 (int)depth * 2 + 2,"",
2371 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2372 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2373 (int)trie->minlen, (int)trie->maxlen )
2377 We now know what we are dealing with in terms of unique chars and
2378 string sizes so we can calculate how much memory a naive
2379 representation using a flat table will take. If it's over a reasonable
2380 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2381 conservative but potentially much slower representation using an array
2384 At the end we convert both representations into the same compressed
2385 form that will be used in regexec.c for matching with. The latter
2386 is a form that cannot be used to construct with but has memory
2387 properties similar to the list form and access properties similar
2388 to the table form making it both suitable for fast searches and
2389 small enough that its feasable to store for the duration of a program.
2391 See the comment in the code where the compressed table is produced
2392 inplace from the flat tabe representation for an explanation of how
2393 the compression works.
2398 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2401 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2402 > SvIV(re_trie_maxbuff) )
2405 Second Pass -- Array Of Lists Representation
2407 Each state will be represented by a list of charid:state records
2408 (reg_trie_trans_le) the first such element holds the CUR and LEN
2409 points of the allocated array. (See defines above).
2411 We build the initial structure using the lists, and then convert
2412 it into the compressed table form which allows faster lookups
2413 (but cant be modified once converted).
2416 STRLEN transcount = 1;
2418 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2419 "%*sCompiling trie using list compiler\n",
2420 (int)depth * 2 + 2, ""));
2422 trie->states = (reg_trie_state *)
2423 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2424 sizeof(reg_trie_state) );
2428 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2430 regnode *noper = NEXTOPER( cur );
2431 U8 *uc = (U8*)STRING( noper );
2432 const U8 *e = uc + STR_LEN( noper );
2433 U32 state = 1; /* required init */
2434 U16 charid = 0; /* sanity init */
2435 U32 wordlen = 0; /* required init */
2437 if (OP(noper) == NOTHING) {
2438 regnode *noper_next= regnext(noper);
2439 if (noper_next != tail && OP(noper_next) == flags) {
2441 uc= (U8*)STRING(noper);
2442 e= uc + STR_LEN(noper);
2446 if (OP(noper) != NOTHING) {
2447 for ( ; uc < e ; uc += len ) {
2452 charid = trie->charmap[ uvc ];
2454 SV** const svpp = hv_fetch( widecharmap,
2461 charid=(U16)SvIV( *svpp );
2464 /* charid is now 0 if we dont know the char read, or
2465 * nonzero if we do */
2472 if ( !trie->states[ state ].trans.list ) {
2473 TRIE_LIST_NEW( state );
2476 check <= TRIE_LIST_USED( state );
2479 if ( TRIE_LIST_ITEM( state, check ).forid
2482 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2487 newstate = next_alloc++;
2488 prev_states[newstate] = state;
2489 TRIE_LIST_PUSH( state, charid, newstate );
2494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2498 TRIE_HANDLE_WORD(state);
2500 } /* end second pass */
2502 /* next alloc is the NEXT state to be allocated */
2503 trie->statecount = next_alloc;
2504 trie->states = (reg_trie_state *)
2505 PerlMemShared_realloc( trie->states,
2507 * sizeof(reg_trie_state) );
2509 /* and now dump it out before we compress it */
2510 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2511 revcharmap, next_alloc,
2515 trie->trans = (reg_trie_trans *)
2516 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2523 for( state=1 ; state < next_alloc ; state ++ ) {
2527 DEBUG_TRIE_COMPILE_MORE_r(
2528 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2532 if (trie->states[state].trans.list) {
2533 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2537 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2538 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2539 if ( forid < minid ) {
2541 } else if ( forid > maxid ) {
2545 if ( transcount < tp + maxid - minid + 1) {
2547 trie->trans = (reg_trie_trans *)
2548 PerlMemShared_realloc( trie->trans,
2550 * sizeof(reg_trie_trans) );
2551 Zero( trie->trans + (transcount / 2),
2555 base = trie->uniquecharcount + tp - minid;
2556 if ( maxid == minid ) {
2558 for ( ; zp < tp ; zp++ ) {
2559 if ( ! trie->trans[ zp ].next ) {
2560 base = trie->uniquecharcount + zp - minid;
2561 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2563 trie->trans[ zp ].check = state;
2569 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2571 trie->trans[ tp ].check = state;
2576 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2577 const U32 tid = base
2578 - trie->uniquecharcount
2579 + TRIE_LIST_ITEM( state, idx ).forid;
2580 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2582 trie->trans[ tid ].check = state;
2584 tp += ( maxid - minid + 1 );
2586 Safefree(trie->states[ state ].trans.list);
2589 DEBUG_TRIE_COMPILE_MORE_r(
2590 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2593 trie->states[ state ].trans.base=base;
2595 trie->lasttrans = tp + 1;
2599 Second Pass -- Flat Table Representation.
2601 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2602 each. We know that we will need Charcount+1 trans at most to store
2603 the data (one row per char at worst case) So we preallocate both
2604 structures assuming worst case.
2606 We then construct the trie using only the .next slots of the entry
2609 We use the .check field of the first entry of the node temporarily
2610 to make compression both faster and easier by keeping track of how
2611 many non zero fields are in the node.
2613 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2616 There are two terms at use here: state as a TRIE_NODEIDX() which is
2617 a number representing the first entry of the node, and state as a
2618 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2619 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2620 if there are 2 entrys per node. eg:
2628 The table is internally in the right hand, idx form. However as we
2629 also have to deal with the states array which is indexed by nodenum
2630 we have to use TRIE_NODENUM() to convert.
2633 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2634 "%*sCompiling trie using table compiler\n",
2635 (int)depth * 2 + 2, ""));
2637 trie->trans = (reg_trie_trans *)
2638 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2639 * trie->uniquecharcount + 1,
2640 sizeof(reg_trie_trans) );
2641 trie->states = (reg_trie_state *)
2642 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2643 sizeof(reg_trie_state) );
2644 next_alloc = trie->uniquecharcount + 1;
2647 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2649 regnode *noper = NEXTOPER( cur );
2650 const U8 *uc = (U8*)STRING( noper );
2651 const U8 *e = uc + STR_LEN( noper );
2653 U32 state = 1; /* required init */
2655 U16 charid = 0; /* sanity init */
2656 U32 accept_state = 0; /* sanity init */
2658 U32 wordlen = 0; /* required init */
2660 if (OP(noper) == NOTHING) {
2661 regnode *noper_next= regnext(noper);
2662 if (noper_next != tail && OP(noper_next) == flags) {
2664 uc= (U8*)STRING(noper);
2665 e= uc + STR_LEN(noper);
2669 if ( OP(noper) != NOTHING ) {
2670 for ( ; uc < e ; uc += len ) {
2675 charid = trie->charmap[ uvc ];
2677 SV* const * const svpp = hv_fetch( widecharmap,
2681 charid = svpp ? (U16)SvIV(*svpp) : 0;
2685 if ( !trie->trans[ state + charid ].next ) {
2686 trie->trans[ state + charid ].next = next_alloc;
2687 trie->trans[ state ].check++;
2688 prev_states[TRIE_NODENUM(next_alloc)]
2689 = TRIE_NODENUM(state);
2690 next_alloc += trie->uniquecharcount;
2692 state = trie->trans[ state + charid ].next;
2694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2696 /* charid is now 0 if we dont know the char read, or
2697 * nonzero if we do */
2700 accept_state = TRIE_NODENUM( state );
2701 TRIE_HANDLE_WORD(accept_state);
2703 } /* end second pass */
2705 /* and now dump it out before we compress it */
2706 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2708 next_alloc, depth+1));
2712 * Inplace compress the table.*
2714 For sparse data sets the table constructed by the trie algorithm will
2715 be mostly 0/FAIL transitions or to put it another way mostly empty.
2716 (Note that leaf nodes will not contain any transitions.)
2718 This algorithm compresses the tables by eliminating most such
2719 transitions, at the cost of a modest bit of extra work during lookup:
2721 - Each states[] entry contains a .base field which indicates the
2722 index in the state[] array wheres its transition data is stored.
2724 - If .base is 0 there are no valid transitions from that node.
2726 - If .base is nonzero then charid is added to it to find an entry in
2729 -If trans[states[state].base+charid].check!=state then the
2730 transition is taken to be a 0/Fail transition. Thus if there are fail
2731 transitions at the front of the node then the .base offset will point
2732 somewhere inside the previous nodes data (or maybe even into a node
2733 even earlier), but the .check field determines if the transition is
2737 The following process inplace converts the table to the compressed
2738 table: We first do not compress the root node 1,and mark all its
2739 .check pointers as 1 and set its .base pointer as 1 as well. This
2740 allows us to do a DFA construction from the compressed table later,
2741 and ensures that any .base pointers we calculate later are greater
2744 - We set 'pos' to indicate the first entry of the second node.
2746 - We then iterate over the columns of the node, finding the first and
2747 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2748 and set the .check pointers accordingly, and advance pos
2749 appropriately and repreat for the next node. Note that when we copy
2750 the next pointers we have to convert them from the original
2751 NODEIDX form to NODENUM form as the former is not valid post
2754 - If a node has no transitions used we mark its base as 0 and do not
2755 advance the pos pointer.
2757 - If a node only has one transition we use a second pointer into the
2758 structure to fill in allocated fail transitions from other states.
2759 This pointer is independent of the main pointer and scans forward
2760 looking for null transitions that are allocated to a state. When it
2761 finds one it writes the single transition into the "hole". If the
2762 pointer doesnt find one the single transition is appended as normal.
2764 - Once compressed we can Renew/realloc the structures to release the
2767 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2768 specifically Fig 3.47 and the associated pseudocode.
2772 const U32 laststate = TRIE_NODENUM( next_alloc );
2775 trie->statecount = laststate;
2777 for ( state = 1 ; state < laststate ; state++ ) {
2779 const U32 stateidx = TRIE_NODEIDX( state );
2780 const U32 o_used = trie->trans[ stateidx ].check;
2781 U32 used = trie->trans[ stateidx ].check;
2782 trie->trans[ stateidx ].check = 0;
2785 used && charid < trie->uniquecharcount;
2788 if ( flag || trie->trans[ stateidx + charid ].next ) {
2789 if ( trie->trans[ stateidx + charid ].next ) {
2791 for ( ; zp < pos ; zp++ ) {
2792 if ( ! trie->trans[ zp ].next ) {
2796 trie->states[ state ].trans.base
2798 + trie->uniquecharcount
2800 trie->trans[ zp ].next
2801 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2803 trie->trans[ zp ].check = state;
2804 if ( ++zp > pos ) pos = zp;
2811 trie->states[ state ].trans.base
2812 = pos + trie->uniquecharcount - charid ;
2814 trie->trans[ pos ].next
2815 = SAFE_TRIE_NODENUM(
2816 trie->trans[ stateidx + charid ].next );
2817 trie->trans[ pos ].check = state;
2822 trie->lasttrans = pos + 1;
2823 trie->states = (reg_trie_state *)
2824 PerlMemShared_realloc( trie->states, laststate
2825 * sizeof(reg_trie_state) );
2826 DEBUG_TRIE_COMPILE_MORE_r(
2827 PerlIO_printf( Perl_debug_log,
2828 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2829 (int)depth * 2 + 2,"",
2830 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2834 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2837 } /* end table compress */
2839 DEBUG_TRIE_COMPILE_MORE_r(
2840 PerlIO_printf(Perl_debug_log,
2841 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2842 (int)depth * 2 + 2, "",
2843 (UV)trie->statecount,
2844 (UV)trie->lasttrans)
2846 /* resize the trans array to remove unused space */
2847 trie->trans = (reg_trie_trans *)
2848 PerlMemShared_realloc( trie->trans, trie->lasttrans
2849 * sizeof(reg_trie_trans) );
2851 { /* Modify the program and insert the new TRIE node */
2852 U8 nodetype =(U8)(flags & 0xFF);
2856 regnode *optimize = NULL;
2857 #ifdef RE_TRACK_PATTERN_OFFSETS
2860 U32 mjd_nodelen = 0;
2861 #endif /* RE_TRACK_PATTERN_OFFSETS */
2862 #endif /* DEBUGGING */
2864 This means we convert either the first branch or the first Exact,
2865 depending on whether the thing following (in 'last') is a branch
2866 or not and whther first is the startbranch (ie is it a sub part of
2867 the alternation or is it the whole thing.)
2868 Assuming its a sub part we convert the EXACT otherwise we convert
2869 the whole branch sequence, including the first.
2871 /* Find the node we are going to overwrite */
2872 if ( first != startbranch || OP( last ) == BRANCH ) {
2873 /* branch sub-chain */
2874 NEXT_OFF( first ) = (U16)(last - first);
2875 #ifdef RE_TRACK_PATTERN_OFFSETS
2877 mjd_offset= Node_Offset((convert));
2878 mjd_nodelen= Node_Length((convert));
2881 /* whole branch chain */
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2886 const regnode *nop = NEXTOPER( convert );
2887 mjd_offset= Node_Offset((nop));
2888 mjd_nodelen= Node_Length((nop));
2892 PerlIO_printf(Perl_debug_log,
2893 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2894 (int)depth * 2 + 2, "",
2895 (UV)mjd_offset, (UV)mjd_nodelen)
2898 /* But first we check to see if there is a common prefix we can
2899 split out as an EXACT and put in front of the TRIE node. */
2900 trie->startstate= 1;
2901 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2903 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2907 const U32 base = trie->states[ state ].trans.base;
2909 if ( trie->states[state].wordnum )
2912 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2913 if ( ( base + ofs >= trie->uniquecharcount ) &&
2914 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2915 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2917 if ( ++count > 1 ) {
2918 SV **tmp = av_fetch( revcharmap, ofs, 0);
2919 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2920 if ( state == 1 ) break;
2922 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2924 PerlIO_printf(Perl_debug_log,
2925 "%*sNew Start State=%"UVuf" Class: [",
2926 (int)depth * 2 + 2, "",
2929 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2930 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2932 TRIE_BITMAP_SET(trie,*ch);
2934 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2936 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2940 TRIE_BITMAP_SET(trie,*ch);
2942 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2943 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2949 SV **tmp = av_fetch( revcharmap, idx, 0);
2951 char *ch = SvPV( *tmp, len );
2953 SV *sv=sv_newmortal();
2954 PerlIO_printf( Perl_debug_log,
2955 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2956 (int)depth * 2 + 2, "",
2958 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2959 PL_colors[0], PL_colors[1],
2960 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2961 PERL_PV_ESCAPE_FIRSTCHAR
2966 OP( convert ) = nodetype;
2967 str=STRING(convert);
2970 STR_LEN(convert) += len;
2976 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2981 trie->prefixlen = (state-1);
2983 regnode *n = convert+NODE_SZ_STR(convert);
2984 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2985 trie->startstate = state;
2986 trie->minlen -= (state - 1);
2987 trie->maxlen -= (state - 1);
2989 /* At least the UNICOS C compiler choked on this
2990 * being argument to DEBUG_r(), so let's just have
2993 #ifdef PERL_EXT_RE_BUILD
2999 regnode *fix = convert;
3000 U32 word = trie->wordcount;
3002 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3003 while( ++fix < n ) {
3004 Set_Node_Offset_Length(fix, 0, 0);
3007 SV ** const tmp = av_fetch( trie_words, word, 0 );
3009 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3010 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3012 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3020 NEXT_OFF(convert) = (U16)(tail - convert);
3021 DEBUG_r(optimize= n);
3027 if ( trie->maxlen ) {
3028 NEXT_OFF( convert ) = (U16)(tail - convert);
3029 ARG_SET( convert, data_slot );
3030 /* Store the offset to the first unabsorbed branch in
3031 jump[0], which is otherwise unused by the jump logic.
3032 We use this when dumping a trie and during optimisation. */
3034 trie->jump[0] = (U16)(nextbranch - convert);
3036 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3037 * and there is a bitmap
3038 * and the first "jump target" node we found leaves enough room
3039 * then convert the TRIE node into a TRIEC node, with the bitmap
3040 * embedded inline in the opcode - this is hypothetically faster.
3042 if ( !trie->states[trie->startstate].wordnum
3044 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3046 OP( convert ) = TRIEC;
3047 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3048 PerlMemShared_free(trie->bitmap);
3051 OP( convert ) = TRIE;
3053 /* store the type in the flags */
3054 convert->flags = nodetype;
3058 + regarglen[ OP( convert ) ];
3060 /* XXX We really should free up the resource in trie now,
3061 as we won't use them - (which resources?) dmq */
3063 /* needed for dumping*/
3064 DEBUG_r(if (optimize) {
3065 regnode *opt = convert;
3067 while ( ++opt < optimize) {
3068 Set_Node_Offset_Length(opt,0,0);
3071 Try to clean up some of the debris left after the
3074 while( optimize < jumper ) {
3075 mjd_nodelen += Node_Length((optimize));
3076 OP( optimize ) = OPTIMIZED;
3077 Set_Node_Offset_Length(optimize,0,0);
3080 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3082 } /* end node insert */
3084 /* Finish populating the prev field of the wordinfo array. Walk back
3085 * from each accept state until we find another accept state, and if
3086 * so, point the first word's .prev field at the second word. If the
3087 * second already has a .prev field set, stop now. This will be the
3088 * case either if we've already processed that word's accept state,
3089 * or that state had multiple words, and the overspill words were
3090 * already linked up earlier.
3097 for (word=1; word <= trie->wordcount; word++) {
3099 if (trie->wordinfo[word].prev)
3101 state = trie->wordinfo[word].accept;
3103 state = prev_states[state];
3106 prev = trie->states[state].wordnum;
3110 trie->wordinfo[word].prev = prev;
3112 Safefree(prev_states);
3116 /* and now dump out the compressed format */
3117 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3119 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3121 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3122 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3124 SvREFCNT_dec_NN(revcharmap);
3128 : trie->startstate>1
3134 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3136 /* The Trie is constructed and compressed now so we can build a fail array if
3139 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3141 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3145 We find the fail state for each state in the trie, this state is the longest
3146 proper suffix of the current state's 'word' that is also a proper prefix of
3147 another word in our trie. State 1 represents the word '' and is thus the
3148 default fail state. This allows the DFA not to have to restart after its
3149 tried and failed a word at a given point, it simply continues as though it
3150 had been matching the other word in the first place.
3152 'abcdgu'=~/abcdefg|cdgu/
3153 When we get to 'd' we are still matching the first word, we would encounter
3154 'g' which would fail, which would bring us to the state representing 'd' in
3155 the second word where we would try 'g' and succeed, proceeding to match
3158 /* add a fail transition */
3159 const U32 trie_offset = ARG(source);
3160 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3162 const U32 ucharcount = trie->uniquecharcount;
3163 const U32 numstates = trie->statecount;
3164 const U32 ubound = trie->lasttrans + ucharcount;
3168 U32 base = trie->states[ 1 ].trans.base;
3171 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3173 GET_RE_DEBUG_FLAGS_DECL;
3175 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3176 PERL_UNUSED_CONTEXT;
3178 PERL_UNUSED_ARG(depth);
3181 if ( OP(source) == TRIE ) {
3182 struct regnode_1 *op = (struct regnode_1 *)
3183 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3184 StructCopy(source,op,struct regnode_1);
3185 stclass = (regnode *)op;
3187 struct regnode_charclass *op = (struct regnode_charclass *)
3188 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3189 StructCopy(source,op,struct regnode_charclass);
3190 stclass = (regnode *)op;
3192 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3194 ARG_SET( stclass, data_slot );
3195 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3196 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3197 aho->trie=trie_offset;
3198 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3199 Copy( trie->states, aho->states, numstates, reg_trie_state );
3200 Newxz( q, numstates, U32);
3201 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3204 /* initialize fail[0..1] to be 1 so that we always have
3205 a valid final fail state */
3206 fail[ 0 ] = fail[ 1 ] = 1;
3208 for ( charid = 0; charid < ucharcount ; charid++ ) {
3209 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3211 q[ q_write ] = newstate;
3212 /* set to point at the root */
3213 fail[ q[ q_write++ ] ]=1;
3216 while ( q_read < q_write) {
3217 const U32 cur = q[ q_read++ % numstates ];
3218 base = trie->states[ cur ].trans.base;
3220 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3221 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3223 U32 fail_state = cur;
3226 fail_state = fail[ fail_state ];
3227 fail_base = aho->states[ fail_state ].trans.base;
3228 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3230 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3231 fail[ ch_state ] = fail_state;
3232 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3234 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3236 q[ q_write++ % numstates] = ch_state;
3240 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3241 when we fail in state 1, this allows us to use the
3242 charclass scan to find a valid start char. This is based on the principle
3243 that theres a good chance the string being searched contains lots of stuff
3244 that cant be a start char.
3246 fail[ 0 ] = fail[ 1 ] = 0;
3247 DEBUG_TRIE_COMPILE_r({
3248 PerlIO_printf(Perl_debug_log,
3249 "%*sStclass Failtable (%"UVuf" states): 0",
3250 (int)(depth * 2), "", (UV)numstates
3252 for( q_read=1; q_read<numstates; q_read++ ) {
3253 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3255 PerlIO_printf(Perl_debug_log, "\n");
3258 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3263 #define DEBUG_PEEP(str,scan,depth) \
3264 DEBUG_OPTIMISE_r({if (scan){ \
3265 regnode *Next = regnext(scan); \
3266 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3267 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3268 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3269 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3270 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3271 PerlIO_printf(Perl_debug_log, "\n"); \
3274 /* The below joins as many adjacent EXACTish nodes as possible into a single
3275 * one. The regop may be changed if the node(s) contain certain sequences that
3276 * require special handling. The joining is only done if:
3277 * 1) there is room in the current conglomerated node to entirely contain the
3279 * 2) they are the exact same node type
3281 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3282 * these get optimized out
3284 * If a node is to match under /i (folded), the number of characters it matches
3285 * can be different than its character length if it contains a multi-character
3286 * fold. *min_subtract is set to the total delta number of characters of the
3289 * And *unfolded_multi_char is set to indicate whether or not the node contains
3290 * an unfolded multi-char fold. This happens when whether the fold is valid or
3291 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3292 * SMALL LETTER SHARP S, as only if the target string being matched against
3293 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3294 * folding rules depend on the locale in force at runtime. (Multi-char folds
3295 * whose components are all above the Latin1 range are not run-time locale
3296 * dependent, and have already been folded by the time this function is
3299 * This is as good a place as any to discuss the design of handling these
3300 * multi-character fold sequences. It's been wrong in Perl for a very long
3301 * time. There are three code points in Unicode whose multi-character folds
3302 * were long ago discovered to mess things up. The previous designs for
3303 * dealing with these involved assigning a special node for them. This
3304 * approach doesn't always work, as evidenced by this example:
3305 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3306 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3307 * would match just the \xDF, it won't be able to handle the case where a
3308 * successful match would have to cross the node's boundary. The new approach
3309 * that hopefully generally solves the problem generates an EXACTFU_SS node
3310 * that is "sss" in this case.
3312 * It turns out that there are problems with all multi-character folds, and not
3313 * just these three. Now the code is general, for all such cases. The
3314 * approach taken is:
3315 * 1) This routine examines each EXACTFish node that could contain multi-
3316 * character folded sequences. Since a single character can fold into
3317 * such a sequence, the minimum match length for this node is less than
3318 * the number of characters in the node. This routine returns in
3319 * *min_subtract how many characters to subtract from the the actual
3320 * length of the string to get a real minimum match length; it is 0 if
3321 * there are no multi-char foldeds. This delta is used by the caller to
3322 * adjust the min length of the match, and the delta between min and max,
3323 * so that the optimizer doesn't reject these possibilities based on size
3325 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3326 * is used for an EXACTFU node that contains at least one "ss" sequence in
3327 * it. For non-UTF-8 patterns and strings, this is the only case where
3328 * there is a possible fold length change. That means that a regular
3329 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3330 * with length changes, and so can be processed faster. regexec.c takes
3331 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3332 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3333 * known until runtime). This saves effort in regex matching. However,
3334 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3335 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3336 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3337 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3338 * possibilities for the non-UTF8 patterns are quite simple, except for
3339 * the sharp s. All the ones that don't involve a UTF-8 target string are
3340 * members of a fold-pair, and arrays are set up for all of them so that
3341 * the other member of the pair can be found quickly. Code elsewhere in
3342 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3343 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3344 * described in the next item.
3345 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3346 * validity of the fold won't be known until runtime, and so must remain
3347 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3348 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3349 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3350 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3351 * The reason this is a problem is that the optimizer part of regexec.c
3352 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3353 * that a character in the pattern corresponds to at most a single
3354 * character in the target string. (And I do mean character, and not byte
3355 * here, unlike other parts of the documentation that have never been
3356 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3357 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3358 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3359 * nodes, violate the assumption, and they are the only instances where it
3360 * is violated. I'm reluctant to try to change the assumption, as the
3361 * code involved is impenetrable to me (khw), so instead the code here
3362 * punts. This routine examines EXACTFL nodes, and (when the pattern
3363 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3364 * boolean indicating whether or not the node contains such a fold. When
3365 * it is true, the caller sets a flag that later causes the optimizer in
3366 * this file to not set values for the floating and fixed string lengths,
3367 * and thus avoids the optimizer code in regexec.c that makes the invalid
3368 * assumption. Thus, there is no optimization based on string lengths for
3369 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3370 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3371 * assumption is wrong only in these cases is that all other non-UTF-8
3372 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3373 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3374 * EXACTF nodes because we don't know at compile time if it actually
3375 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3376 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3377 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3378 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3379 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3380 * string would require the pattern to be forced into UTF-8, the overhead
3381 * of which we want to avoid. Similarly the unfolded multi-char folds in
3382 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3385 * Similarly, the code that generates tries doesn't currently handle
3386 * not-already-folded multi-char folds, and it looks like a pain to change
3387 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3388 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3389 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3390 * using /iaa matching will be doing so almost entirely with ASCII
3391 * strings, so this should rarely be encountered in practice */
3393 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3394 if (PL_regkind[OP(scan)] == EXACT) \
3395 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3399 UV *min_subtract, bool *unfolded_multi_char,
3400 U32 flags,regnode *val, U32 depth)
3402 /* Merge several consecutive EXACTish nodes into one. */
3403 regnode *n = regnext(scan);
3405 regnode *next = scan + NODE_SZ_STR(scan);
3409 regnode *stop = scan;
3410 GET_RE_DEBUG_FLAGS_DECL;
3412 PERL_UNUSED_ARG(depth);
3415 PERL_ARGS_ASSERT_JOIN_EXACT;
3416 #ifndef EXPERIMENTAL_INPLACESCAN
3417 PERL_UNUSED_ARG(flags);
3418 PERL_UNUSED_ARG(val);
3420 DEBUG_PEEP("join",scan,depth);
3422 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3423 * EXACT ones that are mergeable to the current one. */
3425 && (PL_regkind[OP(n)] == NOTHING
3426 || (stringok && OP(n) == OP(scan)))
3428 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3431 if (OP(n) == TAIL || n > next)
3433 if (PL_regkind[OP(n)] == NOTHING) {
3434 DEBUG_PEEP("skip:",n,depth);
3435 NEXT_OFF(scan) += NEXT_OFF(n);
3436 next = n + NODE_STEP_REGNODE;
3443 else if (stringok) {
3444 const unsigned int oldl = STR_LEN(scan);
3445 regnode * const nnext = regnext(n);
3447 /* XXX I (khw) kind of doubt that this works on platforms (should
3448 * Perl ever run on one) where U8_MAX is above 255 because of lots
3449 * of other assumptions */
3450 /* Don't join if the sum can't fit into a single node */
3451 if (oldl + STR_LEN(n) > U8_MAX)
3454 DEBUG_PEEP("merg",n,depth);
3457 NEXT_OFF(scan) += NEXT_OFF(n);
3458 STR_LEN(scan) += STR_LEN(n);
3459 next = n + NODE_SZ_STR(n);
3460 /* Now we can overwrite *n : */
3461 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3469 #ifdef EXPERIMENTAL_INPLACESCAN
3470 if (flags && !NEXT_OFF(n)) {
3471 DEBUG_PEEP("atch", val, depth);
3472 if (reg_off_by_arg[OP(n)]) {
3473 ARG_SET(n, val - n);
3476 NEXT_OFF(n) = val - n;
3484 *unfolded_multi_char = FALSE;
3486 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3487 * can now analyze for sequences of problematic code points. (Prior to
3488 * this final joining, sequences could have been split over boundaries, and
3489 * hence missed). The sequences only happen in folding, hence for any
3490 * non-EXACT EXACTish node */
3491 if (OP(scan) != EXACT) {
3492 U8* s0 = (U8*) STRING(scan);
3494 U8* s_end = s0 + STR_LEN(scan);
3496 int total_count_delta = 0; /* Total delta number of characters that
3497 multi-char folds expand to */
3499 /* One pass is made over the node's string looking for all the
3500 * possibilities. To avoid some tests in the loop, there are two main
3501 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3506 if (OP(scan) == EXACTFL) {
3509 /* An EXACTFL node would already have been changed to another
3510 * node type unless there is at least one character in it that
3511 * is problematic; likely a character whose fold definition
3512 * won't be known until runtime, and so has yet to be folded.
3513 * For all but the UTF-8 locale, folds are 1-1 in length, but
3514 * to handle the UTF-8 case, we need to create a temporary
3515 * folded copy using UTF-8 locale rules in order to analyze it.
3516 * This is because our macros that look to see if a sequence is
3517 * a multi-char fold assume everything is folded (otherwise the
3518 * tests in those macros would be too complicated and slow).
3519 * Note that here, the non-problematic folds will have already
3520 * been done, so we can just copy such characters. We actually
3521 * don't completely fold the EXACTFL string. We skip the
3522 * unfolded multi-char folds, as that would just create work
3523 * below to figure out the size they already are */
3525 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3528 STRLEN s_len = UTF8SKIP(s);
3529 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3530 Copy(s, d, s_len, U8);
3533 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3534 *unfolded_multi_char = TRUE;
3535 Copy(s, d, s_len, U8);
3538 else if (isASCII(*s)) {
3539 *(d++) = toFOLD(*s);
3543 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3549 /* Point the remainder of the routine to look at our temporary
3553 } /* End of creating folded copy of EXACTFL string */
3555 /* Examine the string for a multi-character fold sequence. UTF-8
3556 * patterns have all characters pre-folded by the time this code is
3558 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3559 length sequence we are looking for is 2 */
3561 int count = 0; /* How many characters in a multi-char fold */
3562 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3563 if (! len) { /* Not a multi-char fold: get next char */
3568 /* Nodes with 'ss' require special handling, except for
3569 * EXACTFA-ish for which there is no multi-char fold to this */
3570 if (len == 2 && *s == 's' && *(s+1) == 's'
3571 && OP(scan) != EXACTFA
3572 && OP(scan) != EXACTFA_NO_TRIE)
3575 if (OP(scan) != EXACTFL) {
3576 OP(scan) = EXACTFU_SS;
3580 else { /* Here is a generic multi-char fold. */
3581 U8* multi_end = s + len;
3583 /* Count how many characters are in it. In the case of
3584 * /aa, no folds which contain ASCII code points are
3585 * allowed, so check for those, and skip if found. */
3586 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3587 count = utf8_length(s, multi_end);
3591 while (s < multi_end) {
3594 goto next_iteration;
3604 /* The delta is how long the sequence is minus 1 (1 is how long
3605 * the character that folds to the sequence is) */
3606 total_count_delta += count - 1;
3610 /* We created a temporary folded copy of the string in EXACTFL
3611 * nodes. Therefore we need to be sure it doesn't go below zero,
3612 * as the real string could be shorter */
3613 if (OP(scan) == EXACTFL) {
3614 int total_chars = utf8_length((U8*) STRING(scan),
3615 (U8*) STRING(scan) + STR_LEN(scan));
3616 if (total_count_delta > total_chars) {
3617 total_count_delta = total_chars;
3621 *min_subtract += total_count_delta;
3624 else if (OP(scan) == EXACTFA) {
3626 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3627 * fold to the ASCII range (and there are no existing ones in the
3628 * upper latin1 range). But, as outlined in the comments preceding
3629 * this function, we need to flag any occurrences of the sharp s.
3630 * This character forbids trie formation (because of added
3633 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3634 OP(scan) = EXACTFA_NO_TRIE;
3635 *unfolded_multi_char = TRUE;
3644 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3645 * folds that are all Latin1. As explained in the comments
3646 * preceding this function, we look also for the sharp s in EXACTF
3647 * and EXACTFL nodes; it can be in the final position. Otherwise
3648 * we can stop looking 1 byte earlier because have to find at least
3649 * two characters for a multi-fold */
3650 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3655 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3656 if (! len) { /* Not a multi-char fold. */
3657 if (*s == LATIN_SMALL_LETTER_SHARP_S
3658 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3660 *unfolded_multi_char = TRUE;
3667 && isALPHA_FOLD_EQ(*s, 's')
3668 && isALPHA_FOLD_EQ(*(s+1), 's'))
3671 /* EXACTF nodes need to know that the minimum length
3672 * changed so that a sharp s in the string can match this
3673 * ss in the pattern, but they remain EXACTF nodes, as they
3674 * won't match this unless the target string is is UTF-8,
3675 * which we don't know until runtime. EXACTFL nodes can't
3676 * transform into EXACTFU nodes */
3677 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3678 OP(scan) = EXACTFU_SS;
3682 *min_subtract += len - 1;
3689 /* Allow dumping but overwriting the collection of skipped
3690 * ops and/or strings with fake optimized ops */
3691 n = scan + NODE_SZ_STR(scan);
3699 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3703 /* REx optimizer. Converts nodes into quicker variants "in place".
3704 Finds fixed substrings. */
3706 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3707 to the position after last scanned or to NULL. */
3709 #define INIT_AND_WITHP \
3710 assert(!and_withp); \
3711 Newx(and_withp,1, regnode_ssc); \
3712 SAVEFREEPV(and_withp)
3716 S_unwind_scan_frames(pTHX_ const void *p)
3718 scan_frame *f= (scan_frame *)p;
3720 scan_frame *n= f->next_frame;
3728 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3729 SSize_t *minlenp, SSize_t *deltap,
3734 regnode_ssc *and_withp,
3735 U32 flags, U32 depth)
3736 /* scanp: Start here (read-write). */
3737 /* deltap: Write maxlen-minlen here. */
3738 /* last: Stop before this one. */
3739 /* data: string data about the pattern */
3740 /* stopparen: treat close N as END */
3741 /* recursed: which subroutines have we recursed into */
3742 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3744 /* There must be at least this number of characters to match */
3747 regnode *scan = *scanp, *next;
3749 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3750 int is_inf_internal = 0; /* The studied chunk is infinite */
3751 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3752 scan_data_t data_fake;
3753 SV *re_trie_maxbuff = NULL;
3754 regnode *first_non_open = scan;
3755 SSize_t stopmin = SSize_t_MAX;
3756 scan_frame *frame = NULL;
3757 GET_RE_DEBUG_FLAGS_DECL;
3759 PERL_ARGS_ASSERT_STUDY_CHUNK;
3763 while (first_non_open && OP(first_non_open) == OPEN)
3764 first_non_open=regnext(first_non_open);
3770 RExC_study_chunk_recursed_count++;
3772 DEBUG_OPTIMISE_MORE_r(
3774 PerlIO_printf(Perl_debug_log,
3775 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3776 (int)(depth*2), "", (long)stopparen,
3777 (unsigned long)RExC_study_chunk_recursed_count,
3778 (unsigned long)depth, (unsigned long)recursed_depth,
3781 if (recursed_depth) {
3784 for ( j = 0 ; j < recursed_depth ; j++ ) {
3785 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3787 PAREN_TEST(RExC_study_chunk_recursed +
3788 ( j * RExC_study_chunk_recursed_bytes), i )
3791 !PAREN_TEST(RExC_study_chunk_recursed +
3792 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3795 PerlIO_printf(Perl_debug_log," %d",(int)i);
3799 if ( j + 1 < recursed_depth ) {
3800 PerlIO_printf(Perl_debug_log, ",");
3804 PerlIO_printf(Perl_debug_log,"\n");
3807 while ( scan && OP(scan) != END && scan < last ){
3808 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3809 node length to get a real minimum (because
3810 the folded version may be shorter) */
3811 bool unfolded_multi_char = FALSE;
3812 /* Peephole optimizer: */
3813 DEBUG_STUDYDATA("Peep:", data, depth);
3814 DEBUG_PEEP("Peep", scan, depth);
3817 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3818 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3819 * by a different invocation of reg() -- Yves
3821 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3823 /* Follow the next-chain of the current node and optimize
3824 away all the NOTHINGs from it. */
3825 if (OP(scan) != CURLYX) {
3826 const int max = (reg_off_by_arg[OP(scan)]
3828 /* I32 may be smaller than U16 on CRAYs! */
3829 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3830 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3834 /* Skip NOTHING and LONGJMP. */
3835 while ((n = regnext(n))
3836 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3837 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3838 && off + noff < max)
3840 if (reg_off_by_arg[OP(scan)])
3843 NEXT_OFF(scan) = off;
3846 /* The principal pseudo-switch. Cannot be a switch, since we
3847 look into several different things. */
3848 if ( OP(scan) == DEFINEP ) {
3850 SSize_t deltanext = 0;
3851 SSize_t fake_last_close = 0;
3852 I32 f = SCF_IN_DEFINE;
3854 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3855 scan = regnext(scan);
3856 assert( OP(scan) == IFTHEN );
3857 DEBUG_PEEP("expect IFTHEN", scan, depth);
3859 data_fake.last_closep= &fake_last_close;
3861 next = regnext(scan);
3862 scan = NEXTOPER(NEXTOPER(scan));
3863 DEBUG_PEEP("scan", scan, depth);
3864 DEBUG_PEEP("next", next, depth);
3866 /* we suppose the run is continuous, last=next...
3867 * NOTE we dont use the return here! */
3868 (void)study_chunk(pRExC_state, &scan, &minlen,
3869 &deltanext, next, &data_fake, stopparen,
3870 recursed_depth, NULL, f, depth+1);
3875 OP(scan) == BRANCH ||
3876 OP(scan) == BRANCHJ ||
3879 next = regnext(scan);
3882 /* The op(next)==code check below is to see if we
3883 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3884 * IFTHEN is special as it might not appear in pairs.
3885 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3886 * we dont handle it cleanly. */
3887 if (OP(next) == code || code == IFTHEN) {
3888 /* NOTE - There is similar code to this block below for
3889 * handling TRIE nodes on a re-study. If you change stuff here
3890 * check there too. */
3891 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3893 regnode * const startbranch=scan;
3895 if (flags & SCF_DO_SUBSTR) {
3896 /* Cannot merge strings after this. */
3897 scan_commit(pRExC_state, data, minlenp, is_inf);
3900 if (flags & SCF_DO_STCLASS)
3901 ssc_init_zero(pRExC_state, &accum);
3903 while (OP(scan) == code) {
3904 SSize_t deltanext, minnext, fake;
3906 regnode_ssc this_class;
3908 DEBUG_PEEP("Branch", scan, depth);
3911 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3913 data_fake.whilem_c = data->whilem_c;
3914 data_fake.last_closep = data->last_closep;
3917 data_fake.last_closep = &fake;
3919 data_fake.pos_delta = delta;
3920 next = regnext(scan);
3922 scan = NEXTOPER(scan); /* everything */
3923 if (code != BRANCH) /* everything but BRANCH */
3924 scan = NEXTOPER(scan);
3926 if (flags & SCF_DO_STCLASS) {
3927 ssc_init(pRExC_state, &this_class);
3928 data_fake.start_class = &this_class;
3929 f = SCF_DO_STCLASS_AND;
3931 if (flags & SCF_WHILEM_VISITED_POS)
3932 f |= SCF_WHILEM_VISITED_POS;
3934 /* we suppose the run is continuous, last=next...*/
3935 minnext = study_chunk(pRExC_state, &scan, minlenp,
3936 &deltanext, next, &data_fake, stopparen,
3937 recursed_depth, NULL, f,depth+1);
3941 if (deltanext == SSize_t_MAX) {
3942 is_inf = is_inf_internal = 1;
3944 } else if (max1 < minnext + deltanext)
3945 max1 = minnext + deltanext;
3947 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3949 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3950 if ( stopmin > minnext)
3951 stopmin = min + min1;
3952 flags &= ~SCF_DO_SUBSTR;
3954 data->flags |= SCF_SEEN_ACCEPT;
3957 if (data_fake.flags & SF_HAS_EVAL)
3958 data->flags |= SF_HAS_EVAL;
3959 data->whilem_c = data_fake.whilem_c;
3961 if (flags & SCF_DO_STCLASS)
3962 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3964 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3966 if (flags & SCF_DO_SUBSTR) {
3967 data->pos_min += min1;
3968 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3969 data->pos_delta = SSize_t_MAX;
3971 data->pos_delta += max1 - min1;
3972 if (max1 != min1 || is_inf)
3973 data->longest = &(data->longest_float);
3976 if (delta == SSize_t_MAX
3977 || SSize_t_MAX - delta - (max1 - min1) < 0)
3978 delta = SSize_t_MAX;
3980 delta += max1 - min1;
3981 if (flags & SCF_DO_STCLASS_OR) {
3982 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3984 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3985 flags &= ~SCF_DO_STCLASS;
3988 else if (flags & SCF_DO_STCLASS_AND) {
3990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3991 flags &= ~SCF_DO_STCLASS;
3994 /* Switch to OR mode: cache the old value of
3995 * data->start_class */
3997 StructCopy(data->start_class, and_withp, regnode_ssc);
3998 flags &= ~SCF_DO_STCLASS_AND;
3999 StructCopy(&accum, data->start_class, regnode_ssc);
4000 flags |= SCF_DO_STCLASS_OR;
4004 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4005 OP( startbranch ) == BRANCH )
4009 Assuming this was/is a branch we are dealing with: 'scan'
4010 now points at the item that follows the branch sequence,
4011 whatever it is. We now start at the beginning of the
4012 sequence and look for subsequences of
4018 which would be constructed from a pattern like
4021 If we can find such a subsequence we need to turn the first
4022 element into a trie and then add the subsequent branch exact
4023 strings to the trie.
4027 1. patterns where the whole set of branches can be
4030 2. patterns where only a subset can be converted.
4032 In case 1 we can replace the whole set with a single regop
4033 for the trie. In case 2 we need to keep the start and end
4036 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4037 becomes BRANCH TRIE; BRANCH X;
4039 There is an additional case, that being where there is a
4040 common prefix, which gets split out into an EXACT like node
4041 preceding the TRIE node.
4043 If x(1..n)==tail then we can do a simple trie, if not we make
4044 a "jump" trie, such that when we match the appropriate word
4045 we "jump" to the appropriate tail node. Essentially we turn
4046 a nested if into a case structure of sorts.
4051 if (!re_trie_maxbuff) {
4052 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4053 if (!SvIOK(re_trie_maxbuff))
4054 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4056 if ( SvIV(re_trie_maxbuff)>=0 ) {
4058 regnode *first = (regnode *)NULL;
4059 regnode *last = (regnode *)NULL;
4060 regnode *tail = scan;
4064 /* var tail is used because there may be a TAIL
4065 regop in the way. Ie, the exacts will point to the
4066 thing following the TAIL, but the last branch will
4067 point at the TAIL. So we advance tail. If we
4068 have nested (?:) we may have to move through several
4072 while ( OP( tail ) == TAIL ) {
4073 /* this is the TAIL generated by (?:) */
4074 tail = regnext( tail );
4078 DEBUG_TRIE_COMPILE_r({
4079 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4080 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4081 (int)depth * 2 + 2, "",
4082 "Looking for TRIE'able sequences. Tail node is: ",
4083 SvPV_nolen_const( RExC_mysv )
4089 Step through the branches
4090 cur represents each branch,
4091 noper is the first thing to be matched as part
4093 noper_next is the regnext() of that node.
4095 We normally handle a case like this
4096 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4097 support building with NOJUMPTRIE, which restricts
4098 the trie logic to structures like /FOO|BAR/.
4100 If noper is a trieable nodetype then the branch is
4101 a possible optimization target. If we are building
4102 under NOJUMPTRIE then we require that noper_next is
4103 the same as scan (our current position in the regex
4106 Once we have two or more consecutive such branches
4107 we can create a trie of the EXACT's contents and
4108 stitch it in place into the program.
4110 If the sequence represents all of the branches in
4111 the alternation we replace the entire thing with a
4114 Otherwise when it is a subsequence we need to
4115 stitch it in place and replace only the relevant
4116 branches. This means the first branch has to remain
4117 as it is used by the alternation logic, and its
4118 next pointer, and needs to be repointed at the item
4119 on the branch chain following the last branch we
4120 have optimized away.
4122 This could be either a BRANCH, in which case the
4123 subsequence is internal, or it could be the item
4124 following the branch sequence in which case the
4125 subsequence is at the end (which does not
4126 necessarily mean the first node is the start of the
4129 TRIE_TYPE(X) is a define which maps the optype to a
4133 ----------------+-----------
4137 EXACTFU_SS | EXACTFU
4142 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
4143 ( EXACT == (X) ) ? EXACT : \
4144 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
4145 ( EXACTFA == (X) ) ? EXACTFA : \
4148 /* dont use tail as the end marker for this traverse */
4149 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4150 regnode * const noper = NEXTOPER( cur );
4151 U8 noper_type = OP( noper );
4152 U8 noper_trietype = TRIE_TYPE( noper_type );
4153 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4154 regnode * const noper_next = regnext( noper );
4155 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4156 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4159 DEBUG_TRIE_COMPILE_r({
4160 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4161 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4162 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4164 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4165 PerlIO_printf( Perl_debug_log, " -> %s",
4166 SvPV_nolen_const(RExC_mysv));
4169 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4170 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4171 SvPV_nolen_const(RExC_mysv));
4173 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4174 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4175 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4179 /* Is noper a trieable nodetype that can be merged
4180 * with the current trie (if there is one)? */
4184 ( noper_trietype == NOTHING)
4185 || ( trietype == NOTHING )
4186 || ( trietype == noper_trietype )
4189 && noper_next == tail
4193 /* Handle mergable triable node Either we are
4194 * the first node in a new trieable sequence,
4195 * in which case we do some bookkeeping,
4196 * otherwise we update the end pointer. */
4199 if ( noper_trietype == NOTHING ) {
4200 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4201 regnode * const noper_next = regnext( noper );
4202 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4203 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4206 if ( noper_next_trietype ) {
4207 trietype = noper_next_trietype;
4208 } else if (noper_next_type) {
4209 /* a NOTHING regop is 1 regop wide.
4210 * We need at least two for a trie
4211 * so we can't merge this in */
4215 trietype = noper_trietype;
4218 if ( trietype == NOTHING )
4219 trietype = noper_trietype;
4224 } /* end handle mergable triable node */
4226 /* handle unmergable node -
4227 * noper may either be a triable node which can
4228 * not be tried together with the current trie,
4229 * or a non triable node */
4231 /* If last is set and trietype is not
4232 * NOTHING then we have found at least two
4233 * triable branch sequences in a row of a
4234 * similar trietype so we can turn them
4235 * into a trie. If/when we allow NOTHING to
4236 * start a trie sequence this condition
4237 * will be required, and it isn't expensive
4238 * so we leave it in for now. */
4239 if ( trietype && trietype != NOTHING )
4240 make_trie( pRExC_state,
4241 startbranch, first, cur, tail,
4242 count, trietype, depth+1 );
4243 last = NULL; /* note: we clear/update
4244 first, trietype etc below,
4245 so we dont do it here */
4249 && noper_next == tail
4252 /* noper is triable, so we can start a new
4256 trietype = noper_trietype;
4258 /* if we already saw a first but the
4259 * current node is not triable then we have
4260 * to reset the first information. */
4265 } /* end handle unmergable node */
4266 } /* loop over branches */
4267 DEBUG_TRIE_COMPILE_r({
4268 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4269 PerlIO_printf( Perl_debug_log,
4270 "%*s- %s (%d) <SCAN FINISHED>\n",
4272 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4275 if ( last && trietype ) {
4276 if ( trietype != NOTHING ) {
4277 /* the last branch of the sequence was part of
4278 * a trie, so we have to construct it here
4279 * outside of the loop */
4280 made= make_trie( pRExC_state, startbranch,
4281 first, scan, tail, count,
4282 trietype, depth+1 );
4283 #ifdef TRIE_STUDY_OPT
4284 if ( ((made == MADE_EXACT_TRIE &&
4285 startbranch == first)
4286 || ( first_non_open == first )) &&
4288 flags |= SCF_TRIE_RESTUDY;
4289 if ( startbranch == first
4292 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4297 /* at this point we know whatever we have is a
4298 * NOTHING sequence/branch AND if 'startbranch'
4299 * is 'first' then we can turn the whole thing
4302 if ( startbranch == first ) {
4304 /* the entire thing is a NOTHING sequence,
4305 * something like this: (?:|) So we can
4306 * turn it into a plain NOTHING op. */
4307 DEBUG_TRIE_COMPILE_r({
4308 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309 PerlIO_printf( Perl_debug_log,
4310 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4311 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4314 OP(startbranch)= NOTHING;
4315 NEXT_OFF(startbranch)= tail - startbranch;
4316 for ( opt= startbranch + 1; opt < tail ; opt++ )
4320 } /* end if ( last) */
4321 } /* TRIE_MAXBUF is non zero */
4326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4327 scan = NEXTOPER(NEXTOPER(scan));
4328 } else /* single branch is optimized. */
4329 scan = NEXTOPER(scan);
4331 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4333 regnode *start = NULL;
4334 regnode *end = NULL;
4335 U32 my_recursed_depth= recursed_depth;
4338 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4339 /* Do setup, note this code has side effects beyond
4340 * the rest of this block. Specifically setting
4341 * RExC_recurse[] must happen at least once during
4343 if (OP(scan) == GOSUB) {
4345 RExC_recurse[ARG2L(scan)] = scan;
4346 start = RExC_open_parens[paren-1];
4347 end = RExC_close_parens[paren-1];
4349 start = RExC_rxi->program + 1;
4352 /* NOTE we MUST always execute the above code, even
4353 * if we do nothing with a GOSUB/GOSTART */
4355 ( flags & SCF_IN_DEFINE )
4358 (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4360 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4363 /* no need to do anything here if we are in a define. */
4364 /* or we are after some kind of infinite construct
4365 * so we can skip recursing into this item.
4366 * Since it is infinite we will not change the maxlen
4367 * or delta, and if we miss something that might raise
4368 * the minlen it will merely pessimise a little.
4370 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4371 * might result in a minlen of 1 and not of 4,
4372 * but this doesn't make us mismatch, just try a bit
4373 * harder than we should.
4375 scan= regnext(scan);
4382 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4384 /* it is quite possible that there are more efficient ways
4385 * to do this. We maintain a bitmap per level of recursion
4386 * of which patterns we have entered so we can detect if a
4387 * pattern creates a possible infinite loop. When we
4388 * recurse down a level we copy the previous levels bitmap
4389 * down. When we are at recursion level 0 we zero the top
4390 * level bitmap. It would be nice to implement a different
4391 * more efficient way of doing this. In particular the top
4392 * level bitmap may be unnecessary.
4394 if (!recursed_depth) {
4395 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4397 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4398 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4399 RExC_study_chunk_recursed_bytes, U8);
4401 /* we havent recursed into this paren yet, so recurse into it */
4402 DEBUG_STUDYDATA("set:", data,depth);
4403 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4404 my_recursed_depth= recursed_depth + 1;
4406 DEBUG_STUDYDATA("inf:", data,depth);
4407 /* some form of infinite recursion, assume infinite length
4409 if (flags & SCF_DO_SUBSTR) {
4410 scan_commit(pRExC_state, data, minlenp, is_inf);
4411 data->longest = &(data->longest_float);
4413 is_inf = is_inf_internal = 1;
4414 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4415 ssc_anything(data->start_class);
4416 flags &= ~SCF_DO_STCLASS;
4418 start= NULL; /* reset start so we dont recurse later on. */
4423 end = regnext(scan);
4426 scan_frame *newframe;
4428 if (!RExC_frame_last) {
4429 Newxz(newframe, 1, scan_frame);
4430 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4431 RExC_frame_head= newframe;
4433 } else if (!RExC_frame_last->next_frame) {
4434 Newxz(newframe,1,scan_frame);
4435 RExC_frame_last->next_frame= newframe;
4436 newframe->prev_frame= RExC_frame_last;
4439 newframe= RExC_frame_last->next_frame;
4441 RExC_frame_last= newframe;
4443 newframe->next_regnode = regnext(scan);
4444 newframe->last_regnode = last;
4445 newframe->stopparen = stopparen;
4446 newframe->prev_recursed_depth = recursed_depth;
4447 newframe->this_prev_frame= frame;
4449 DEBUG_STUDYDATA("frame-new:",data,depth);
4450 DEBUG_PEEP("fnew", scan, depth);
4457 recursed_depth= my_recursed_depth;
4462 else if (OP(scan) == EXACT) {
4463 SSize_t l = STR_LEN(scan);
4466 const U8 * const s = (U8*)STRING(scan);
4467 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4468 l = utf8_length(s, s + l);
4470 uc = *((U8*)STRING(scan));
4473 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4474 /* The code below prefers earlier match for fixed
4475 offset, later match for variable offset. */
4476 if (data->last_end == -1) { /* Update the start info. */
4477 data->last_start_min = data->pos_min;
4478 data->last_start_max = is_inf
4479 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4481 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4483 SvUTF8_on(data->last_found);
4485 SV * const sv = data->last_found;
4486 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4487 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4488 if (mg && mg->mg_len >= 0)
4489 mg->mg_len += utf8_length((U8*)STRING(scan),
4490 (U8*)STRING(scan)+STR_LEN(scan));
4492 data->last_end = data->pos_min + l;
4493 data->pos_min += l; /* As in the first entry. */
4494 data->flags &= ~SF_BEFORE_EOL;
4497 /* ANDing the code point leaves at most it, and not in locale, and
4498 * can't match null string */
4499 if (flags & SCF_DO_STCLASS_AND) {
4500 ssc_cp_and(data->start_class, uc);
4501 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4502 ssc_clear_locale(data->start_class);
4504 else if (flags & SCF_DO_STCLASS_OR) {
4505 ssc_add_cp(data->start_class, uc);
4506 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4508 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4509 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4511 flags &= ~SCF_DO_STCLASS;
4513 else if (PL_regkind[OP(scan)] == EXACT) {
4514 /* But OP != EXACT!, so is EXACTFish */
4515 SSize_t l = STR_LEN(scan);
4516 UV uc = *((U8*)STRING(scan));
4517 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4518 separate code points */
4519 const U8 * s = (U8*)STRING(scan);
4521 /* Search for fixed substrings supports EXACT only. */
4522 if (flags & SCF_DO_SUBSTR) {
4524 scan_commit(pRExC_state, data, minlenp, is_inf);
4527 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4528 l = utf8_length(s, s + l);
4530 if (unfolded_multi_char) {
4531 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4533 min += l - min_subtract;
4535 delta += min_subtract;
4536 if (flags & SCF_DO_SUBSTR) {
4537 data->pos_min += l - min_subtract;
4538 if (data->pos_min < 0) {
4541 data->pos_delta += min_subtract;
4543 data->longest = &(data->longest_float);
4547 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4548 ssc_clear_locale(data->start_class);
4553 /* We punt and assume can match anything if the node begins
4554 * with a multi-character fold. Things are complicated. For
4555 * example, /ffi/i could match any of:
4556 * "\N{LATIN SMALL LIGATURE FFI}"
4557 * "\N{LATIN SMALL LIGATURE FF}I"
4558 * "F\N{LATIN SMALL LIGATURE FI}"
4559 * plus several other things; and making sure we have all the
4560 * possibilities is hard. */
4561 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4563 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4567 /* Any Latin1 range character can potentially match any
4568 * other depending on the locale */
4569 if (OP(scan) == EXACTFL) {
4570 _invlist_union(EXACTF_invlist, PL_Latin1,
4574 /* But otherwise, it matches at least itself. We can
4575 * quickly tell if it has a distinct fold, and if so,
4576 * it matches that as well */
4577 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4578 if (IS_IN_SOME_FOLD_L1(uc)) {
4579 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4580 PL_fold_latin1[uc]);
4584 /* Some characters match above-Latin1 ones under /i. This
4585 * is true of EXACTFL ones when the locale is UTF-8 */
4586 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4587 && (! isASCII(uc) || (OP(scan) != EXACTFA
4588 && OP(scan) != EXACTFA_NO_TRIE)))
4590 add_above_Latin1_folds(pRExC_state,
4596 else { /* Pattern is UTF-8 */
4597 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4598 STRLEN foldlen = UTF8SKIP(s);
4599 const U8* e = s + STR_LEN(scan);
4602 /* The only code points that aren't folded in a UTF EXACTFish
4603 * node are are the problematic ones in EXACTFL nodes */
4604 if (OP(scan) == EXACTFL
4605 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4607 /* We need to check for the possibility that this EXACTFL
4608 * node begins with a multi-char fold. Therefore we fold
4609 * the first few characters of it so that we can make that
4614 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4616 *(d++) = (U8) toFOLD(*s);
4621 to_utf8_fold(s, d, &len);
4627 /* And set up so the code below that looks in this folded
4628 * buffer instead of the node's string */
4630 foldlen = UTF8SKIP(folded);
4634 /* When we reach here 's' points to the fold of the first
4635 * character(s) of the node; and 'e' points to far enough along
4636 * the folded string to be just past any possible multi-char
4637 * fold. 'foldlen' is the length in bytes of the first
4640 * Unlike the non-UTF-8 case, the macro for determining if a
4641 * string is a multi-char fold requires all the characters to
4642 * already be folded. This is because of all the complications
4643 * if not. Note that they are folded anyway, except in EXACTFL
4644 * nodes. Like the non-UTF case above, we punt if the node
4645 * begins with a multi-char fold */
4647 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4649 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4651 else { /* Single char fold */
4653 /* It matches all the things that fold to it, which are
4654 * found in PL_utf8_foldclosures (including itself) */
4655 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4656 if (! PL_utf8_foldclosures) {
4657 _load_PL_utf8_foldclosures();
4659 if ((listp = hv_fetch(PL_utf8_foldclosures,
4660 (char *) s, foldlen, FALSE)))
4662 AV* list = (AV*) *listp;
4664 for (k = 0; k <= av_tindex(list); k++) {
4665 SV** c_p = av_fetch(list, k, FALSE);
4671 /* /aa doesn't allow folds between ASCII and non- */
4672 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4673 && isASCII(c) != isASCII(uc))
4678 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4683 if (flags & SCF_DO_STCLASS_AND) {
4684 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685 ANYOF_POSIXL_ZERO(data->start_class);
4686 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4688 else if (flags & SCF_DO_STCLASS_OR) {
4689 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4690 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4692 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4693 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4695 flags &= ~SCF_DO_STCLASS;
4696 SvREFCNT_dec(EXACTF_invlist);
4698 else if (REGNODE_VARIES(OP(scan))) {
4699 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4700 I32 fl = 0, f = flags;
4701 regnode * const oscan = scan;
4702 regnode_ssc this_class;
4703 regnode_ssc *oclass = NULL;
4704 I32 next_is_eval = 0;
4706 switch (PL_regkind[OP(scan)]) {
4707 case WHILEM: /* End of (?:...)* . */
4708 scan = NEXTOPER(scan);
4711 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4712 next = NEXTOPER(scan);
4713 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4715 maxcount = REG_INFTY;
4716 next = regnext(scan);
4717 scan = NEXTOPER(scan);
4721 if (flags & SCF_DO_SUBSTR)
4726 if (flags & SCF_DO_STCLASS) {
4728 maxcount = REG_INFTY;
4729 next = regnext(scan);
4730 scan = NEXTOPER(scan);
4733 if (flags & SCF_DO_SUBSTR) {
4734 scan_commit(pRExC_state, data, minlenp, is_inf);
4735 /* Cannot extend fixed substrings */
4736 data->longest = &(data->longest_float);
4738 is_inf = is_inf_internal = 1;
4739 scan = regnext(scan);
4740 goto optimize_curly_tail;
4742 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4743 && (scan->flags == stopparen))
4748 mincount = ARG1(scan);
4749 maxcount = ARG2(scan);
4751 next = regnext(scan);
4752 if (OP(scan) == CURLYX) {
4753 I32 lp = (data ? *(data->last_closep) : 0);
4754 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4756 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4757 next_is_eval = (OP(scan) == EVAL);
4759 if (flags & SCF_DO_SUBSTR) {
4761 scan_commit(pRExC_state, data, minlenp, is_inf);
4762 /* Cannot extend fixed substrings */
4763 pos_before = data->pos_min;
4767 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4769 data->flags |= SF_IS_INF;
4771 if (flags & SCF_DO_STCLASS) {
4772 ssc_init(pRExC_state, &this_class);
4773 oclass = data->start_class;
4774 data->start_class = &this_class;
4775 f |= SCF_DO_STCLASS_AND;
4776 f &= ~SCF_DO_STCLASS_OR;
4778 /* Exclude from super-linear cache processing any {n,m}
4779 regops for which the combination of input pos and regex
4780 pos is not enough information to determine if a match
4783 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4784 regex pos at the \s*, the prospects for a match depend not
4785 only on the input position but also on how many (bar\s*)
4786 repeats into the {4,8} we are. */
4787 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4788 f &= ~SCF_WHILEM_VISITED_POS;
4790 /* This will finish on WHILEM, setting scan, or on NULL: */
4791 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4792 last, data, stopparen, recursed_depth, NULL,
4794 ? (f & ~SCF_DO_SUBSTR)
4798 if (flags & SCF_DO_STCLASS)
4799 data->start_class = oclass;
4800 if (mincount == 0 || minnext == 0) {
4801 if (flags & SCF_DO_STCLASS_OR) {
4802 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4804 else if (flags & SCF_DO_STCLASS_AND) {
4805 /* Switch to OR mode: cache the old value of
4806 * data->start_class */
4808 StructCopy(data->start_class, and_withp, regnode_ssc);
4809 flags &= ~SCF_DO_STCLASS_AND;
4810 StructCopy(&this_class, data->start_class, regnode_ssc);
4811 flags |= SCF_DO_STCLASS_OR;
4812 ANYOF_FLAGS(data->start_class)
4813 |= SSC_MATCHES_EMPTY_STRING;
4815 } else { /* Non-zero len */
4816 if (flags & SCF_DO_STCLASS_OR) {
4817 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4818 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4820 else if (flags & SCF_DO_STCLASS_AND)
4821 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4822 flags &= ~SCF_DO_STCLASS;
4824 if (!scan) /* It was not CURLYX, but CURLY. */
4826 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4827 /* ? quantifier ok, except for (?{ ... }) */
4828 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4829 && (minnext == 0) && (deltanext == 0)
4830 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4831 && maxcount <= REG_INFTY/3) /* Complement check for big
4834 /* Fatal warnings may leak the regexp without this: */
4835 SAVEFREESV(RExC_rx_sv);
4836 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4837 "Quantifier unexpected on zero-length expression "
4838 "in regex m/%"UTF8f"/",
4839 UTF8fARG(UTF, RExC_end - RExC_precomp,
4841 (void)ReREFCNT_inc(RExC_rx_sv);
4844 min += minnext * mincount;
4845 is_inf_internal |= deltanext == SSize_t_MAX
4846 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4847 is_inf |= is_inf_internal;
4849 delta = SSize_t_MAX;
4851 delta += (minnext + deltanext) * maxcount
4852 - minnext * mincount;
4854 /* Try powerful optimization CURLYX => CURLYN. */
4855 if ( OP(oscan) == CURLYX && data
4856 && data->flags & SF_IN_PAR
4857 && !(data->flags & SF_HAS_EVAL)
4858 && !deltanext && minnext == 1 ) {
4859 /* Try to optimize to CURLYN. */
4860 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4861 regnode * const nxt1 = nxt;
4868 if (!REGNODE_SIMPLE(OP(nxt))
4869 && !(PL_regkind[OP(nxt)] == EXACT
4870 && STR_LEN(nxt) == 1))
4876 if (OP(nxt) != CLOSE)
4878 if (RExC_open_parens) {
4879 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4880 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4882 /* Now we know that nxt2 is the only contents: */
4883 oscan->flags = (U8)ARG(nxt);
4885 OP(nxt1) = NOTHING; /* was OPEN. */
4888 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4889 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4890 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4891 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4892 OP(nxt + 1) = OPTIMIZED; /* was count. */
4893 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4898 /* Try optimization CURLYX => CURLYM. */
4899 if ( OP(oscan) == CURLYX && data
4900 && !(data->flags & SF_HAS_PAR)
4901 && !(data->flags & SF_HAS_EVAL)
4902 && !deltanext /* atom is fixed width */
4903 && minnext != 0 /* CURLYM can't handle zero width */
4905 /* Nor characters whose fold at run-time may be
4906 * multi-character */
4907 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4909 /* XXXX How to optimize if data == 0? */
4910 /* Optimize to a simpler form. */
4911 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4915 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4916 && (OP(nxt2) != WHILEM))
4918 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4919 /* Need to optimize away parenths. */
4920 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4921 /* Set the parenth number. */
4922 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4924 oscan->flags = (U8)ARG(nxt);
4925 if (RExC_open_parens) {
4926 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4927 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4929 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4930 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4933 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4934 OP(nxt + 1) = OPTIMIZED; /* was count. */
4935 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4936 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4939 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4940 regnode *nnxt = regnext(nxt1);
4942 if (reg_off_by_arg[OP(nxt1)])
4943 ARG_SET(nxt1, nxt2 - nxt1);
4944 else if (nxt2 - nxt1 < U16_MAX)
4945 NEXT_OFF(nxt1) = nxt2 - nxt1;
4947 OP(nxt) = NOTHING; /* Cannot beautify */
4952 /* Optimize again: */
4953 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4954 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4959 else if ((OP(oscan) == CURLYX)
4960 && (flags & SCF_WHILEM_VISITED_POS)
4961 /* See the comment on a similar expression above.
4962 However, this time it's not a subexpression
4963 we care about, but the expression itself. */
4964 && (maxcount == REG_INFTY)
4965 && data && ++data->whilem_c < 16) {
4966 /* This stays as CURLYX, we can put the count/of pair. */
4967 /* Find WHILEM (as in regexec.c) */
4968 regnode *nxt = oscan + NEXT_OFF(oscan);
4970 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4972 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4973 | (RExC_whilem_seen << 4)); /* On WHILEM */
4975 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4977 if (flags & SCF_DO_SUBSTR) {
4978 SV *last_str = NULL;
4979 STRLEN last_chrs = 0;
4980 int counted = mincount != 0;
4982 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4984 SSize_t b = pos_before >= data->last_start_min
4985 ? pos_before : data->last_start_min;
4987 const char * const s = SvPV_const(data->last_found, l);
4988 SSize_t old = b - data->last_start_min;
4991 old = utf8_hop((U8*)s, old) - (U8*)s;
4993 /* Get the added string: */
4994 last_str = newSVpvn_utf8(s + old, l, UTF);
4995 last_chrs = UTF ? utf8_length((U8*)(s + old),
4996 (U8*)(s + old + l)) : l;
4997 if (deltanext == 0 && pos_before == b) {
4998 /* What was added is a constant string */
5001 SvGROW(last_str, (mincount * l) + 1);
5002 repeatcpy(SvPVX(last_str) + l,
5003 SvPVX_const(last_str), l,
5005 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5006 /* Add additional parts. */
5007 SvCUR_set(data->last_found,
5008 SvCUR(data->last_found) - l);
5009 sv_catsv(data->last_found, last_str);
5011 SV * sv = data->last_found;
5013 SvUTF8(sv) && SvMAGICAL(sv) ?
5014 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5015 if (mg && mg->mg_len >= 0)
5016 mg->mg_len += last_chrs * (mincount-1);
5018 last_chrs *= mincount;
5019 data->last_end += l * (mincount - 1);
5022 /* start offset must point into the last copy */
5023 data->last_start_min += minnext * (mincount - 1);
5024 data->last_start_max += is_inf ? SSize_t_MAX
5025 : (maxcount - 1) * (minnext + data->pos_delta);
5028 /* It is counted once already... */
5029 data->pos_min += minnext * (mincount - counted);
5031 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5032 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5033 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5034 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5036 if (deltanext != SSize_t_MAX)
5037 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5038 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5039 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5041 if (deltanext == SSize_t_MAX
5042 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5043 data->pos_delta = SSize_t_MAX;
5045 data->pos_delta += - counted * deltanext +
5046 (minnext + deltanext) * maxcount - minnext * mincount;
5047 if (mincount != maxcount) {
5048 /* Cannot extend fixed substrings found inside
5050 scan_commit(pRExC_state, data, minlenp, is_inf);
5051 if (mincount && last_str) {
5052 SV * const sv = data->last_found;
5053 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5054 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5058 sv_setsv(sv, last_str);
5059 data->last_end = data->pos_min;
5060 data->last_start_min = data->pos_min - last_chrs;
5061 data->last_start_max = is_inf
5063 : data->pos_min + data->pos_delta - last_chrs;
5065 data->longest = &(data->longest_float);
5067 SvREFCNT_dec(last_str);
5069 if (data && (fl & SF_HAS_EVAL))
5070 data->flags |= SF_HAS_EVAL;
5071 optimize_curly_tail:
5072 if (OP(oscan) != CURLYX) {
5073 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5075 NEXT_OFF(oscan) += NEXT_OFF(next);
5081 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5086 if (flags & SCF_DO_SUBSTR) {
5087 /* Cannot expect anything... */
5088 scan_commit(pRExC_state, data, minlenp, is_inf);
5089 data->longest = &(data->longest_float);
5091 is_inf = is_inf_internal = 1;
5092 if (flags & SCF_DO_STCLASS_OR) {
5093 if (OP(scan) == CLUMP) {
5094 /* Actually is any start char, but very few code points
5095 * aren't start characters */
5096 ssc_match_all_cp(data->start_class);
5099 ssc_anything(data->start_class);
5102 flags &= ~SCF_DO_STCLASS;
5106 else if (OP(scan) == LNBREAK) {
5107 if (flags & SCF_DO_STCLASS) {
5108 if (flags & SCF_DO_STCLASS_AND) {
5109 ssc_intersection(data->start_class,
5110 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5111 ssc_clear_locale(data->start_class);
5112 ANYOF_FLAGS(data->start_class)
5113 &= ~SSC_MATCHES_EMPTY_STRING;
5115 else if (flags & SCF_DO_STCLASS_OR) {
5116 ssc_union(data->start_class,
5117 PL_XPosix_ptrs[_CC_VERTSPACE],
5119 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5121 /* See commit msg for
5122 * 749e076fceedeb708a624933726e7989f2302f6a */
5123 ANYOF_FLAGS(data->start_class)
5124 &= ~SSC_MATCHES_EMPTY_STRING;
5126 flags &= ~SCF_DO_STCLASS;
5129 delta++; /* Because of the 2 char string cr-lf */
5130 if (flags & SCF_DO_SUBSTR) {
5131 /* Cannot expect anything... */
5132 scan_commit(pRExC_state, data, minlenp, is_inf);
5134 data->pos_delta += 1;
5135 data->longest = &(data->longest_float);
5138 else if (REGNODE_SIMPLE(OP(scan))) {
5140 if (flags & SCF_DO_SUBSTR) {
5141 scan_commit(pRExC_state, data, minlenp, is_inf);
5145 if (flags & SCF_DO_STCLASS) {
5147 SV* my_invlist = NULL;
5150 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5151 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5153 /* Some of the logic below assumes that switching
5154 locale on will only add false positives. */
5159 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5164 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5165 ssc_match_all_cp(data->start_class);
5170 SV* REG_ANY_invlist = _new_invlist(2);
5171 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5173 if (flags & SCF_DO_STCLASS_OR) {
5174 ssc_union(data->start_class,
5176 TRUE /* TRUE => invert, hence all but \n
5180 else if (flags & SCF_DO_STCLASS_AND) {
5181 ssc_intersection(data->start_class,
5183 TRUE /* TRUE => invert */
5185 ssc_clear_locale(data->start_class);
5187 SvREFCNT_dec_NN(REG_ANY_invlist);
5192 if (flags & SCF_DO_STCLASS_AND)
5193 ssc_and(pRExC_state, data->start_class,
5194 (regnode_charclass *) scan);
5196 ssc_or(pRExC_state, data->start_class,
5197 (regnode_charclass *) scan);
5205 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5206 if (flags & SCF_DO_STCLASS_AND) {
5207 bool was_there = cBOOL(
5208 ANYOF_POSIXL_TEST(data->start_class,
5210 ANYOF_POSIXL_ZERO(data->start_class);
5211 if (was_there) { /* Do an AND */
5212 ANYOF_POSIXL_SET(data->start_class, namedclass);
5214 /* No individual code points can now match */
5215 data->start_class->invlist
5216 = sv_2mortal(_new_invlist(0));
5219 int complement = namedclass + ((invert) ? -1 : 1);
5221 assert(flags & SCF_DO_STCLASS_OR);
5223 /* If the complement of this class was already there,
5224 * the result is that they match all code points,
5225 * (\d + \D == everything). Remove the classes from
5226 * future consideration. Locale is not relevant in
5228 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5229 ssc_match_all_cp(data->start_class);
5230 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5231 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5233 else { /* The usual case; just add this class to the
5235 ANYOF_POSIXL_SET(data->start_class, namedclass);
5240 case NPOSIXA: /* For these, we always know the exact set of
5245 if (FLAGS(scan) == _CC_ASCII) {
5246 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5249 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5250 PL_XPosix_ptrs[_CC_ASCII],
5261 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5263 /* NPOSIXD matches all upper Latin1 code points unless the
5264 * target string being matched is UTF-8, which is
5265 * unknowable until match time. Since we are going to
5266 * invert, we want to get rid of all of them so that the
5267 * inversion will match all */
5268 if (OP(scan) == NPOSIXD) {
5269 _invlist_subtract(my_invlist, PL_UpperLatin1,
5275 if (flags & SCF_DO_STCLASS_AND) {
5276 ssc_intersection(data->start_class, my_invlist, invert);
5277 ssc_clear_locale(data->start_class);
5280 assert(flags & SCF_DO_STCLASS_OR);
5281 ssc_union(data->start_class, my_invlist, invert);
5283 SvREFCNT_dec(my_invlist);
5285 if (flags & SCF_DO_STCLASS_OR)
5286 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5287 flags &= ~SCF_DO_STCLASS;
5290 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5291 data->flags |= (OP(scan) == MEOL
5294 scan_commit(pRExC_state, data, minlenp, is_inf);
5297 else if ( PL_regkind[OP(scan)] == BRANCHJ
5298 /* Lookbehind, or need to calculate parens/evals/stclass: */
5299 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5300 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5302 if ( OP(scan) == UNLESSM &&
5304 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5305 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5308 regnode *upto= regnext(scan);
5310 DEBUG_STUDYDATA("OPFAIL",data,depth);
5312 /*DEBUG_PARSE_MSG("opfail");*/
5313 regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state);
5314 PerlIO_printf(Perl_debug_log,
5315 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5316 SvPV_nolen_const(RExC_mysv),
5317 (IV)REG_NODE_NUM(upto),
5322 NEXT_OFF(scan) = upto - scan;
5323 for (opt= scan + 1; opt < upto ; opt++)
5324 OP(opt) = OPTIMIZED;
5328 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5329 || OP(scan) == UNLESSM )
5331 /* Negative Lookahead/lookbehind
5332 In this case we can't do fixed string optimisation.
5335 SSize_t deltanext, minnext, fake = 0;
5340 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5342 data_fake.whilem_c = data->whilem_c;
5343 data_fake.last_closep = data->last_closep;
5346 data_fake.last_closep = &fake;
5347 data_fake.pos_delta = delta;
5348 if ( flags & SCF_DO_STCLASS && !scan->flags
5349 && OP(scan) == IFMATCH ) { /* Lookahead */
5350 ssc_init(pRExC_state, &intrnl);
5351 data_fake.start_class = &intrnl;
5352 f |= SCF_DO_STCLASS_AND;
5354 if (flags & SCF_WHILEM_VISITED_POS)
5355 f |= SCF_WHILEM_VISITED_POS;
5356 next = regnext(scan);
5357 nscan = NEXTOPER(NEXTOPER(scan));
5358 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5359 last, &data_fake, stopparen,
5360 recursed_depth, NULL, f, depth+1);
5363 FAIL("Variable length lookbehind not implemented");
5365 else if (minnext > (I32)U8_MAX) {
5366 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5369 scan->flags = (U8)minnext;
5372 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5374 if (data_fake.flags & SF_HAS_EVAL)
5375 data->flags |= SF_HAS_EVAL;
5376 data->whilem_c = data_fake.whilem_c;
5378 if (f & SCF_DO_STCLASS_AND) {
5379 if (flags & SCF_DO_STCLASS_OR) {
5380 /* OR before, AND after: ideally we would recurse with
5381 * data_fake to get the AND applied by study of the
5382 * remainder of the pattern, and then derecurse;
5383 * *** HACK *** for now just treat as "no information".
5384 * See [perl #56690].
5386 ssc_init(pRExC_state, data->start_class);
5388 /* AND before and after: combine and continue. These
5389 * assertions are zero-length, so can match an EMPTY
5391 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5392 ANYOF_FLAGS(data->start_class)
5393 |= SSC_MATCHES_EMPTY_STRING;
5397 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5399 /* Positive Lookahead/lookbehind
5400 In this case we can do fixed string optimisation,
5401 but we must be careful about it. Note in the case of
5402 lookbehind the positions will be offset by the minimum
5403 length of the pattern, something we won't know about
5404 until after the recurse.
5406 SSize_t deltanext, fake = 0;
5410 /* We use SAVEFREEPV so that when the full compile
5411 is finished perl will clean up the allocated
5412 minlens when it's all done. This way we don't
5413 have to worry about freeing them when we know
5414 they wont be used, which would be a pain.
5417 Newx( minnextp, 1, SSize_t );
5418 SAVEFREEPV(minnextp);
5421 StructCopy(data, &data_fake, scan_data_t);
5422 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5425 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5426 data_fake.last_found=newSVsv(data->last_found);
5430 data_fake.last_closep = &fake;
5431 data_fake.flags = 0;
5432 data_fake.pos_delta = delta;
5434 data_fake.flags |= SF_IS_INF;
5435 if ( flags & SCF_DO_STCLASS && !scan->flags
5436 && OP(scan) == IFMATCH ) { /* Lookahead */
5437 ssc_init(pRExC_state, &intrnl);
5438 data_fake.start_class = &intrnl;
5439 f |= SCF_DO_STCLASS_AND;
5441 if (flags & SCF_WHILEM_VISITED_POS)
5442 f |= SCF_WHILEM_VISITED_POS;
5443 next = regnext(scan);
5444 nscan = NEXTOPER(NEXTOPER(scan));
5446 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5447 &deltanext, last, &data_fake,
5448 stopparen, recursed_depth, NULL,
5452 FAIL("Variable length lookbehind not implemented");
5454 else if (*minnextp > (I32)U8_MAX) {
5455 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5458 scan->flags = (U8)*minnextp;
5463 if (f & SCF_DO_STCLASS_AND) {
5464 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5465 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5468 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5470 if (data_fake.flags & SF_HAS_EVAL)
5471 data->flags |= SF_HAS_EVAL;
5472 data->whilem_c = data_fake.whilem_c;
5473 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5474 if (RExC_rx->minlen<*minnextp)
5475 RExC_rx->minlen=*minnextp;
5476 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5477 SvREFCNT_dec_NN(data_fake.last_found);
5479 if ( data_fake.minlen_fixed != minlenp )
5481 data->offset_fixed= data_fake.offset_fixed;
5482 data->minlen_fixed= data_fake.minlen_fixed;
5483 data->lookbehind_fixed+= scan->flags;
5485 if ( data_fake.minlen_float != minlenp )
5487 data->minlen_float= data_fake.minlen_float;
5488 data->offset_float_min=data_fake.offset_float_min;
5489 data->offset_float_max=data_fake.offset_float_max;
5490 data->lookbehind_float+= scan->flags;
5497 else if (OP(scan) == OPEN) {
5498 if (stopparen != (I32)ARG(scan))
5501 else if (OP(scan) == CLOSE) {
5502 if (stopparen == (I32)ARG(scan)) {
5505 if ((I32)ARG(scan) == is_par) {
5506 next = regnext(scan);
5508 if ( next && (OP(next) != WHILEM) && next < last)
5509 is_par = 0; /* Disable optimization */
5512 *(data->last_closep) = ARG(scan);
5514 else if (OP(scan) == EVAL) {
5516 data->flags |= SF_HAS_EVAL;
5518 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5519 if (flags & SCF_DO_SUBSTR) {
5520 scan_commit(pRExC_state, data, minlenp, is_inf);
5521 flags &= ~SCF_DO_SUBSTR;
5523 if (data && OP(scan)==ACCEPT) {
5524 data->flags |= SCF_SEEN_ACCEPT;
5529 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5531 if (flags & SCF_DO_SUBSTR) {
5532 scan_commit(pRExC_state, data, minlenp, is_inf);
5533 data->longest = &(data->longest_float);
5535 is_inf = is_inf_internal = 1;
5536 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5537 ssc_anything(data->start_class);
5538 flags &= ~SCF_DO_STCLASS;
5540 else if (OP(scan) == GPOS) {
5541 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5542 !(delta || is_inf || (data && data->pos_delta)))
5544 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5545 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5546 if (RExC_rx->gofs < (STRLEN)min)
5547 RExC_rx->gofs = min;
5549 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5553 #ifdef TRIE_STUDY_OPT
5554 #ifdef FULL_TRIE_STUDY
5555 else if (PL_regkind[OP(scan)] == TRIE) {
5556 /* NOTE - There is similar code to this block above for handling
5557 BRANCH nodes on the initial study. If you change stuff here
5559 regnode *trie_node= scan;
5560 regnode *tail= regnext(scan);
5561 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5562 SSize_t max1 = 0, min1 = SSize_t_MAX;
5565 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5566 /* Cannot merge strings after this. */
5567 scan_commit(pRExC_state, data, minlenp, is_inf);
5569 if (flags & SCF_DO_STCLASS)
5570 ssc_init_zero(pRExC_state, &accum);
5576 const regnode *nextbranch= NULL;
5579 for ( word=1 ; word <= trie->wordcount ; word++)
5581 SSize_t deltanext=0, minnext=0, f = 0, fake;
5582 regnode_ssc this_class;
5584 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5586 data_fake.whilem_c = data->whilem_c;
5587 data_fake.last_closep = data->last_closep;
5590 data_fake.last_closep = &fake;
5591 data_fake.pos_delta = delta;
5592 if (flags & SCF_DO_STCLASS) {
5593 ssc_init(pRExC_state, &this_class);
5594 data_fake.start_class = &this_class;
5595 f = SCF_DO_STCLASS_AND;
5597 if (flags & SCF_WHILEM_VISITED_POS)
5598 f |= SCF_WHILEM_VISITED_POS;
5600 if (trie->jump[word]) {
5602 nextbranch = trie_node + trie->jump[0];
5603 scan= trie_node + trie->jump[word];
5604 /* We go from the jump point to the branch that follows
5605 it. Note this means we need the vestigal unused
5606 branches even though they arent otherwise used. */
5607 minnext = study_chunk(pRExC_state, &scan, minlenp,
5608 &deltanext, (regnode *)nextbranch, &data_fake,
5609 stopparen, recursed_depth, NULL, f,depth+1);
5611 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5612 nextbranch= regnext((regnode*)nextbranch);
5614 if (min1 > (SSize_t)(minnext + trie->minlen))
5615 min1 = minnext + trie->minlen;
5616 if (deltanext == SSize_t_MAX) {
5617 is_inf = is_inf_internal = 1;
5619 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5620 max1 = minnext + deltanext + trie->maxlen;
5622 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5624 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5625 if ( stopmin > min + min1)
5626 stopmin = min + min1;
5627 flags &= ~SCF_DO_SUBSTR;
5629 data->flags |= SCF_SEEN_ACCEPT;
5632 if (data_fake.flags & SF_HAS_EVAL)
5633 data->flags |= SF_HAS_EVAL;
5634 data->whilem_c = data_fake.whilem_c;
5636 if (flags & SCF_DO_STCLASS)
5637 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5640 if (flags & SCF_DO_SUBSTR) {
5641 data->pos_min += min1;
5642 data->pos_delta += max1 - min1;
5643 if (max1 != min1 || is_inf)
5644 data->longest = &(data->longest_float);
5647 delta += max1 - min1;
5648 if (flags & SCF_DO_STCLASS_OR) {
5649 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5651 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5652 flags &= ~SCF_DO_STCLASS;
5655 else if (flags & SCF_DO_STCLASS_AND) {
5657 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5658 flags &= ~SCF_DO_STCLASS;
5661 /* Switch to OR mode: cache the old value of
5662 * data->start_class */
5664 StructCopy(data->start_class, and_withp, regnode_ssc);
5665 flags &= ~SCF_DO_STCLASS_AND;
5666 StructCopy(&accum, data->start_class, regnode_ssc);
5667 flags |= SCF_DO_STCLASS_OR;
5674 else if (PL_regkind[OP(scan)] == TRIE) {
5675 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5678 min += trie->minlen;
5679 delta += (trie->maxlen - trie->minlen);
5680 flags &= ~SCF_DO_STCLASS; /* xxx */
5681 if (flags & SCF_DO_SUBSTR) {
5682 /* Cannot expect anything... */
5683 scan_commit(pRExC_state, data, minlenp, is_inf);
5684 data->pos_min += trie->minlen;
5685 data->pos_delta += (trie->maxlen - trie->minlen);
5686 if (trie->maxlen != trie->minlen)
5687 data->longest = &(data->longest_float);
5689 if (trie->jump) /* no more substrings -- for now /grr*/
5690 flags &= ~SCF_DO_SUBSTR;
5692 #endif /* old or new */
5693 #endif /* TRIE_STUDY_OPT */
5695 /* Else: zero-length, ignore. */
5696 scan = regnext(scan);
5698 /* If we are exiting a recursion we can unset its recursed bit
5699 * and allow ourselves to enter it again - no danger of an
5700 * infinite loop there.
5701 if (stopparen > -1 && recursed) {
5702 DEBUG_STUDYDATA("unset:", data,depth);
5703 PAREN_UNSET( recursed, stopparen);
5709 DEBUG_STUDYDATA("frame-end:",data,depth);
5710 DEBUG_PEEP("fend", scan, depth);
5712 /* restore previous context */
5713 last = frame->last_regnode;
5714 scan = frame->next_regnode;
5715 stopparen = frame->stopparen;
5716 recursed_depth = frame->prev_recursed_depth;
5718 RExC_frame_last = frame->prev_frame;
5719 frame = frame->this_prev_frame;
5720 goto fake_study_recurse;
5725 DEBUG_STUDYDATA("pre-fin:",data,depth);
5728 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5730 if (flags & SCF_DO_SUBSTR && is_inf)
5731 data->pos_delta = SSize_t_MAX - data->pos_min;
5732 if (is_par > (I32)U8_MAX)
5734 if (is_par && pars==1 && data) {
5735 data->flags |= SF_IN_PAR;
5736 data->flags &= ~SF_HAS_PAR;
5738 else if (pars && data) {
5739 data->flags |= SF_HAS_PAR;
5740 data->flags &= ~SF_IN_PAR;
5742 if (flags & SCF_DO_STCLASS_OR)
5743 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5744 if (flags & SCF_TRIE_RESTUDY)
5745 data->flags |= SCF_TRIE_RESTUDY;
5747 DEBUG_STUDYDATA("post-fin:",data,depth);
5750 SSize_t final_minlen= min < stopmin ? min : stopmin;
5752 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5753 RExC_maxlen = final_minlen + delta;
5755 return final_minlen;
5761 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5763 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5765 PERL_ARGS_ASSERT_ADD_DATA;
5767 Renewc(RExC_rxi->data,
5768 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5769 char, struct reg_data);
5771 Renew(RExC_rxi->data->what, count + n, U8);
5773 Newx(RExC_rxi->data->what, n, U8);
5774 RExC_rxi->data->count = count + n;
5775 Copy(s, RExC_rxi->data->what + count, n, U8);
5779 /*XXX: todo make this not included in a non debugging perl, but appears to be
5780 * used anyway there, in 'use re' */
5781 #ifndef PERL_IN_XSUB_RE
5783 Perl_reginitcolors(pTHX)
5785 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5787 char *t = savepv(s);
5791 t = strchr(t, '\t');
5797 PL_colors[i] = t = (char *)"";
5802 PL_colors[i++] = (char *)"";
5809 #ifdef TRIE_STUDY_OPT
5810 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5813 (data.flags & SCF_TRIE_RESTUDY) \
5821 #define CHECK_RESTUDY_GOTO_butfirst
5825 * pregcomp - compile a regular expression into internal code
5827 * Decides which engine's compiler to call based on the hint currently in
5831 #ifndef PERL_IN_XSUB_RE
5833 /* return the currently in-scope regex engine (or the default if none) */
5835 regexp_engine const *
5836 Perl_current_re_engine(pTHX)
5838 if (IN_PERL_COMPILETIME) {
5839 HV * const table = GvHV(PL_hintgv);
5842 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5843 return &PL_core_reg_engine;
5844 ptr = hv_fetchs(table, "regcomp", FALSE);
5845 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5846 return &PL_core_reg_engine;
5847 return INT2PTR(regexp_engine*,SvIV(*ptr));
5851 if (!PL_curcop->cop_hints_hash)
5852 return &PL_core_reg_engine;
5853 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5854 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5855 return &PL_core_reg_engine;
5856 return INT2PTR(regexp_engine*,SvIV(ptr));
5862 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5864 regexp_engine const *eng = current_re_engine();
5865 GET_RE_DEBUG_FLAGS_DECL;
5867 PERL_ARGS_ASSERT_PREGCOMP;
5869 /* Dispatch a request to compile a regexp to correct regexp engine. */
5871 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5874 return CALLREGCOMP_ENG(eng, pattern, flags);
5878 /* public(ish) entry point for the perl core's own regex compiling code.
5879 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5880 * pattern rather than a list of OPs, and uses the internal engine rather
5881 * than the current one */
5884 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5886 SV *pat = pattern; /* defeat constness! */
5887 PERL_ARGS_ASSERT_RE_COMPILE;
5888 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5889 #ifdef PERL_IN_XSUB_RE
5892 &PL_core_reg_engine,
5894 NULL, NULL, rx_flags, 0);
5898 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5899 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5900 * point to the realloced string and length.
5902 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5906 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5907 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5909 U8 *const src = (U8*)*pat_p;
5914 GET_RE_DEBUG_FLAGS_DECL;
5916 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5917 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5919 Newx(dst, *plen_p * 2 + 1, U8);
5922 while (s < *plen_p) {
5923 append_utf8_from_native_byte(src[s], &d);
5924 if (n < num_code_blocks) {
5925 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5926 pRExC_state->code_blocks[n].start = d - dst - 1;
5927 assert(*(d - 1) == '(');
5930 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5931 pRExC_state->code_blocks[n].end = d - dst - 1;
5932 assert(*(d - 1) == ')');
5941 *pat_p = (char*) dst;
5943 RExC_orig_utf8 = RExC_utf8 = 1;
5948 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5949 * while recording any code block indices, and handling overloading,
5950 * nested qr// objects etc. If pat is null, it will allocate a new
5951 * string, or just return the first arg, if there's only one.
5953 * Returns the malloced/updated pat.
5954 * patternp and pat_count is the array of SVs to be concatted;
5955 * oplist is the optional list of ops that generated the SVs;
5956 * recompile_p is a pointer to a boolean that will be set if
5957 * the regex will need to be recompiled.
5958 * delim, if non-null is an SV that will be inserted between each element
5962 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5963 SV *pat, SV ** const patternp, int pat_count,
5964 OP *oplist, bool *recompile_p, SV *delim)
5968 bool use_delim = FALSE;
5969 bool alloced = FALSE;
5971 /* if we know we have at least two args, create an empty string,
5972 * then concatenate args to that. For no args, return an empty string */
5973 if (!pat && pat_count != 1) {
5979 for (svp = patternp; svp < patternp + pat_count; svp++) {
5982 STRLEN orig_patlen = 0;
5984 SV *msv = use_delim ? delim : *svp;
5985 if (!msv) msv = &PL_sv_undef;
5987 /* if we've got a delimiter, we go round the loop twice for each
5988 * svp slot (except the last), using the delimiter the second
5997 if (SvTYPE(msv) == SVt_PVAV) {
5998 /* we've encountered an interpolated array within
5999 * the pattern, e.g. /...@a..../. Expand the list of elements,
6000 * then recursively append elements.
6001 * The code in this block is based on S_pushav() */
6003 AV *const av = (AV*)msv;
6004 const SSize_t maxarg = AvFILL(av) + 1;
6008 assert(oplist->op_type == OP_PADAV
6009 || oplist->op_type == OP_RV2AV);
6010 oplist = OP_SIBLING(oplist);
6013 if (SvRMAGICAL(av)) {
6016 Newx(array, maxarg, SV*);
6018 for (i=0; i < maxarg; i++) {
6019 SV ** const svp = av_fetch(av, i, FALSE);
6020 array[i] = svp ? *svp : &PL_sv_undef;
6024 array = AvARRAY(av);
6026 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6027 array, maxarg, NULL, recompile_p,
6029 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6035 /* we make the assumption here that each op in the list of
6036 * op_siblings maps to one SV pushed onto the stack,
6037 * except for code blocks, with have both an OP_NULL and
6039 * This allows us to match up the list of SVs against the
6040 * list of OPs to find the next code block.
6042 * Note that PUSHMARK PADSV PADSV ..
6044 * PADRANGE PADSV PADSV ..
6045 * so the alignment still works. */
6048 if (oplist->op_type == OP_NULL
6049 && (oplist->op_flags & OPf_SPECIAL))
6051 assert(n < pRExC_state->num_code_blocks);
6052 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6053 pRExC_state->code_blocks[n].block = oplist;
6054 pRExC_state->code_blocks[n].src_regex = NULL;
6057 oplist = OP_SIBLING(oplist); /* skip CONST */
6060 oplist = OP_SIBLING(oplist);;
6063 /* apply magic and QR overloading to arg */
6066 if (SvROK(msv) && SvAMAGIC(msv)) {
6067 SV *sv = AMG_CALLunary(msv, regexp_amg);
6071 if (SvTYPE(sv) != SVt_REGEXP)
6072 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6077 /* try concatenation overload ... */
6078 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6079 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6082 /* overloading involved: all bets are off over literal
6083 * code. Pretend we haven't seen it */
6084 pRExC_state->num_code_blocks -= n;
6088 /* ... or failing that, try "" overload */
6089 while (SvAMAGIC(msv)
6090 && (sv = AMG_CALLunary(msv, string_amg))
6094 && SvRV(msv) == SvRV(sv))
6099 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6103 /* this is a partially unrolled
6104 * sv_catsv_nomg(pat, msv);
6105 * that allows us to adjust code block indices if
6108 char *dst = SvPV_force_nomg(pat, dlen);
6110 if (SvUTF8(msv) && !SvUTF8(pat)) {
6111 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6112 sv_setpvn(pat, dst, dlen);
6115 sv_catsv_nomg(pat, msv);
6122 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6125 /* extract any code blocks within any embedded qr//'s */
6126 if (rx && SvTYPE(rx) == SVt_REGEXP
6127 && RX_ENGINE((REGEXP*)rx)->op_comp)
6130 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6131 if (ri->num_code_blocks) {
6133 /* the presence of an embedded qr// with code means
6134 * we should always recompile: the text of the
6135 * qr// may not have changed, but it may be a
6136 * different closure than last time */
6138 Renew(pRExC_state->code_blocks,
6139 pRExC_state->num_code_blocks + ri->num_code_blocks,
6140 struct reg_code_block);
6141 pRExC_state->num_code_blocks += ri->num_code_blocks;
6143 for (i=0; i < ri->num_code_blocks; i++) {
6144 struct reg_code_block *src, *dst;
6145 STRLEN offset = orig_patlen
6146 + ReANY((REGEXP *)rx)->pre_prefix;
6147 assert(n < pRExC_state->num_code_blocks);
6148 src = &ri->code_blocks[i];
6149 dst = &pRExC_state->code_blocks[n];
6150 dst->start = src->start + offset;
6151 dst->end = src->end + offset;
6152 dst->block = src->block;
6153 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6162 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6171 /* see if there are any run-time code blocks in the pattern.
6172 * False positives are allowed */
6175 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6176 char *pat, STRLEN plen)
6181 PERL_UNUSED_CONTEXT;
6183 for (s = 0; s < plen; s++) {
6184 if (n < pRExC_state->num_code_blocks
6185 && s == pRExC_state->code_blocks[n].start)
6187 s = pRExC_state->code_blocks[n].end;
6191 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6193 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6195 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6202 /* Handle run-time code blocks. We will already have compiled any direct
6203 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6204 * copy of it, but with any literal code blocks blanked out and
6205 * appropriate chars escaped; then feed it into
6207 * eval "qr'modified_pattern'"
6211 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6215 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6217 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6218 * and merge them with any code blocks of the original regexp.
6220 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6221 * instead, just save the qr and return FALSE; this tells our caller that
6222 * the original pattern needs upgrading to utf8.
6226 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6227 char *pat, STRLEN plen)
6231 GET_RE_DEBUG_FLAGS_DECL;
6233 if (pRExC_state->runtime_code_qr) {
6234 /* this is the second time we've been called; this should
6235 * only happen if the main pattern got upgraded to utf8
6236 * during compilation; re-use the qr we compiled first time
6237 * round (which should be utf8 too)
6239 qr = pRExC_state->runtime_code_qr;
6240 pRExC_state->runtime_code_qr = NULL;
6241 assert(RExC_utf8 && SvUTF8(qr));
6247 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6251 /* determine how many extra chars we need for ' and \ escaping */
6252 for (s = 0; s < plen; s++) {
6253 if (pat[s] == '\'' || pat[s] == '\\')
6257 Newx(newpat, newlen, char);
6259 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6261 for (s = 0; s < plen; s++) {
6262 if (n < pRExC_state->num_code_blocks
6263 && s == pRExC_state->code_blocks[n].start)
6265 /* blank out literal code block */
6266 assert(pat[s] == '(');
6267 while (s <= pRExC_state->code_blocks[n].end) {
6275 if (pat[s] == '\'' || pat[s] == '\\')
6280 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6284 PerlIO_printf(Perl_debug_log,
6285 "%sre-parsing pattern for runtime code:%s %s\n",
6286 PL_colors[4],PL_colors[5],newpat);
6289 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6294 PUSHSTACKi(PERLSI_REQUIRE);
6295 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6296 * parsing qr''; normally only q'' does this. It also alters
6298 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6299 SvREFCNT_dec_NN(sv);
6304 SV * const errsv = ERRSV;
6305 if (SvTRUE_NN(errsv))
6307 Safefree(pRExC_state->code_blocks);
6308 /* use croak_sv ? */
6309 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6312 assert(SvROK(qr_ref));
6314 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6315 /* the leaving below frees the tmp qr_ref.
6316 * Give qr a life of its own */
6324 if (!RExC_utf8 && SvUTF8(qr)) {
6325 /* first time through; the pattern got upgraded; save the
6326 * qr for the next time through */
6327 assert(!pRExC_state->runtime_code_qr);
6328 pRExC_state->runtime_code_qr = qr;
6333 /* extract any code blocks within the returned qr// */
6336 /* merge the main (r1) and run-time (r2) code blocks into one */
6338 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6339 struct reg_code_block *new_block, *dst;
6340 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6343 if (!r2->num_code_blocks) /* we guessed wrong */
6345 SvREFCNT_dec_NN(qr);
6350 r1->num_code_blocks + r2->num_code_blocks,
6351 struct reg_code_block);
6354 while ( i1 < r1->num_code_blocks
6355 || i2 < r2->num_code_blocks)
6357 struct reg_code_block *src;
6360 if (i1 == r1->num_code_blocks) {
6361 src = &r2->code_blocks[i2++];
6364 else if (i2 == r2->num_code_blocks)
6365 src = &r1->code_blocks[i1++];
6366 else if ( r1->code_blocks[i1].start
6367 < r2->code_blocks[i2].start)
6369 src = &r1->code_blocks[i1++];
6370 assert(src->end < r2->code_blocks[i2].start);
6373 assert( r1->code_blocks[i1].start
6374 > r2->code_blocks[i2].start);
6375 src = &r2->code_blocks[i2++];
6377 assert(src->end < r1->code_blocks[i1].start);
6380 assert(pat[src->start] == '(');
6381 assert(pat[src->end] == ')');
6382 dst->start = src->start;
6383 dst->end = src->end;
6384 dst->block = src->block;
6385 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6389 r1->num_code_blocks += r2->num_code_blocks;
6390 Safefree(r1->code_blocks);
6391 r1->code_blocks = new_block;
6394 SvREFCNT_dec_NN(qr);
6400 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6401 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6402 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6403 STRLEN longest_length, bool eol, bool meol)
6405 /* This is the common code for setting up the floating and fixed length
6406 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6407 * as to whether succeeded or not */
6412 if (! (longest_length
6413 || (eol /* Can't have SEOL and MULTI */
6414 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6416 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6417 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6422 /* copy the information about the longest from the reg_scan_data
6423 over to the program. */
6424 if (SvUTF8(sv_longest)) {
6425 *rx_utf8 = sv_longest;
6428 *rx_substr = sv_longest;
6431 /* end_shift is how many chars that must be matched that
6432 follow this item. We calculate it ahead of time as once the
6433 lookbehind offset is added in we lose the ability to correctly
6435 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6436 *rx_end_shift = ml - offset
6437 - longest_length + (SvTAIL(sv_longest) != 0)
6440 t = (eol/* Can't have SEOL and MULTI */
6441 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6442 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6448 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6449 * regular expression into internal code.
6450 * The pattern may be passed either as:
6451 * a list of SVs (patternp plus pat_count)
6452 * a list of OPs (expr)
6453 * If both are passed, the SV list is used, but the OP list indicates
6454 * which SVs are actually pre-compiled code blocks
6456 * The SVs in the list have magic and qr overloading applied to them (and
6457 * the list may be modified in-place with replacement SVs in the latter
6460 * If the pattern hasn't changed from old_re, then old_re will be
6463 * eng is the current engine. If that engine has an op_comp method, then
6464 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6465 * do the initial concatenation of arguments and pass on to the external
6468 * If is_bare_re is not null, set it to a boolean indicating whether the
6469 * arg list reduced (after overloading) to a single bare regex which has
6470 * been returned (i.e. /$qr/).
6472 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6474 * pm_flags contains the PMf_* flags, typically based on those from the
6475 * pm_flags field of the related PMOP. Currently we're only interested in
6476 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6478 * We can't allocate space until we know how big the compiled form will be,
6479 * but we can't compile it (and thus know how big it is) until we've got a
6480 * place to put the code. So we cheat: we compile it twice, once with code
6481 * generation turned off and size counting turned on, and once "for real".
6482 * This also means that we don't allocate space until we are sure that the
6483 * thing really will compile successfully, and we never have to move the
6484 * code and thus invalidate pointers into it. (Note that it has to be in
6485 * one piece because free() must be able to free it all.) [NB: not true in perl]
6487 * Beware that the optimization-preparation code in here knows about some
6488 * of the structure of the compiled regexp. [I'll say.]
6492 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6493 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6494 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6498 regexp_internal *ri;
6506 SV *code_blocksv = NULL;
6507 SV** new_patternp = patternp;
6509 /* these are all flags - maybe they should be turned
6510 * into a single int with different bit masks */
6511 I32 sawlookahead = 0;
6516 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6518 bool runtime_code = 0;
6520 RExC_state_t RExC_state;
6521 RExC_state_t * const pRExC_state = &RExC_state;
6522 #ifdef TRIE_STUDY_OPT
6524 RExC_state_t copyRExC_state;
6526 GET_RE_DEBUG_FLAGS_DECL;
6528 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6530 DEBUG_r(if (!PL_colorset) reginitcolors());
6532 #ifndef PERL_IN_XSUB_RE
6533 /* Initialize these here instead of as-needed, as is quick and avoids
6534 * having to test them each time otherwise */
6535 if (! PL_AboveLatin1) {
6536 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6537 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6538 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6539 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6540 PL_HasMultiCharFold =
6541 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6543 /* This is calculated here, because the Perl program that generates the
6544 * static global ones doesn't currently have access to
6545 * NUM_ANYOF_CODE_POINTS */
6546 PL_InBitmap = _new_invlist(2);
6547 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6548 NUM_ANYOF_CODE_POINTS - 1);
6552 pRExC_state->code_blocks = NULL;
6553 pRExC_state->num_code_blocks = 0;
6556 *is_bare_re = FALSE;
6558 if (expr && (expr->op_type == OP_LIST ||
6559 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6560 /* allocate code_blocks if needed */
6564 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6565 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6566 ncode++; /* count of DO blocks */
6568 pRExC_state->num_code_blocks = ncode;
6569 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6574 /* compile-time pattern with just OP_CONSTs and DO blocks */
6579 /* find how many CONSTs there are */
6582 if (expr->op_type == OP_CONST)
6585 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6586 if (o->op_type == OP_CONST)
6590 /* fake up an SV array */
6592 assert(!new_patternp);
6593 Newx(new_patternp, n, SV*);
6594 SAVEFREEPV(new_patternp);
6598 if (expr->op_type == OP_CONST)
6599 new_patternp[n] = cSVOPx_sv(expr);
6601 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6602 if (o->op_type == OP_CONST)
6603 new_patternp[n++] = cSVOPo_sv;
6608 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6609 "Assembling pattern from %d elements%s\n", pat_count,
6610 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6612 /* set expr to the first arg op */
6614 if (pRExC_state->num_code_blocks
6615 && expr->op_type != OP_CONST)
6617 expr = cLISTOPx(expr)->op_first;
6618 assert( expr->op_type == OP_PUSHMARK
6619 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6620 || expr->op_type == OP_PADRANGE);
6621 expr = OP_SIBLING(expr);
6624 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6625 expr, &recompile, NULL);
6627 /* handle bare (possibly after overloading) regex: foo =~ $re */
6632 if (SvTYPE(re) == SVt_REGEXP) {
6636 Safefree(pRExC_state->code_blocks);
6637 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6638 "Precompiled pattern%s\n",
6639 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6645 exp = SvPV_nomg(pat, plen);
6647 if (!eng->op_comp) {
6648 if ((SvUTF8(pat) && IN_BYTES)
6649 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6651 /* make a temporary copy; either to convert to bytes,
6652 * or to avoid repeating get-magic / overloaded stringify */
6653 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6654 (IN_BYTES ? 0 : SvUTF8(pat)));
6656 Safefree(pRExC_state->code_blocks);
6657 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6660 /* ignore the utf8ness if the pattern is 0 length */
6661 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6662 RExC_uni_semantics = 0;
6663 RExC_contains_locale = 0;
6664 RExC_contains_i = 0;
6665 pRExC_state->runtime_code_qr = NULL;
6666 RExC_frame_head= NULL;
6667 RExC_frame_last= NULL;
6668 RExC_frame_count= 0;
6671 RExC_mysv1= sv_newmortal();
6672 RExC_mysv2= sv_newmortal();
6675 SV *dsv= sv_newmortal();
6676 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6677 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6678 PL_colors[4],PL_colors[5],s);
6682 /* we jump here if we upgrade the pattern to utf8 and have to
6685 if ((pm_flags & PMf_USE_RE_EVAL)
6686 /* this second condition covers the non-regex literal case,
6687 * i.e. $foo =~ '(?{})'. */
6688 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6690 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6692 /* return old regex if pattern hasn't changed */
6693 /* XXX: note in the below we have to check the flags as well as the
6696 * Things get a touch tricky as we have to compare the utf8 flag
6697 * independently from the compile flags. */
6701 && !!RX_UTF8(old_re) == !!RExC_utf8
6702 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6703 && RX_PRECOMP(old_re)
6704 && RX_PRELEN(old_re) == plen
6705 && memEQ(RX_PRECOMP(old_re), exp, plen)
6706 && !runtime_code /* with runtime code, always recompile */ )
6708 Safefree(pRExC_state->code_blocks);
6712 rx_flags = orig_rx_flags;
6714 if (rx_flags & PMf_FOLD) {
6715 RExC_contains_i = 1;
6717 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6719 /* Set to use unicode semantics if the pattern is in utf8 and has the
6720 * 'depends' charset specified, as it means unicode when utf8 */
6721 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6725 RExC_flags = rx_flags;
6726 RExC_pm_flags = pm_flags;
6729 if (TAINTING_get && TAINT_get)
6730 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6732 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6733 /* whoops, we have a non-utf8 pattern, whilst run-time code
6734 * got compiled as utf8. Try again with a utf8 pattern */
6735 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6736 pRExC_state->num_code_blocks);
6737 goto redo_first_pass;
6740 assert(!pRExC_state->runtime_code_qr);
6746 RExC_in_lookbehind = 0;
6747 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6749 RExC_override_recoding = 0;
6750 RExC_in_multi_char_class = 0;
6752 /* First pass: determine size, legality. */
6755 RExC_end = exp + plen;
6760 RExC_emit = (regnode *) &RExC_emit_dummy;
6761 RExC_whilem_seen = 0;
6762 RExC_open_parens = NULL;
6763 RExC_close_parens = NULL;
6765 RExC_paren_names = NULL;
6767 RExC_paren_name_list = NULL;
6769 RExC_recurse = NULL;
6770 RExC_study_chunk_recursed = NULL;
6771 RExC_study_chunk_recursed_bytes= 0;
6772 RExC_recurse_count = 0;
6773 pRExC_state->code_index = 0;
6775 #if 0 /* REGC() is (currently) a NOP at the first pass.
6776 * Clever compilers notice this and complain. --jhi */
6777 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6780 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6782 RExC_lastparse=NULL;
6784 /* reg may croak on us, not giving us a chance to free
6785 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6786 need it to survive as long as the regexp (qr/(?{})/).
6787 We must check that code_blocksv is not already set, because we may
6788 have jumped back to restart the sizing pass. */
6789 if (pRExC_state->code_blocks && !code_blocksv) {
6790 code_blocksv = newSV_type(SVt_PV);
6791 SAVEFREESV(code_blocksv);
6792 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6793 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6795 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6796 /* It's possible to write a regexp in ascii that represents Unicode
6797 codepoints outside of the byte range, such as via \x{100}. If we
6798 detect such a sequence we have to convert the entire pattern to utf8
6799 and then recompile, as our sizing calculation will have been based
6800 on 1 byte == 1 character, but we will need to use utf8 to encode
6801 at least some part of the pattern, and therefore must convert the whole
6804 if (flags & RESTART_UTF8) {
6805 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6806 pRExC_state->num_code_blocks);
6807 goto redo_first_pass;
6809 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6812 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6815 PerlIO_printf(Perl_debug_log,
6816 "Required size %"IVdf" nodes\n"
6817 "Starting second pass (creation)\n",
6820 RExC_lastparse=NULL;
6823 /* The first pass could have found things that force Unicode semantics */
6824 if ((RExC_utf8 || RExC_uni_semantics)
6825 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6827 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6830 /* Small enough for pointer-storage convention?
6831 If extralen==0, this means that we will not need long jumps. */
6832 if (RExC_size >= 0x10000L && RExC_extralen)
6833 RExC_size += RExC_extralen;
6836 if (RExC_whilem_seen > 15)
6837 RExC_whilem_seen = 15;
6839 /* Allocate space and zero-initialize. Note, the two step process
6840 of zeroing when in debug mode, thus anything assigned has to
6841 happen after that */
6842 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6844 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6845 char, regexp_internal);
6846 if ( r == NULL || ri == NULL )
6847 FAIL("Regexp out of space");
6849 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6850 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6853 /* bulk initialize base fields with 0. */
6854 Zero(ri, sizeof(regexp_internal), char);
6857 /* non-zero initialization begins here */
6860 r->extflags = rx_flags;
6861 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6863 if (pm_flags & PMf_IS_QR) {
6864 ri->code_blocks = pRExC_state->code_blocks;
6865 ri->num_code_blocks = pRExC_state->num_code_blocks;
6870 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6871 if (pRExC_state->code_blocks[n].src_regex)
6872 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6873 SAVEFREEPV(pRExC_state->code_blocks);
6877 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6878 bool has_charset = (get_regex_charset(r->extflags)
6879 != REGEX_DEPENDS_CHARSET);
6881 /* The caret is output if there are any defaults: if not all the STD
6882 * flags are set, or if no character set specifier is needed */
6884 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6886 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6887 == REG_RUN_ON_COMMENT_SEEN);
6888 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6889 >> RXf_PMf_STD_PMMOD_SHIFT);
6890 const char *fptr = STD_PAT_MODS; /*"msix"*/
6892 /* Allocate for the worst case, which is all the std flags are turned
6893 * on. If more precision is desired, we could do a population count of
6894 * the flags set. This could be done with a small lookup table, or by
6895 * shifting, masking and adding, or even, when available, assembly
6896 * language for a machine-language population count.
6897 * We never output a minus, as all those are defaults, so are
6898 * covered by the caret */
6899 const STRLEN wraplen = plen + has_p + has_runon
6900 + has_default /* If needs a caret */
6902 /* If needs a character set specifier */
6903 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6904 + (sizeof(STD_PAT_MODS) - 1)
6905 + (sizeof("(?:)") - 1);
6907 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6908 r->xpv_len_u.xpvlenu_pv = p;
6910 SvFLAGS(rx) |= SVf_UTF8;
6913 /* If a default, cover it using the caret */
6915 *p++= DEFAULT_PAT_MOD;
6919 const char* const name = get_regex_charset_name(r->extflags, &len);
6920 Copy(name, p, len, char);
6924 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6927 while((ch = *fptr++)) {
6935 Copy(RExC_precomp, p, plen, char);
6936 assert ((RX_WRAPPED(rx) - p) < 16);
6937 r->pre_prefix = p - RX_WRAPPED(rx);
6943 SvCUR_set(rx, p - RX_WRAPPED(rx));
6947 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6949 /* setup various meta data about recursion, this all requires
6950 * RExC_npar to be correctly set, and a bit later on we clear it */
6951 if (RExC_seen & REG_RECURSE_SEEN) {
6952 Newxz(RExC_open_parens, RExC_npar,regnode *);
6953 SAVEFREEPV(RExC_open_parens);
6954 Newxz(RExC_close_parens,RExC_npar,regnode *);
6955 SAVEFREEPV(RExC_close_parens);
6957 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6958 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6959 * So its 1 if there are no parens. */
6960 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6961 ((RExC_npar & 0x07) != 0);
6962 Newx(RExC_study_chunk_recursed,
6963 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6964 SAVEFREEPV(RExC_study_chunk_recursed);
6967 /* Useful during FAIL. */
6968 #ifdef RE_TRACK_PATTERN_OFFSETS
6969 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6970 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6971 "%s %"UVuf" bytes for offset annotations.\n",
6972 ri->u.offsets ? "Got" : "Couldn't get",
6973 (UV)((2*RExC_size+1) * sizeof(U32))));
6975 SetProgLen(ri,RExC_size);
6980 /* Second pass: emit code. */
6981 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6982 RExC_pm_flags = pm_flags;
6984 RExC_end = exp + plen;
6987 RExC_emit_start = ri->program;
6988 RExC_emit = ri->program;
6989 RExC_emit_bound = ri->program + RExC_size + 1;
6990 pRExC_state->code_index = 0;
6992 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6993 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6995 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6997 /* XXXX To minimize changes to RE engine we always allocate
6998 3-units-long substrs field. */
6999 Newx(r->substrs, 1, struct reg_substr_data);
7000 if (RExC_recurse_count) {
7001 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7002 SAVEFREEPV(RExC_recurse);
7006 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7008 RExC_study_chunk_recursed_count= 0;
7010 Zero(r->substrs, 1, struct reg_substr_data);
7011 if (RExC_study_chunk_recursed) {
7012 Zero(RExC_study_chunk_recursed,
7013 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7017 #ifdef TRIE_STUDY_OPT
7019 StructCopy(&zero_scan_data, &data, scan_data_t);
7020 copyRExC_state = RExC_state;
7023 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7025 RExC_state = copyRExC_state;
7026 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7027 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7029 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7030 StructCopy(&zero_scan_data, &data, scan_data_t);
7033 StructCopy(&zero_scan_data, &data, scan_data_t);
7036 /* Dig out information for optimizations. */
7037 r->extflags = RExC_flags; /* was pm_op */
7038 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7041 SvUTF8_on(rx); /* Unicode in it? */
7042 ri->regstclass = NULL;
7043 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
7044 r->intflags |= PREGf_NAUGHTY;
7045 scan = ri->program + 1; /* First BRANCH. */
7047 /* testing for BRANCH here tells us whether there is "must appear"
7048 data in the pattern. If there is then we can use it for optimisations */
7049 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7052 STRLEN longest_float_length, longest_fixed_length;
7053 regnode_ssc ch_class; /* pointed to by data */
7055 SSize_t last_close = 0; /* pointed to by data */
7056 regnode *first= scan;
7057 regnode *first_next= regnext(first);
7059 * Skip introductions and multiplicators >= 1
7060 * so that we can extract the 'meat' of the pattern that must
7061 * match in the large if() sequence following.
7062 * NOTE that EXACT is NOT covered here, as it is normally
7063 * picked up by the optimiser separately.
7065 * This is unfortunate as the optimiser isnt handling lookahead
7066 * properly currently.
7069 while ((OP(first) == OPEN && (sawopen = 1)) ||
7070 /* An OR of *one* alternative - should not happen now. */
7071 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7072 /* for now we can't handle lookbehind IFMATCH*/
7073 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7074 (OP(first) == PLUS) ||
7075 (OP(first) == MINMOD) ||
7076 /* An {n,m} with n>0 */
7077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7078 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7081 * the only op that could be a regnode is PLUS, all the rest
7082 * will be regnode_1 or regnode_2.
7084 * (yves doesn't think this is true)
7086 if (OP(first) == PLUS)
7089 if (OP(first) == MINMOD)
7091 first += regarglen[OP(first)];
7093 first = NEXTOPER(first);
7094 first_next= regnext(first);
7097 /* Starting-point info. */
7099 DEBUG_PEEP("first:",first,0);
7100 /* Ignore EXACT as we deal with it later. */
7101 if (PL_regkind[OP(first)] == EXACT) {
7102 if (OP(first) == EXACT)
7103 NOOP; /* Empty, get anchored substr later. */
7105 ri->regstclass = first;
7108 else if (PL_regkind[OP(first)] == TRIE &&
7109 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7111 /* this can happen only on restudy */
7112 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7115 else if (REGNODE_SIMPLE(OP(first)))
7116 ri->regstclass = first;
7117 else if (PL_regkind[OP(first)] == BOUND ||
7118 PL_regkind[OP(first)] == NBOUND)
7119 ri->regstclass = first;
7120 else if (PL_regkind[OP(first)] == BOL) {
7121 r->intflags |= (OP(first) == MBOL
7124 first = NEXTOPER(first);
7127 else if (OP(first) == GPOS) {
7128 r->intflags |= PREGf_ANCH_GPOS;
7129 first = NEXTOPER(first);
7132 else if ((!sawopen || !RExC_sawback) &&
7134 (OP(first) == STAR &&
7135 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7136 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7138 /* turn .* into ^.* with an implied $*=1 */
7140 (OP(NEXTOPER(first)) == REG_ANY)
7143 r->intflags |= (type | PREGf_IMPLICIT);
7144 first = NEXTOPER(first);
7147 if (sawplus && !sawminmod && !sawlookahead
7148 && (!sawopen || !RExC_sawback)
7149 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7150 /* x+ must match at the 1st pos of run of x's */
7151 r->intflags |= PREGf_SKIP;
7153 /* Scan is after the zeroth branch, first is atomic matcher. */
7154 #ifdef TRIE_STUDY_OPT
7157 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7158 (IV)(first - scan + 1))
7162 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7163 (IV)(first - scan + 1))
7169 * If there's something expensive in the r.e., find the
7170 * longest literal string that must appear and make it the
7171 * regmust. Resolve ties in favor of later strings, since
7172 * the regstart check works with the beginning of the r.e.
7173 * and avoiding duplication strengthens checking. Not a
7174 * strong reason, but sufficient in the absence of others.
7175 * [Now we resolve ties in favor of the earlier string if
7176 * it happens that c_offset_min has been invalidated, since the
7177 * earlier string may buy us something the later one won't.]
7180 data.longest_fixed = newSVpvs("");
7181 data.longest_float = newSVpvs("");
7182 data.last_found = newSVpvs("");
7183 data.longest = &(data.longest_fixed);
7184 ENTER_with_name("study_chunk");
7185 SAVEFREESV(data.longest_fixed);
7186 SAVEFREESV(data.longest_float);
7187 SAVEFREESV(data.last_found);
7189 if (!ri->regstclass) {
7190 ssc_init(pRExC_state, &ch_class);
7191 data.start_class = &ch_class;
7192 stclass_flag = SCF_DO_STCLASS_AND;
7193 } else /* XXXX Check for BOUND? */
7195 data.last_closep = &last_close;
7198 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7199 scan + RExC_size, /* Up to end */
7201 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7202 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7206 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7209 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7210 && data.last_start_min == 0 && data.last_end > 0
7211 && !RExC_seen_zerolen
7212 && !(RExC_seen & REG_VERBARG_SEEN)
7213 && !(RExC_seen & REG_GPOS_SEEN)
7215 r->extflags |= RXf_CHECK_ALL;
7217 scan_commit(pRExC_state, &data,&minlen,0);
7219 longest_float_length = CHR_SVLEN(data.longest_float);
7221 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7222 && data.offset_fixed == data.offset_float_min
7223 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7224 && S_setup_longest (aTHX_ pRExC_state,
7228 &(r->float_end_shift),
7229 data.lookbehind_float,
7230 data.offset_float_min,
7232 longest_float_length,
7233 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7234 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7236 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7237 r->float_max_offset = data.offset_float_max;
7238 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7239 r->float_max_offset -= data.lookbehind_float;
7240 SvREFCNT_inc_simple_void_NN(data.longest_float);
7243 r->float_substr = r->float_utf8 = NULL;
7244 longest_float_length = 0;
7247 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7249 if (S_setup_longest (aTHX_ pRExC_state,
7251 &(r->anchored_utf8),
7252 &(r->anchored_substr),
7253 &(r->anchored_end_shift),
7254 data.lookbehind_fixed,
7257 longest_fixed_length,
7258 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7259 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7261 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7262 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7265 r->anchored_substr = r->anchored_utf8 = NULL;
7266 longest_fixed_length = 0;
7268 LEAVE_with_name("study_chunk");
7271 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7272 ri->regstclass = NULL;
7274 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7276 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7277 && is_ssc_worth_it(pRExC_state, data.start_class))
7279 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7281 ssc_finalize(pRExC_state, data.start_class);
7283 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7284 StructCopy(data.start_class,
7285 (regnode_ssc*)RExC_rxi->data->data[n],
7287 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7288 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7289 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7290 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7291 PerlIO_printf(Perl_debug_log,
7292 "synthetic stclass \"%s\".\n",
7293 SvPVX_const(sv));});
7294 data.start_class = NULL;
7297 /* A temporary algorithm prefers floated substr to fixed one to dig
7299 if (longest_fixed_length > longest_float_length) {
7300 r->substrs->check_ix = 0;
7301 r->check_end_shift = r->anchored_end_shift;
7302 r->check_substr = r->anchored_substr;
7303 r->check_utf8 = r->anchored_utf8;
7304 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7305 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7306 r->intflags |= PREGf_NOSCAN;
7309 r->substrs->check_ix = 1;
7310 r->check_end_shift = r->float_end_shift;
7311 r->check_substr = r->float_substr;
7312 r->check_utf8 = r->float_utf8;
7313 r->check_offset_min = r->float_min_offset;
7314 r->check_offset_max = r->float_max_offset;
7316 if ((r->check_substr || r->check_utf8) ) {
7317 r->extflags |= RXf_USE_INTUIT;
7318 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7319 r->extflags |= RXf_INTUIT_TAIL;
7321 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7323 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7324 if ( (STRLEN)minlen < longest_float_length )
7325 minlen= longest_float_length;
7326 if ( (STRLEN)minlen < longest_fixed_length )
7327 minlen= longest_fixed_length;
7331 /* Several toplevels. Best we can is to set minlen. */
7333 regnode_ssc ch_class;
7334 SSize_t last_close = 0;
7336 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7338 scan = ri->program + 1;
7339 ssc_init(pRExC_state, &ch_class);
7340 data.start_class = &ch_class;
7341 data.last_closep = &last_close;
7344 minlen = study_chunk(pRExC_state,
7345 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7346 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7347 ? SCF_TRIE_DOING_RESTUDY
7351 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7353 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7354 = r->float_substr = r->float_utf8 = NULL;
7356 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7357 && is_ssc_worth_it(pRExC_state, data.start_class))
7359 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7361 ssc_finalize(pRExC_state, data.start_class);
7363 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7364 StructCopy(data.start_class,
7365 (regnode_ssc*)RExC_rxi->data->data[n],
7367 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7368 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7369 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7370 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7371 PerlIO_printf(Perl_debug_log,
7372 "synthetic stclass \"%s\".\n",
7373 SvPVX_const(sv));});
7374 data.start_class = NULL;
7378 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7379 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7380 r->maxlen = REG_INFTY;
7383 r->maxlen = RExC_maxlen;
7386 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7387 the "real" pattern. */
7389 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7390 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7392 r->minlenret = minlen;
7393 if (r->minlen < minlen)
7396 if (RExC_seen & REG_GPOS_SEEN)
7397 r->intflags |= PREGf_GPOS_SEEN;
7398 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7399 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7401 if (pRExC_state->num_code_blocks)
7402 r->extflags |= RXf_EVAL_SEEN;
7403 if (RExC_seen & REG_CANY_SEEN)
7404 r->intflags |= PREGf_CANY_SEEN;
7405 if (RExC_seen & REG_VERBARG_SEEN)
7407 r->intflags |= PREGf_VERBARG_SEEN;
7408 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7410 if (RExC_seen & REG_CUTGROUP_SEEN)
7411 r->intflags |= PREGf_CUTGROUP_SEEN;
7412 if (pm_flags & PMf_USE_RE_EVAL)
7413 r->intflags |= PREGf_USE_RE_EVAL;
7414 if (RExC_paren_names)
7415 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7417 RXp_PAREN_NAMES(r) = NULL;
7419 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7420 * so it can be used in pp.c */
7421 if (r->intflags & PREGf_ANCH)
7422 r->extflags |= RXf_IS_ANCHORED;
7426 /* this is used to identify "special" patterns that might result
7427 * in Perl NOT calling the regex engine and instead doing the match "itself",
7428 * particularly special cases in split//. By having the regex compiler
7429 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7430 * we avoid weird issues with equivalent patterns resulting in different behavior,
7431 * AND we allow non Perl engines to get the same optimizations by the setting the
7432 * flags appropriately - Yves */
7433 regnode *first = ri->program + 1;
7435 regnode *next = NEXTOPER(first);
7438 if (PL_regkind[fop] == NOTHING && nop == END)
7439 r->extflags |= RXf_NULL;
7440 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7441 /* when fop is SBOL first->flags will be true only when it was
7442 * produced by parsing /\A/, and not when parsing /^/. This is
7443 * very important for the split code as there we want to
7444 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7445 * See rt #122761 for more details. -- Yves */
7446 r->extflags |= RXf_START_ONLY;
7447 else if (fop == PLUS
7448 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7449 && OP(regnext(first)) == END)
7450 r->extflags |= RXf_WHITE;
7451 else if ( r->extflags & RXf_SPLIT
7453 && STR_LEN(first) == 1
7454 && *(STRING(first)) == ' '
7455 && OP(regnext(first)) == END )
7456 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7460 if (RExC_contains_locale) {
7461 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7465 if (RExC_paren_names) {
7466 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7467 ri->data->data[ri->name_list_idx]
7468 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7471 ri->name_list_idx = 0;
7473 if (RExC_recurse_count) {
7474 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7475 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7476 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7479 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7480 /* assume we don't need to swap parens around before we match */
7482 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7483 (unsigned long)RExC_study_chunk_recursed_count);
7487 PerlIO_printf(Perl_debug_log,"Final program:\n");
7490 #ifdef RE_TRACK_PATTERN_OFFSETS
7491 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7492 const STRLEN len = ri->u.offsets[0];
7494 GET_RE_DEBUG_FLAGS_DECL;
7495 PerlIO_printf(Perl_debug_log,
7496 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7497 for (i = 1; i <= len; i++) {
7498 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7499 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7500 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7502 PerlIO_printf(Perl_debug_log, "\n");
7507 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7508 * by setting the regexp SV to readonly-only instead. If the
7509 * pattern's been recompiled, the USEDness should remain. */
7510 if (old_re && SvREADONLY(old_re))
7518 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7521 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7523 PERL_UNUSED_ARG(value);
7525 if (flags & RXapif_FETCH) {
7526 return reg_named_buff_fetch(rx, key, flags);
7527 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7528 Perl_croak_no_modify();
7530 } else if (flags & RXapif_EXISTS) {
7531 return reg_named_buff_exists(rx, key, flags)
7534 } else if (flags & RXapif_REGNAMES) {
7535 return reg_named_buff_all(rx, flags);
7536 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7537 return reg_named_buff_scalar(rx, flags);
7539 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7545 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7548 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7549 PERL_UNUSED_ARG(lastkey);
7551 if (flags & RXapif_FIRSTKEY)
7552 return reg_named_buff_firstkey(rx, flags);
7553 else if (flags & RXapif_NEXTKEY)
7554 return reg_named_buff_nextkey(rx, flags);
7556 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7563 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7566 AV *retarray = NULL;
7568 struct regexp *const rx = ReANY(r);
7570 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7572 if (flags & RXapif_ALL)
7575 if (rx && RXp_PAREN_NAMES(rx)) {
7576 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7579 SV* sv_dat=HeVAL(he_str);
7580 I32 *nums=(I32*)SvPVX(sv_dat);
7581 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7582 if ((I32)(rx->nparens) >= nums[i]
7583 && rx->offs[nums[i]].start != -1
7584 && rx->offs[nums[i]].end != -1)
7587 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7592 ret = newSVsv(&PL_sv_undef);
7595 av_push(retarray, ret);
7598 return newRV_noinc(MUTABLE_SV(retarray));
7605 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7608 struct regexp *const rx = ReANY(r);
7610 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7612 if (rx && RXp_PAREN_NAMES(rx)) {
7613 if (flags & RXapif_ALL) {
7614 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7616 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7618 SvREFCNT_dec_NN(sv);
7630 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7632 struct regexp *const rx = ReANY(r);
7634 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7636 if ( rx && RXp_PAREN_NAMES(rx) ) {
7637 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7639 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7646 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7648 struct regexp *const rx = ReANY(r);
7649 GET_RE_DEBUG_FLAGS_DECL;
7651 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7653 if (rx && RXp_PAREN_NAMES(rx)) {
7654 HV *hv = RXp_PAREN_NAMES(rx);
7656 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7659 SV* sv_dat = HeVAL(temphe);
7660 I32 *nums = (I32*)SvPVX(sv_dat);
7661 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7662 if ((I32)(rx->lastparen) >= nums[i] &&
7663 rx->offs[nums[i]].start != -1 &&
7664 rx->offs[nums[i]].end != -1)
7670 if (parno || flags & RXapif_ALL) {
7671 return newSVhek(HeKEY_hek(temphe));
7679 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7684 struct regexp *const rx = ReANY(r);
7686 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7688 if (rx && RXp_PAREN_NAMES(rx)) {
7689 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7690 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7691 } else if (flags & RXapif_ONE) {
7692 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7693 av = MUTABLE_AV(SvRV(ret));
7694 length = av_tindex(av);
7695 SvREFCNT_dec_NN(ret);
7696 return newSViv(length + 1);
7698 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7703 return &PL_sv_undef;
7707 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7709 struct regexp *const rx = ReANY(r);
7712 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7714 if (rx && RXp_PAREN_NAMES(rx)) {
7715 HV *hv= RXp_PAREN_NAMES(rx);
7717 (void)hv_iterinit(hv);
7718 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7721 SV* sv_dat = HeVAL(temphe);
7722 I32 *nums = (I32*)SvPVX(sv_dat);
7723 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7724 if ((I32)(rx->lastparen) >= nums[i] &&
7725 rx->offs[nums[i]].start != -1 &&
7726 rx->offs[nums[i]].end != -1)
7732 if (parno || flags & RXapif_ALL) {
7733 av_push(av, newSVhek(HeKEY_hek(temphe)));
7738 return newRV_noinc(MUTABLE_SV(av));
7742 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7745 struct regexp *const rx = ReANY(r);
7751 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7753 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7754 || n == RX_BUFF_IDX_CARET_FULLMATCH
7755 || n == RX_BUFF_IDX_CARET_POSTMATCH
7758 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7760 /* on something like
7763 * the KEEPCOPY is set on the PMOP rather than the regex */
7764 if (PL_curpm && r == PM_GETRE(PL_curpm))
7765 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7774 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7775 /* no need to distinguish between them any more */
7776 n = RX_BUFF_IDX_FULLMATCH;
7778 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7779 && rx->offs[0].start != -1)
7781 /* $`, ${^PREMATCH} */
7782 i = rx->offs[0].start;
7786 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7787 && rx->offs[0].end != -1)
7789 /* $', ${^POSTMATCH} */
7790 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7791 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7794 if ( 0 <= n && n <= (I32)rx->nparens &&
7795 (s1 = rx->offs[n].start) != -1 &&
7796 (t1 = rx->offs[n].end) != -1)
7798 /* $&, ${^MATCH}, $1 ... */
7800 s = rx->subbeg + s1 - rx->suboffset;
7805 assert(s >= rx->subbeg);
7806 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7808 #ifdef NO_TAINT_SUPPORT
7809 sv_setpvn(sv, s, i);
7811 const int oldtainted = TAINT_get;
7813 sv_setpvn(sv, s, i);
7814 TAINT_set(oldtainted);
7816 if ( (rx->intflags & PREGf_CANY_SEEN)
7817 ? (RXp_MATCH_UTF8(rx)
7818 && (!i || is_utf8_string((U8*)s, i)))
7819 : (RXp_MATCH_UTF8(rx)) )
7826 if (RXp_MATCH_TAINTED(rx)) {
7827 if (SvTYPE(sv) >= SVt_PVMG) {
7828 MAGIC* const mg = SvMAGIC(sv);
7831 SvMAGIC_set(sv, mg->mg_moremagic);
7833 if ((mgt = SvMAGIC(sv))) {
7834 mg->mg_moremagic = mgt;
7835 SvMAGIC_set(sv, mg);
7846 sv_setsv(sv,&PL_sv_undef);
7852 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7853 SV const * const value)
7855 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7857 PERL_UNUSED_ARG(rx);
7858 PERL_UNUSED_ARG(paren);
7859 PERL_UNUSED_ARG(value);
7862 Perl_croak_no_modify();
7866 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7869 struct regexp *const rx = ReANY(r);
7873 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7875 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7876 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7877 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7880 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7882 /* on something like
7885 * the KEEPCOPY is set on the PMOP rather than the regex */
7886 if (PL_curpm && r == PM_GETRE(PL_curpm))
7887 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7893 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7895 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7896 case RX_BUFF_IDX_PREMATCH: /* $` */
7897 if (rx->offs[0].start != -1) {
7898 i = rx->offs[0].start;
7907 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7908 case RX_BUFF_IDX_POSTMATCH: /* $' */
7909 if (rx->offs[0].end != -1) {
7910 i = rx->sublen - rx->offs[0].end;
7912 s1 = rx->offs[0].end;
7919 default: /* $& / ${^MATCH}, $1, $2, ... */
7920 if (paren <= (I32)rx->nparens &&
7921 (s1 = rx->offs[paren].start) != -1 &&
7922 (t1 = rx->offs[paren].end) != -1)
7928 if (ckWARN(WARN_UNINITIALIZED))
7929 report_uninit((const SV *)sv);
7934 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7935 const char * const s = rx->subbeg - rx->suboffset + s1;
7940 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7947 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7949 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7950 PERL_UNUSED_ARG(rx);
7954 return newSVpvs("Regexp");
7957 /* Scans the name of a named buffer from the pattern.
7958 * If flags is REG_RSN_RETURN_NULL returns null.
7959 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7960 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7961 * to the parsed name as looked up in the RExC_paren_names hash.
7962 * If there is an error throws a vFAIL().. type exception.
7965 #define REG_RSN_RETURN_NULL 0
7966 #define REG_RSN_RETURN_NAME 1
7967 #define REG_RSN_RETURN_DATA 2
7970 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7972 char *name_start = RExC_parse;
7974 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7976 assert (RExC_parse <= RExC_end);
7977 if (RExC_parse == RExC_end) NOOP;
7978 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7979 /* skip IDFIRST by using do...while */
7982 RExC_parse += UTF8SKIP(RExC_parse);
7983 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7987 } while (isWORDCHAR(*RExC_parse));
7989 RExC_parse++; /* so the <- from the vFAIL is after the offending
7991 vFAIL("Group name must start with a non-digit word character");
7995 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7996 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7997 if ( flags == REG_RSN_RETURN_NAME)
7999 else if (flags==REG_RSN_RETURN_DATA) {
8002 if ( ! sv_name ) /* should not happen*/
8003 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8004 if (RExC_paren_names)
8005 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8007 sv_dat = HeVAL(he_str);
8009 vFAIL("Reference to nonexistent named group");
8013 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8014 (unsigned long) flags);
8016 assert(0); /* NOT REACHED */
8021 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8023 if (RExC_lastparse!=RExC_parse) { \
8024 PerlIO_printf(Perl_debug_log, "%s", \
8025 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8026 RExC_end - RExC_parse, 16, \
8028 PERL_PV_ESCAPE_UNI_DETECT | \
8029 PERL_PV_PRETTY_ELLIPSES | \
8030 PERL_PV_PRETTY_LTGT | \
8031 PERL_PV_ESCAPE_RE | \
8032 PERL_PV_PRETTY_EXACTSIZE \
8036 PerlIO_printf(Perl_debug_log,"%16s",""); \
8039 num = RExC_size + 1; \
8041 num=REG_NODE_NUM(RExC_emit); \
8042 if (RExC_lastnum!=num) \
8043 PerlIO_printf(Perl_debug_log,"|%4d",num); \
8045 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8046 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8047 (int)((depth*2)), "", \
8051 RExC_lastparse=RExC_parse; \
8056 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8057 DEBUG_PARSE_MSG((funcname)); \
8058 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8060 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8061 DEBUG_PARSE_MSG((funcname)); \
8062 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8065 /* This section of code defines the inversion list object and its methods. The
8066 * interfaces are highly subject to change, so as much as possible is static to
8067 * this file. An inversion list is here implemented as a malloc'd C UV array
8068 * as an SVt_INVLIST scalar.
8070 * An inversion list for Unicode is an array of code points, sorted by ordinal
8071 * number. The zeroth element is the first code point in the list. The 1th
8072 * element is the first element beyond that not in the list. In other words,
8073 * the first range is
8074 * invlist[0]..(invlist[1]-1)
8075 * The other ranges follow. Thus every element whose index is divisible by two
8076 * marks the beginning of a range that is in the list, and every element not
8077 * divisible by two marks the beginning of a range not in the list. A single
8078 * element inversion list that contains the single code point N generally
8079 * consists of two elements
8082 * (The exception is when N is the highest representable value on the
8083 * machine, in which case the list containing just it would be a single
8084 * element, itself. By extension, if the last range in the list extends to
8085 * infinity, then the first element of that range will be in the inversion list
8086 * at a position that is divisible by two, and is the final element in the
8088 * Taking the complement (inverting) an inversion list is quite simple, if the
8089 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8090 * This implementation reserves an element at the beginning of each inversion
8091 * list to always contain 0; there is an additional flag in the header which
8092 * indicates if the list begins at the 0, or is offset to begin at the next
8095 * More about inversion lists can be found in "Unicode Demystified"
8096 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8097 * More will be coming when functionality is added later.
8099 * The inversion list data structure is currently implemented as an SV pointing
8100 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8101 * array of UV whose memory management is automatically handled by the existing
8102 * facilities for SV's.
8104 * Some of the methods should always be private to the implementation, and some
8105 * should eventually be made public */
8107 /* The header definitions are in F<inline_invlist.c> */
8109 PERL_STATIC_INLINE UV*
8110 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8112 /* Returns a pointer to the first element in the inversion list's array.
8113 * This is called upon initialization of an inversion list. Where the
8114 * array begins depends on whether the list has the code point U+0000 in it
8115 * or not. The other parameter tells it whether the code that follows this
8116 * call is about to put a 0 in the inversion list or not. The first
8117 * element is either the element reserved for 0, if TRUE, or the element
8118 * after it, if FALSE */
8120 bool* offset = get_invlist_offset_addr(invlist);
8121 UV* zero_addr = (UV *) SvPVX(invlist);
8123 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8126 assert(! _invlist_len(invlist));
8130 /* 1^1 = 0; 1^0 = 1 */
8131 *offset = 1 ^ will_have_0;
8132 return zero_addr + *offset;
8135 PERL_STATIC_INLINE UV*
8136 S_invlist_array(SV* const invlist)
8138 /* Returns the pointer to the inversion list's array. Every time the
8139 * length changes, this needs to be called in case malloc or realloc moved
8142 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8144 /* Must not be empty. If these fail, you probably didn't check for <len>
8145 * being non-zero before trying to get the array */
8146 assert(_invlist_len(invlist));
8148 /* The very first element always contains zero, The array begins either
8149 * there, or if the inversion list is offset, at the element after it.
8150 * The offset header field determines which; it contains 0 or 1 to indicate
8151 * how much additionally to add */
8152 assert(0 == *(SvPVX(invlist)));
8153 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8156 PERL_STATIC_INLINE void
8157 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8159 /* Sets the current number of elements stored in the inversion list.
8160 * Updates SvCUR correspondingly */
8161 PERL_UNUSED_CONTEXT;
8162 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8164 assert(SvTYPE(invlist) == SVt_INVLIST);
8169 : TO_INTERNAL_SIZE(len + offset));
8170 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8173 PERL_STATIC_INLINE IV*
8174 S_get_invlist_previous_index_addr(SV* invlist)
8176 /* Return the address of the IV that is reserved to hold the cached index
8178 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8180 assert(SvTYPE(invlist) == SVt_INVLIST);
8182 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8185 PERL_STATIC_INLINE IV
8186 S_invlist_previous_index(SV* const invlist)
8188 /* Returns cached index of previous search */
8190 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8192 return *get_invlist_previous_index_addr(invlist);
8195 PERL_STATIC_INLINE void
8196 S_invlist_set_previous_index(SV* const invlist, const IV index)
8198 /* Caches <index> for later retrieval */
8200 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8202 assert(index == 0 || index < (int) _invlist_len(invlist));
8204 *get_invlist_previous_index_addr(invlist) = index;
8207 PERL_STATIC_INLINE UV
8208 S_invlist_max(SV* const invlist)
8210 /* Returns the maximum number of elements storable in the inversion list's
8211 * array, without having to realloc() */
8213 PERL_ARGS_ASSERT_INVLIST_MAX;
8215 assert(SvTYPE(invlist) == SVt_INVLIST);
8217 /* Assumes worst case, in which the 0 element is not counted in the
8218 * inversion list, so subtracts 1 for that */
8219 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8220 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8221 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8224 #ifndef PERL_IN_XSUB_RE
8226 Perl__new_invlist(pTHX_ IV initial_size)
8229 /* Return a pointer to a newly constructed inversion list, with enough
8230 * space to store 'initial_size' elements. If that number is negative, a
8231 * system default is used instead */
8235 if (initial_size < 0) {
8239 /* Allocate the initial space */
8240 new_list = newSV_type(SVt_INVLIST);
8242 /* First 1 is in case the zero element isn't in the list; second 1 is for
8244 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8245 invlist_set_len(new_list, 0, 0);
8247 /* Force iterinit() to be used to get iteration to work */
8248 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8250 *get_invlist_previous_index_addr(new_list) = 0;
8256 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8258 /* Return a pointer to a newly constructed inversion list, initialized to
8259 * point to <list>, which has to be in the exact correct inversion list
8260 * form, including internal fields. Thus this is a dangerous routine that
8261 * should not be used in the wrong hands. The passed in 'list' contains
8262 * several header fields at the beginning that are not part of the
8263 * inversion list body proper */
8265 const STRLEN length = (STRLEN) list[0];
8266 const UV version_id = list[1];
8267 const bool offset = cBOOL(list[2]);
8268 #define HEADER_LENGTH 3
8269 /* If any of the above changes in any way, you must change HEADER_LENGTH
8270 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8271 * perl -E 'say int(rand 2**31-1)'
8273 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8274 data structure type, so that one being
8275 passed in can be validated to be an
8276 inversion list of the correct vintage.
8279 SV* invlist = newSV_type(SVt_INVLIST);
8281 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8283 if (version_id != INVLIST_VERSION_ID) {
8284 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8287 /* The generated array passed in includes header elements that aren't part
8288 * of the list proper, so start it just after them */
8289 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8291 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8292 shouldn't touch it */
8294 *(get_invlist_offset_addr(invlist)) = offset;
8296 /* The 'length' passed to us is the physical number of elements in the
8297 * inversion list. But if there is an offset the logical number is one
8299 invlist_set_len(invlist, length - offset, offset);
8301 invlist_set_previous_index(invlist, 0);
8303 /* Initialize the iteration pointer. */
8304 invlist_iterfinish(invlist);
8306 SvREADONLY_on(invlist);
8310 #endif /* ifndef PERL_IN_XSUB_RE */
8313 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8315 /* Grow the maximum size of an inversion list */
8317 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8319 assert(SvTYPE(invlist) == SVt_INVLIST);
8321 /* Add one to account for the zero element at the beginning which may not
8322 * be counted by the calling parameters */
8323 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8326 PERL_STATIC_INLINE void
8327 S_invlist_trim(SV* const invlist)
8329 PERL_ARGS_ASSERT_INVLIST_TRIM;
8331 assert(SvTYPE(invlist) == SVt_INVLIST);
8333 /* Change the length of the inversion list to how many entries it currently
8335 SvPV_shrink_to_cur((SV *) invlist);
8339 S__append_range_to_invlist(pTHX_ SV* const invlist,
8340 const UV start, const UV end)
8342 /* Subject to change or removal. Append the range from 'start' to 'end' at
8343 * the end of the inversion list. The range must be above any existing
8347 UV max = invlist_max(invlist);
8348 UV len = _invlist_len(invlist);
8351 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8353 if (len == 0) { /* Empty lists must be initialized */
8354 offset = start != 0;
8355 array = _invlist_array_init(invlist, ! offset);
8358 /* Here, the existing list is non-empty. The current max entry in the
8359 * list is generally the first value not in the set, except when the
8360 * set extends to the end of permissible values, in which case it is
8361 * the first entry in that final set, and so this call is an attempt to
8362 * append out-of-order */
8364 UV final_element = len - 1;
8365 array = invlist_array(invlist);
8366 if (array[final_element] > start
8367 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8369 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8370 array[final_element], start,
8371 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8374 /* Here, it is a legal append. If the new range begins with the first
8375 * value not in the set, it is extending the set, so the new first
8376 * value not in the set is one greater than the newly extended range.
8378 offset = *get_invlist_offset_addr(invlist);
8379 if (array[final_element] == start) {
8380 if (end != UV_MAX) {
8381 array[final_element] = end + 1;
8384 /* But if the end is the maximum representable on the machine,
8385 * just let the range that this would extend to have no end */
8386 invlist_set_len(invlist, len - 1, offset);
8392 /* Here the new range doesn't extend any existing set. Add it */
8394 len += 2; /* Includes an element each for the start and end of range */
8396 /* If wll overflow the existing space, extend, which may cause the array to
8399 invlist_extend(invlist, len);
8401 /* Have to set len here to avoid assert failure in invlist_array() */
8402 invlist_set_len(invlist, len, offset);
8404 array = invlist_array(invlist);
8407 invlist_set_len(invlist, len, offset);
8410 /* The next item on the list starts the range, the one after that is
8411 * one past the new range. */
8412 array[len - 2] = start;
8413 if (end != UV_MAX) {
8414 array[len - 1] = end + 1;
8417 /* But if the end is the maximum representable on the machine, just let
8418 * the range have no end */
8419 invlist_set_len(invlist, len - 1, offset);
8423 #ifndef PERL_IN_XSUB_RE
8426 Perl__invlist_search(SV* const invlist, const UV cp)
8428 /* Searches the inversion list for the entry that contains the input code
8429 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8430 * return value is the index into the list's array of the range that
8435 IV high = _invlist_len(invlist);
8436 const IV highest_element = high - 1;
8439 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8441 /* If list is empty, return failure. */
8446 /* (We can't get the array unless we know the list is non-empty) */
8447 array = invlist_array(invlist);
8449 mid = invlist_previous_index(invlist);
8450 assert(mid >=0 && mid <= highest_element);
8452 /* <mid> contains the cache of the result of the previous call to this
8453 * function (0 the first time). See if this call is for the same result,
8454 * or if it is for mid-1. This is under the theory that calls to this
8455 * function will often be for related code points that are near each other.
8456 * And benchmarks show that caching gives better results. We also test
8457 * here if the code point is within the bounds of the list. These tests
8458 * replace others that would have had to be made anyway to make sure that
8459 * the array bounds were not exceeded, and these give us extra information
8460 * at the same time */
8461 if (cp >= array[mid]) {
8462 if (cp >= array[highest_element]) {
8463 return highest_element;
8466 /* Here, array[mid] <= cp < array[highest_element]. This means that
8467 * the final element is not the answer, so can exclude it; it also
8468 * means that <mid> is not the final element, so can refer to 'mid + 1'
8470 if (cp < array[mid + 1]) {
8476 else { /* cp < aray[mid] */
8477 if (cp < array[0]) { /* Fail if outside the array */
8481 if (cp >= array[mid - 1]) {
8486 /* Binary search. What we are looking for is <i> such that
8487 * array[i] <= cp < array[i+1]
8488 * The loop below converges on the i+1. Note that there may not be an
8489 * (i+1)th element in the array, and things work nonetheless */
8490 while (low < high) {
8491 mid = (low + high) / 2;
8492 assert(mid <= highest_element);
8493 if (array[mid] <= cp) { /* cp >= array[mid] */
8496 /* We could do this extra test to exit the loop early.
8497 if (cp < array[low]) {
8502 else { /* cp < array[mid] */
8509 invlist_set_previous_index(invlist, high);
8514 Perl__invlist_populate_swatch(SV* const invlist,
8515 const UV start, const UV end, U8* swatch)
8517 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8518 * but is used when the swash has an inversion list. This makes this much
8519 * faster, as it uses a binary search instead of a linear one. This is
8520 * intimately tied to that function, and perhaps should be in utf8.c,
8521 * except it is intimately tied to inversion lists as well. It assumes
8522 * that <swatch> is all 0's on input */
8525 const IV len = _invlist_len(invlist);
8529 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8531 if (len == 0) { /* Empty inversion list */
8535 array = invlist_array(invlist);
8537 /* Find which element it is */
8538 i = _invlist_search(invlist, start);
8540 /* We populate from <start> to <end> */
8541 while (current < end) {
8544 /* The inversion list gives the results for every possible code point
8545 * after the first one in the list. Only those ranges whose index is
8546 * even are ones that the inversion list matches. For the odd ones,
8547 * and if the initial code point is not in the list, we have to skip
8548 * forward to the next element */
8549 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8551 if (i >= len) { /* Finished if beyond the end of the array */
8555 if (current >= end) { /* Finished if beyond the end of what we
8557 if (LIKELY(end < UV_MAX)) {
8561 /* We get here when the upper bound is the maximum
8562 * representable on the machine, and we are looking for just
8563 * that code point. Have to special case it */
8565 goto join_end_of_list;
8568 assert(current >= start);
8570 /* The current range ends one below the next one, except don't go past
8573 upper = (i < len && array[i] < end) ? array[i] : end;
8575 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8576 * for each code point in it */
8577 for (; current < upper; current++) {
8578 const STRLEN offset = (STRLEN)(current - start);
8579 swatch[offset >> 3] |= 1 << (offset & 7);
8584 /* Quit if at the end of the list */
8587 /* But first, have to deal with the highest possible code point on
8588 * the platform. The previous code assumes that <end> is one
8589 * beyond where we want to populate, but that is impossible at the
8590 * platform's infinity, so have to handle it specially */
8591 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8593 const STRLEN offset = (STRLEN)(end - start);
8594 swatch[offset >> 3] |= 1 << (offset & 7);
8599 /* Advance to the next range, which will be for code points not in the
8608 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8609 const bool complement_b, SV** output)
8611 /* Take the union of two inversion lists and point <output> to it. *output
8612 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8613 * the reference count to that list will be decremented if not already a
8614 * temporary (mortal); otherwise *output will be made correspondingly
8615 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8616 * second list is returned. If <complement_b> is TRUE, the union is taken
8617 * of the complement (inversion) of <b> instead of b itself.
8619 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8620 * Richard Gillam, published by Addison-Wesley, and explained at some
8621 * length there. The preface says to incorporate its examples into your
8622 * code at your own risk.
8624 * The algorithm is like a merge sort.
8626 * XXX A potential performance improvement is to keep track as we go along
8627 * if only one of the inputs contributes to the result, meaning the other
8628 * is a subset of that one. In that case, we can skip the final copy and
8629 * return the larger of the input lists, but then outside code might need
8630 * to keep track of whether to free the input list or not */
8632 const UV* array_a; /* a's array */
8634 UV len_a; /* length of a's array */
8637 SV* u; /* the resulting union */
8641 UV i_a = 0; /* current index into a's array */
8645 /* running count, as explained in the algorithm source book; items are
8646 * stopped accumulating and are output when the count changes to/from 0.
8647 * The count is incremented when we start a range that's in the set, and
8648 * decremented when we start a range that's not in the set. So its range
8649 * is 0 to 2. Only when the count is zero is something not in the set.
8653 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8656 /* If either one is empty, the union is the other one */
8657 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8658 bool make_temp = FALSE; /* Should we mortalize the result? */
8662 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8668 *output = invlist_clone(b);
8670 _invlist_invert(*output);
8672 } /* else *output already = b; */
8675 sv_2mortal(*output);
8679 else if ((len_b = _invlist_len(b)) == 0) {
8680 bool make_temp = FALSE;
8682 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8687 /* The complement of an empty list is a list that has everything in it,
8688 * so the union with <a> includes everything too */
8691 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8695 *output = _new_invlist(1);
8696 _append_range_to_invlist(*output, 0, UV_MAX);
8698 else if (*output != a) {
8699 *output = invlist_clone(a);
8701 /* else *output already = a; */
8704 sv_2mortal(*output);
8709 /* Here both lists exist and are non-empty */
8710 array_a = invlist_array(a);
8711 array_b = invlist_array(b);
8713 /* If are to take the union of 'a' with the complement of b, set it
8714 * up so are looking at b's complement. */
8717 /* To complement, we invert: if the first element is 0, remove it. To
8718 * do this, we just pretend the array starts one later */
8719 if (array_b[0] == 0) {
8725 /* But if the first element is not zero, we pretend the list starts
8726 * at the 0 that is always stored immediately before the array. */
8732 /* Size the union for the worst case: that the sets are completely
8734 u = _new_invlist(len_a + len_b);
8736 /* Will contain U+0000 if either component does */
8737 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8738 || (len_b > 0 && array_b[0] == 0));
8740 /* Go through each list item by item, stopping when exhausted one of
8742 while (i_a < len_a && i_b < len_b) {
8743 UV cp; /* The element to potentially add to the union's array */
8744 bool cp_in_set; /* is it in the the input list's set or not */
8746 /* We need to take one or the other of the two inputs for the union.
8747 * Since we are merging two sorted lists, we take the smaller of the
8748 * next items. In case of a tie, we take the one that is in its set
8749 * first. If we took one not in the set first, it would decrement the
8750 * count, possibly to 0 which would cause it to be output as ending the
8751 * range, and the next time through we would take the same number, and
8752 * output it again as beginning the next range. By doing it the
8753 * opposite way, there is no possibility that the count will be
8754 * momentarily decremented to 0, and thus the two adjoining ranges will
8755 * be seamlessly merged. (In a tie and both are in the set or both not
8756 * in the set, it doesn't matter which we take first.) */
8757 if (array_a[i_a] < array_b[i_b]
8758 || (array_a[i_a] == array_b[i_b]
8759 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8761 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8765 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8766 cp = array_b[i_b++];
8769 /* Here, have chosen which of the two inputs to look at. Only output
8770 * if the running count changes to/from 0, which marks the
8771 * beginning/end of a range in that's in the set */
8774 array_u[i_u++] = cp;
8781 array_u[i_u++] = cp;
8786 /* Here, we are finished going through at least one of the lists, which
8787 * means there is something remaining in at most one. We check if the list
8788 * that hasn't been exhausted is positioned such that we are in the middle
8789 * of a range in its set or not. (i_a and i_b point to the element beyond
8790 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8791 * is potentially more to output.
8792 * There are four cases:
8793 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8794 * in the union is entirely from the non-exhausted set.
8795 * 2) Both were in their sets, count is 2. Nothing further should
8796 * be output, as everything that remains will be in the exhausted
8797 * list's set, hence in the union; decrementing to 1 but not 0 insures
8799 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8800 * Nothing further should be output because the union includes
8801 * everything from the exhausted set. Not decrementing ensures that.
8802 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8803 * decrementing to 0 insures that we look at the remainder of the
8804 * non-exhausted set */
8805 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8806 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8811 /* The final length is what we've output so far, plus what else is about to
8812 * be output. (If 'count' is non-zero, then the input list we exhausted
8813 * has everything remaining up to the machine's limit in its set, and hence
8814 * in the union, so there will be no further output. */
8817 /* At most one of the subexpressions will be non-zero */
8818 len_u += (len_a - i_a) + (len_b - i_b);
8821 /* Set result to final length, which can change the pointer to array_u, so
8823 if (len_u != _invlist_len(u)) {
8824 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8826 array_u = invlist_array(u);
8829 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8830 * the other) ended with everything above it not in its set. That means
8831 * that the remaining part of the union is precisely the same as the
8832 * non-exhausted list, so can just copy it unchanged. (If both list were
8833 * exhausted at the same time, then the operations below will be both 0.)
8836 IV copy_count; /* At most one will have a non-zero copy count */
8837 if ((copy_count = len_a - i_a) > 0) {
8838 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8840 else if ((copy_count = len_b - i_b) > 0) {
8841 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8845 /* We may be removing a reference to one of the inputs. If so, the output
8846 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8847 * count decremented) */
8848 if (a == *output || b == *output) {
8849 assert(! invlist_is_iterating(*output));
8850 if ((SvTEMP(*output))) {
8854 SvREFCNT_dec_NN(*output);
8864 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8865 const bool complement_b, SV** i)
8867 /* Take the intersection of two inversion lists and point <i> to it. *i
8868 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8869 * the reference count to that list will be decremented if not already a
8870 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8871 * The first list, <a>, may be NULL, in which case an empty list is
8872 * returned. If <complement_b> is TRUE, the result will be the
8873 * intersection of <a> and the complement (or inversion) of <b> instead of
8876 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8877 * Richard Gillam, published by Addison-Wesley, and explained at some
8878 * length there. The preface says to incorporate its examples into your
8879 * code at your own risk. In fact, it had bugs
8881 * The algorithm is like a merge sort, and is essentially the same as the
8885 const UV* array_a; /* a's array */
8887 UV len_a; /* length of a's array */
8890 SV* r; /* the resulting intersection */
8894 UV i_a = 0; /* current index into a's array */
8898 /* running count, as explained in the algorithm source book; items are
8899 * stopped accumulating and are output when the count changes to/from 2.
8900 * The count is incremented when we start a range that's in the set, and
8901 * decremented when we start a range that's not in the set. So its range
8902 * is 0 to 2. Only when the count is 2 is something in the intersection.
8906 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8909 /* Special case if either one is empty */
8910 len_a = (a == NULL) ? 0 : _invlist_len(a);
8911 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8912 bool make_temp = FALSE;
8914 if (len_a != 0 && complement_b) {
8916 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8917 * be empty. Here, also we are using 'b's complement, which hence
8918 * must be every possible code point. Thus the intersection is
8922 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8927 *i = invlist_clone(a);
8929 /* else *i is already 'a' */
8937 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8938 * intersection must be empty */
8940 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8945 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8949 *i = _new_invlist(0);
8957 /* Here both lists exist and are non-empty */
8958 array_a = invlist_array(a);
8959 array_b = invlist_array(b);
8961 /* If are to take the intersection of 'a' with the complement of b, set it
8962 * up so are looking at b's complement. */
8965 /* To complement, we invert: if the first element is 0, remove it. To
8966 * do this, we just pretend the array starts one later */
8967 if (array_b[0] == 0) {
8973 /* But if the first element is not zero, we pretend the list starts
8974 * at the 0 that is always stored immediately before the array. */
8980 /* Size the intersection for the worst case: that the intersection ends up
8981 * fragmenting everything to be completely disjoint */
8982 r= _new_invlist(len_a + len_b);
8984 /* Will contain U+0000 iff both components do */
8985 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8986 && len_b > 0 && array_b[0] == 0);
8988 /* Go through each list item by item, stopping when exhausted one of
8990 while (i_a < len_a && i_b < len_b) {
8991 UV cp; /* The element to potentially add to the intersection's
8993 bool cp_in_set; /* Is it in the input list's set or not */
8995 /* We need to take one or the other of the two inputs for the
8996 * intersection. Since we are merging two sorted lists, we take the
8997 * smaller of the next items. In case of a tie, we take the one that
8998 * is not in its set first (a difference from the union algorithm). If
8999 * we took one in the set first, it would increment the count, possibly
9000 * to 2 which would cause it to be output as starting a range in the
9001 * intersection, and the next time through we would take that same
9002 * number, and output it again as ending the set. By doing it the
9003 * opposite of this, there is no possibility that the count will be
9004 * momentarily incremented to 2. (In a tie and both are in the set or
9005 * both not in the set, it doesn't matter which we take first.) */
9006 if (array_a[i_a] < array_b[i_b]
9007 || (array_a[i_a] == array_b[i_b]
9008 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9010 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9014 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9018 /* Here, have chosen which of the two inputs to look at. Only output
9019 * if the running count changes to/from 2, which marks the
9020 * beginning/end of a range that's in the intersection */
9024 array_r[i_r++] = cp;
9029 array_r[i_r++] = cp;
9035 /* Here, we are finished going through at least one of the lists, which
9036 * means there is something remaining in at most one. We check if the list
9037 * that has been exhausted is positioned such that we are in the middle
9038 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9039 * the ones we care about.) There are four cases:
9040 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9041 * nothing left in the intersection.
9042 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9043 * above 2. What should be output is exactly that which is in the
9044 * non-exhausted set, as everything it has is also in the intersection
9045 * set, and everything it doesn't have can't be in the intersection
9046 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9047 * gets incremented to 2. Like the previous case, the intersection is
9048 * everything that remains in the non-exhausted set.
9049 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9050 * remains 1. And the intersection has nothing more. */
9051 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9052 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9057 /* The final length is what we've output so far plus what else is in the
9058 * intersection. At most one of the subexpressions below will be non-zero
9062 len_r += (len_a - i_a) + (len_b - i_b);
9065 /* Set result to final length, which can change the pointer to array_r, so
9067 if (len_r != _invlist_len(r)) {
9068 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9070 array_r = invlist_array(r);
9073 /* Finish outputting any remaining */
9074 if (count >= 2) { /* At most one will have a non-zero copy count */
9076 if ((copy_count = len_a - i_a) > 0) {
9077 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9079 else if ((copy_count = len_b - i_b) > 0) {
9080 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9084 /* We may be removing a reference to one of the inputs. If so, the output
9085 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9086 * count decremented) */
9087 if (a == *i || b == *i) {
9088 assert(! invlist_is_iterating(*i));
9093 SvREFCNT_dec_NN(*i);
9103 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9105 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9106 * set. A pointer to the inversion list is returned. This may actually be
9107 * a new list, in which case the passed in one has been destroyed. The
9108 * passed-in inversion list can be NULL, in which case a new one is created
9109 * with just the one range in it */
9114 if (invlist == NULL) {
9115 invlist = _new_invlist(2);
9119 len = _invlist_len(invlist);
9122 /* If comes after the final entry actually in the list, can just append it
9125 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9126 && start >= invlist_array(invlist)[len - 1]))
9128 _append_range_to_invlist(invlist, start, end);
9132 /* Here, can't just append things, create and return a new inversion list
9133 * which is the union of this range and the existing inversion list */
9134 range_invlist = _new_invlist(2);
9135 _append_range_to_invlist(range_invlist, start, end);
9137 _invlist_union(invlist, range_invlist, &invlist);
9139 /* The temporary can be freed */
9140 SvREFCNT_dec_NN(range_invlist);
9146 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9147 UV** other_elements_ptr)
9149 /* Create and return an inversion list whose contents are to be populated
9150 * by the caller. The caller gives the number of elements (in 'size') and
9151 * the very first element ('element0'). This function will set
9152 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9155 * Obviously there is some trust involved that the caller will properly
9156 * fill in the other elements of the array.
9158 * (The first element needs to be passed in, as the underlying code does
9159 * things differently depending on whether it is zero or non-zero) */
9161 SV* invlist = _new_invlist(size);
9164 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9166 _append_range_to_invlist(invlist, element0, element0);
9167 offset = *get_invlist_offset_addr(invlist);
9169 invlist_set_len(invlist, size, offset);
9170 *other_elements_ptr = invlist_array(invlist) + 1;
9176 PERL_STATIC_INLINE SV*
9177 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9178 return _add_range_to_invlist(invlist, cp, cp);
9181 #ifndef PERL_IN_XSUB_RE
9183 Perl__invlist_invert(pTHX_ SV* const invlist)
9185 /* Complement the input inversion list. This adds a 0 if the list didn't
9186 * have a zero; removes it otherwise. As described above, the data
9187 * structure is set up so that this is very efficient */
9189 PERL_ARGS_ASSERT__INVLIST_INVERT;
9191 assert(! invlist_is_iterating(invlist));
9193 /* The inverse of matching nothing is matching everything */
9194 if (_invlist_len(invlist) == 0) {
9195 _append_range_to_invlist(invlist, 0, UV_MAX);
9199 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9204 PERL_STATIC_INLINE SV*
9205 S_invlist_clone(pTHX_ SV* const invlist)
9208 /* Return a new inversion list that is a copy of the input one, which is
9209 * unchanged. The new list will not be mortal even if the old one was. */
9211 /* Need to allocate extra space to accommodate Perl's addition of a
9212 * trailing NUL to SvPV's, since it thinks they are always strings */
9213 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9214 STRLEN physical_length = SvCUR(invlist);
9215 bool offset = *(get_invlist_offset_addr(invlist));
9217 PERL_ARGS_ASSERT_INVLIST_CLONE;
9219 *(get_invlist_offset_addr(new_invlist)) = offset;
9220 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9221 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9226 PERL_STATIC_INLINE STRLEN*
9227 S_get_invlist_iter_addr(SV* invlist)
9229 /* Return the address of the UV that contains the current iteration
9232 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9234 assert(SvTYPE(invlist) == SVt_INVLIST);
9236 return &(((XINVLIST*) SvANY(invlist))->iterator);
9239 PERL_STATIC_INLINE void
9240 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9242 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9244 *get_invlist_iter_addr(invlist) = 0;
9247 PERL_STATIC_INLINE void
9248 S_invlist_iterfinish(SV* invlist)
9250 /* Terminate iterator for invlist. This is to catch development errors.
9251 * Any iteration that is interrupted before completed should call this
9252 * function. Functions that add code points anywhere else but to the end
9253 * of an inversion list assert that they are not in the middle of an
9254 * iteration. If they were, the addition would make the iteration
9255 * problematical: if the iteration hadn't reached the place where things
9256 * were being added, it would be ok */
9258 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9260 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9264 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9266 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9267 * This call sets in <*start> and <*end>, the next range in <invlist>.
9268 * Returns <TRUE> if successful and the next call will return the next
9269 * range; <FALSE> if was already at the end of the list. If the latter,
9270 * <*start> and <*end> are unchanged, and the next call to this function
9271 * will start over at the beginning of the list */
9273 STRLEN* pos = get_invlist_iter_addr(invlist);
9274 UV len = _invlist_len(invlist);
9277 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9280 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9284 array = invlist_array(invlist);
9286 *start = array[(*pos)++];
9292 *end = array[(*pos)++] - 1;
9298 PERL_STATIC_INLINE bool
9299 S_invlist_is_iterating(SV* const invlist)
9301 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9303 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9306 PERL_STATIC_INLINE UV
9307 S_invlist_highest(SV* const invlist)
9309 /* Returns the highest code point that matches an inversion list. This API
9310 * has an ambiguity, as it returns 0 under either the highest is actually
9311 * 0, or if the list is empty. If this distinction matters to you, check
9312 * for emptiness before calling this function */
9314 UV len = _invlist_len(invlist);
9317 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9323 array = invlist_array(invlist);
9325 /* The last element in the array in the inversion list always starts a
9326 * range that goes to infinity. That range may be for code points that are
9327 * matched in the inversion list, or it may be for ones that aren't
9328 * matched. In the latter case, the highest code point in the set is one
9329 * less than the beginning of this range; otherwise it is the final element
9330 * of this range: infinity */
9331 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9333 : array[len - 1] - 1;
9336 #ifndef PERL_IN_XSUB_RE
9338 Perl__invlist_contents(pTHX_ SV* const invlist)
9340 /* Get the contents of an inversion list into a string SV so that they can
9341 * be printed out. It uses the format traditionally done for debug tracing
9345 SV* output = newSVpvs("\n");
9347 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9349 assert(! invlist_is_iterating(invlist));
9351 invlist_iterinit(invlist);
9352 while (invlist_iternext(invlist, &start, &end)) {
9353 if (end == UV_MAX) {
9354 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9356 else if (end != start) {
9357 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9361 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9369 #ifndef PERL_IN_XSUB_RE
9371 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9372 const char * const indent, SV* const invlist)
9374 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9375 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9376 * the string 'indent'. The output looks like this:
9377 [0] 0x000A .. 0x000D
9379 [4] 0x2028 .. 0x2029
9380 [6] 0x3104 .. INFINITY
9381 * This means that the first range of code points matched by the list are
9382 * 0xA through 0xD; the second range contains only the single code point
9383 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9384 * are used to define each range (except if the final range extends to
9385 * infinity, only a single element is needed). The array index of the
9386 * first element for the corresponding range is given in brackets. */
9391 PERL_ARGS_ASSERT__INVLIST_DUMP;
9393 if (invlist_is_iterating(invlist)) {
9394 Perl_dump_indent(aTHX_ level, file,
9395 "%sCan't dump inversion list because is in middle of iterating\n",
9400 invlist_iterinit(invlist);
9401 while (invlist_iternext(invlist, &start, &end)) {
9402 if (end == UV_MAX) {
9403 Perl_dump_indent(aTHX_ level, file,
9404 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9405 indent, (UV)count, start);
9407 else if (end != start) {
9408 Perl_dump_indent(aTHX_ level, file,
9409 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9410 indent, (UV)count, start, end);
9413 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9414 indent, (UV)count, start);
9421 Perl__load_PL_utf8_foldclosures (pTHX)
9423 assert(! PL_utf8_foldclosures);
9425 /* If the folds haven't been read in, call a fold function
9427 if (! PL_utf8_tofold) {
9428 U8 dummy[UTF8_MAXBYTES_CASE+1];
9430 /* This string is just a short named one above \xff */
9431 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9432 assert(PL_utf8_tofold); /* Verify that worked */
9434 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9438 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9440 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9442 /* Return a boolean as to if the two passed in inversion lists are
9443 * identical. The final argument, if TRUE, says to take the complement of
9444 * the second inversion list before doing the comparison */
9446 const UV* array_a = invlist_array(a);
9447 const UV* array_b = invlist_array(b);
9448 UV len_a = _invlist_len(a);
9449 UV len_b = _invlist_len(b);
9451 UV i = 0; /* current index into the arrays */
9452 bool retval = TRUE; /* Assume are identical until proven otherwise */
9454 PERL_ARGS_ASSERT__INVLISTEQ;
9456 /* If are to compare 'a' with the complement of b, set it
9457 * up so are looking at b's complement. */
9460 /* The complement of nothing is everything, so <a> would have to have
9461 * just one element, starting at zero (ending at infinity) */
9463 return (len_a == 1 && array_a[0] == 0);
9465 else if (array_b[0] == 0) {
9467 /* Otherwise, to complement, we invert. Here, the first element is
9468 * 0, just remove it. To do this, we just pretend the array starts
9476 /* But if the first element is not zero, we pretend the list starts
9477 * at the 0 that is always stored immediately before the array. */
9483 /* Make sure that the lengths are the same, as well as the final element
9484 * before looping through the remainder. (Thus we test the length, final,
9485 * and first elements right off the bat) */
9486 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9489 else for (i = 0; i < len_a - 1; i++) {
9490 if (array_a[i] != array_b[i]) {
9500 #undef HEADER_LENGTH
9501 #undef TO_INTERNAL_SIZE
9502 #undef FROM_INTERNAL_SIZE
9503 #undef INVLIST_VERSION_ID
9505 /* End of inversion list object */
9508 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9510 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9511 * constructs, and updates RExC_flags with them. On input, RExC_parse
9512 * should point to the first flag; it is updated on output to point to the
9513 * final ')' or ':'. There needs to be at least one flag, or this will
9516 /* for (?g), (?gc), and (?o) warnings; warning
9517 about (?c) will warn about (?g) -- japhy */
9519 #define WASTED_O 0x01
9520 #define WASTED_G 0x02
9521 #define WASTED_C 0x04
9522 #define WASTED_GC (WASTED_G|WASTED_C)
9523 I32 wastedflags = 0x00;
9524 U32 posflags = 0, negflags = 0;
9525 U32 *flagsp = &posflags;
9526 char has_charset_modifier = '\0';
9528 bool has_use_defaults = FALSE;
9529 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9530 int x_mod_count = 0;
9532 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9534 /* '^' as an initial flag sets certain defaults */
9535 if (UCHARAT(RExC_parse) == '^') {
9537 has_use_defaults = TRUE;
9538 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9539 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9540 ? REGEX_UNICODE_CHARSET
9541 : REGEX_DEPENDS_CHARSET);
9544 cs = get_regex_charset(RExC_flags);
9545 if (cs == REGEX_DEPENDS_CHARSET
9546 && (RExC_utf8 || RExC_uni_semantics))
9548 cs = REGEX_UNICODE_CHARSET;
9551 while (*RExC_parse) {
9552 /* && strchr("iogcmsx", *RExC_parse) */
9553 /* (?g), (?gc) and (?o) are useless here
9554 and must be globally applied -- japhy */
9555 switch (*RExC_parse) {
9557 /* Code for the imsx flags */
9558 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9560 case LOCALE_PAT_MOD:
9561 if (has_charset_modifier) {
9562 goto excess_modifier;
9564 else if (flagsp == &negflags) {
9567 cs = REGEX_LOCALE_CHARSET;
9568 has_charset_modifier = LOCALE_PAT_MOD;
9570 case UNICODE_PAT_MOD:
9571 if (has_charset_modifier) {
9572 goto excess_modifier;
9574 else if (flagsp == &negflags) {
9577 cs = REGEX_UNICODE_CHARSET;
9578 has_charset_modifier = UNICODE_PAT_MOD;
9580 case ASCII_RESTRICT_PAT_MOD:
9581 if (flagsp == &negflags) {
9584 if (has_charset_modifier) {
9585 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9586 goto excess_modifier;
9588 /* Doubled modifier implies more restricted */
9589 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9592 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9594 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9596 case DEPENDS_PAT_MOD:
9597 if (has_use_defaults) {
9598 goto fail_modifiers;
9600 else if (flagsp == &negflags) {
9603 else if (has_charset_modifier) {
9604 goto excess_modifier;
9607 /* The dual charset means unicode semantics if the
9608 * pattern (or target, not known until runtime) are
9609 * utf8, or something in the pattern indicates unicode
9611 cs = (RExC_utf8 || RExC_uni_semantics)
9612 ? REGEX_UNICODE_CHARSET
9613 : REGEX_DEPENDS_CHARSET;
9614 has_charset_modifier = DEPENDS_PAT_MOD;
9618 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9619 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9621 else if (has_charset_modifier == *(RExC_parse - 1)) {
9622 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9626 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9631 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9634 case ONCE_PAT_MOD: /* 'o' */
9635 case GLOBAL_PAT_MOD: /* 'g' */
9636 if (PASS2 && ckWARN(WARN_REGEXP)) {
9637 const I32 wflagbit = *RExC_parse == 'o'
9640 if (! (wastedflags & wflagbit) ) {
9641 wastedflags |= wflagbit;
9642 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9645 "Useless (%s%c) - %suse /%c modifier",
9646 flagsp == &negflags ? "?-" : "?",
9648 flagsp == &negflags ? "don't " : "",
9655 case CONTINUE_PAT_MOD: /* 'c' */
9656 if (PASS2 && ckWARN(WARN_REGEXP)) {
9657 if (! (wastedflags & WASTED_C) ) {
9658 wastedflags |= WASTED_GC;
9659 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9662 "Useless (%sc) - %suse /gc modifier",
9663 flagsp == &negflags ? "?-" : "?",
9664 flagsp == &negflags ? "don't " : ""
9669 case KEEPCOPY_PAT_MOD: /* 'p' */
9670 if (flagsp == &negflags) {
9672 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9674 *flagsp |= RXf_PMf_KEEPCOPY;
9678 /* A flag is a default iff it is following a minus, so
9679 * if there is a minus, it means will be trying to
9680 * re-specify a default which is an error */
9681 if (has_use_defaults || flagsp == &negflags) {
9682 goto fail_modifiers;
9685 wastedflags = 0; /* reset so (?g-c) warns twice */
9689 RExC_flags |= posflags;
9690 RExC_flags &= ~negflags;
9691 set_regex_charset(&RExC_flags, cs);
9692 if (RExC_flags & RXf_PMf_FOLD) {
9693 RExC_contains_i = 1;
9696 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9702 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9703 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9704 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9705 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9713 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9718 - reg - regular expression, i.e. main body or parenthesized thing
9720 * Caller must absorb opening parenthesis.
9722 * Combining parenthesis handling with the base level of regular expression
9723 * is a trifle forced, but the need to tie the tails of the branches to what
9724 * follows makes it hard to avoid.
9726 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9728 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9730 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9733 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9734 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9735 needs to be restarted.
9736 Otherwise would only return NULL if regbranch() returns NULL, which
9739 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9740 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9741 * 2 is like 1, but indicates that nextchar() has been called to advance
9742 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9743 * this flag alerts us to the need to check for that */
9745 regnode *ret; /* Will be the head of the group. */
9748 regnode *ender = NULL;
9751 U32 oregflags = RExC_flags;
9752 bool have_branch = 0;
9754 I32 freeze_paren = 0;
9755 I32 after_freeze = 0;
9756 I32 num; /* numeric backreferences */
9758 char * parse_start = RExC_parse; /* MJD */
9759 char * const oregcomp_parse = RExC_parse;
9761 GET_RE_DEBUG_FLAGS_DECL;
9763 PERL_ARGS_ASSERT_REG;
9764 DEBUG_PARSE("reg ");
9766 *flagp = 0; /* Tentatively. */
9769 /* Make an OPEN node, if parenthesized. */
9772 /* Under /x, space and comments can be gobbled up between the '(' and
9773 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9774 * intervening space, as the sequence is a token, and a token should be
9776 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9778 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9779 char *start_verb = RExC_parse;
9780 STRLEN verb_len = 0;
9781 char *start_arg = NULL;
9782 unsigned char op = 0;
9784 int internal_argval = 0; /* internal_argval is only useful if
9787 if (has_intervening_patws) {
9789 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9791 while ( *RExC_parse && *RExC_parse != ')' ) {
9792 if ( *RExC_parse == ':' ) {
9793 start_arg = RExC_parse + 1;
9799 verb_len = RExC_parse - start_verb;
9802 while ( *RExC_parse && *RExC_parse != ')' )
9804 if ( *RExC_parse != ')' )
9805 vFAIL("Unterminated verb pattern argument");
9806 if ( RExC_parse == start_arg )
9809 if ( *RExC_parse != ')' )
9810 vFAIL("Unterminated verb pattern");
9813 switch ( *start_verb ) {
9814 case 'A': /* (*ACCEPT) */
9815 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9817 internal_argval = RExC_nestroot;
9820 case 'C': /* (*COMMIT) */
9821 if ( memEQs(start_verb,verb_len,"COMMIT") )
9824 case 'F': /* (*FAIL) */
9825 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9830 case ':': /* (*:NAME) */
9831 case 'M': /* (*MARK:NAME) */
9832 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9837 case 'P': /* (*PRUNE) */
9838 if ( memEQs(start_verb,verb_len,"PRUNE") )
9841 case 'S': /* (*SKIP) */
9842 if ( memEQs(start_verb,verb_len,"SKIP") )
9845 case 'T': /* (*THEN) */
9846 /* [19:06] <TimToady> :: is then */
9847 if ( memEQs(start_verb,verb_len,"THEN") ) {
9849 RExC_seen |= REG_CUTGROUP_SEEN;
9854 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9856 "Unknown verb pattern '%"UTF8f"'",
9857 UTF8fARG(UTF, verb_len, start_verb));
9860 if ( start_arg && internal_argval ) {
9861 vFAIL3("Verb pattern '%.*s' may not have an argument",
9862 verb_len, start_verb);
9863 } else if ( argok < 0 && !start_arg ) {
9864 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9865 verb_len, start_verb);
9867 ret = reganode(pRExC_state, op, internal_argval);
9868 if ( ! internal_argval && ! SIZE_ONLY ) {
9870 SV *sv = newSVpvn( start_arg,
9871 RExC_parse - start_arg);
9872 ARG(ret) = add_data( pRExC_state,
9874 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9881 if (!internal_argval)
9882 RExC_seen |= REG_VERBARG_SEEN;
9883 } else if ( start_arg ) {
9884 vFAIL3("Verb pattern '%.*s' may not have an argument",
9885 verb_len, start_verb);
9887 ret = reg_node(pRExC_state, op);
9889 nextchar(pRExC_state);
9892 else if (*RExC_parse == '?') { /* (?...) */
9893 bool is_logical = 0;
9894 const char * const seqstart = RExC_parse;
9895 const char * endptr;
9896 if (has_intervening_patws) {
9898 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9902 paren = *RExC_parse++;
9903 ret = NULL; /* For look-ahead/behind. */
9906 case 'P': /* (?P...) variants for those used to PCRE/Python */
9907 paren = *RExC_parse++;
9908 if ( paren == '<') /* (?P<...>) named capture */
9910 else if (paren == '>') { /* (?P>name) named recursion */
9911 goto named_recursion;
9913 else if (paren == '=') { /* (?P=...) named backref */
9914 /* this pretty much dupes the code for \k<NAME> in
9915 * regatom(), if you change this make sure you change that
9917 char* name_start = RExC_parse;
9919 SV *sv_dat = reg_scan_name(pRExC_state,
9920 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9921 if (RExC_parse == name_start || *RExC_parse != ')')
9922 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9923 vFAIL2("Sequence %.3s... not terminated",parse_start);
9926 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9927 RExC_rxi->data->data[num]=(void*)sv_dat;
9928 SvREFCNT_inc_simple_void(sv_dat);
9931 ret = reganode(pRExC_state,
9934 : (ASCII_FOLD_RESTRICTED)
9936 : (AT_LEAST_UNI_SEMANTICS)
9944 Set_Node_Offset(ret, parse_start+1);
9945 Set_Node_Cur_Length(ret, parse_start);
9947 nextchar(pRExC_state);
9951 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9952 vFAIL3("Sequence (%.*s...) not recognized",
9953 RExC_parse-seqstart, seqstart);
9955 case '<': /* (?<...) */
9956 if (*RExC_parse == '!')
9958 else if (*RExC_parse != '=')
9964 case '\'': /* (?'...') */
9965 name_start= RExC_parse;
9966 svname = reg_scan_name(pRExC_state,
9967 SIZE_ONLY /* reverse test from the others */
9968 ? REG_RSN_RETURN_NAME
9969 : REG_RSN_RETURN_NULL);
9970 if (RExC_parse == name_start || *RExC_parse != paren)
9971 vFAIL2("Sequence (?%c... not terminated",
9972 paren=='>' ? '<' : paren);
9976 if (!svname) /* shouldn't happen */
9978 "panic: reg_scan_name returned NULL");
9979 if (!RExC_paren_names) {
9980 RExC_paren_names= newHV();
9981 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9983 RExC_paren_name_list= newAV();
9984 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9987 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9989 sv_dat = HeVAL(he_str);
9991 /* croak baby croak */
9993 "panic: paren_name hash element allocation failed");
9994 } else if ( SvPOK(sv_dat) ) {
9995 /* (?|...) can mean we have dupes so scan to check
9996 its already been stored. Maybe a flag indicating
9997 we are inside such a construct would be useful,
9998 but the arrays are likely to be quite small, so
9999 for now we punt -- dmq */
10000 IV count = SvIV(sv_dat);
10001 I32 *pv = (I32*)SvPVX(sv_dat);
10003 for ( i = 0 ; i < count ; i++ ) {
10004 if ( pv[i] == RExC_npar ) {
10010 pv = (I32*)SvGROW(sv_dat,
10011 SvCUR(sv_dat) + sizeof(I32)+1);
10012 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10013 pv[count] = RExC_npar;
10014 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10017 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10018 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10021 SvIV_set(sv_dat, 1);
10024 /* Yes this does cause a memory leak in debugging Perls
10026 if (!av_store(RExC_paren_name_list,
10027 RExC_npar, SvREFCNT_inc(svname)))
10028 SvREFCNT_dec_NN(svname);
10031 /*sv_dump(sv_dat);*/
10033 nextchar(pRExC_state);
10035 goto capturing_parens;
10037 RExC_seen |= REG_LOOKBEHIND_SEEN;
10038 RExC_in_lookbehind++;
10041 case '=': /* (?=...) */
10042 RExC_seen_zerolen++;
10044 case '!': /* (?!...) */
10045 RExC_seen_zerolen++;
10046 if (*RExC_parse == ')') {
10047 ret=reg_node(pRExC_state, OPFAIL);
10048 nextchar(pRExC_state);
10052 case '|': /* (?|...) */
10053 /* branch reset, behave like a (?:...) except that
10054 buffers in alternations share the same numbers */
10056 after_freeze = freeze_paren = RExC_npar;
10058 case ':': /* (?:...) */
10059 case '>': /* (?>...) */
10061 case '$': /* (?$...) */
10062 case '@': /* (?@...) */
10063 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10065 case '0' : /* (?0) */
10066 case 'R' : /* (?R) */
10067 if (*RExC_parse != ')')
10068 FAIL("Sequence (?R) not terminated");
10069 ret = reg_node(pRExC_state, GOSTART);
10070 RExC_seen |= REG_GOSTART_SEEN;
10071 *flagp |= POSTPONED;
10072 nextchar(pRExC_state);
10075 /* named and numeric backreferences */
10076 case '&': /* (?&NAME) */
10077 parse_start = RExC_parse - 1;
10080 SV *sv_dat = reg_scan_name(pRExC_state,
10081 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10082 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10084 if (RExC_parse == RExC_end || *RExC_parse != ')')
10085 vFAIL("Sequence (?&... not terminated");
10086 goto gen_recurse_regop;
10087 assert(0); /* NOT REACHED */
10089 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10091 vFAIL("Illegal pattern");
10093 goto parse_recursion;
10095 case '-': /* (?-1) */
10096 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10097 RExC_parse--; /* rewind to let it be handled later */
10101 case '1': case '2': case '3': case '4': /* (?1) */
10102 case '5': case '6': case '7': case '8': case '9':
10106 bool is_neg = FALSE;
10107 parse_start = RExC_parse - 1; /* MJD */
10108 if (*RExC_parse == '-') {
10112 num = grok_atou(RExC_parse, &endptr);
10114 RExC_parse = (char*)endptr;
10116 /* Some limit for num? */
10120 if (*RExC_parse!=')')
10121 vFAIL("Expecting close bracket");
10124 if ( paren == '-' ) {
10126 Diagram of capture buffer numbering.
10127 Top line is the normal capture buffer numbers
10128 Bottom line is the negative indexing as from
10132 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10136 num = RExC_npar + num;
10139 vFAIL("Reference to nonexistent group");
10141 } else if ( paren == '+' ) {
10142 num = RExC_npar + num - 1;
10145 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10147 if (num > (I32)RExC_rx->nparens) {
10149 vFAIL("Reference to nonexistent group");
10151 RExC_recurse_count++;
10152 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10153 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10154 22, "| |", (int)(depth * 2 + 1), "",
10155 (UV)ARG(ret), (IV)ARG2L(ret)));
10157 RExC_seen |= REG_RECURSE_SEEN;
10158 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10159 Set_Node_Offset(ret, parse_start); /* MJD */
10161 *flagp |= POSTPONED;
10162 nextchar(pRExC_state);
10165 assert(0); /* NOT REACHED */
10167 case '?': /* (??...) */
10169 if (*RExC_parse != '{') {
10171 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10173 "Sequence (%"UTF8f"...) not recognized",
10174 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10177 *flagp |= POSTPONED;
10178 paren = *RExC_parse++;
10180 case '{': /* (?{...}) */
10183 struct reg_code_block *cb;
10185 RExC_seen_zerolen++;
10187 if ( !pRExC_state->num_code_blocks
10188 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10189 || pRExC_state->code_blocks[pRExC_state->code_index].start
10190 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10193 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10194 FAIL("panic: Sequence (?{...}): no code block found\n");
10195 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10197 /* this is a pre-compiled code block (?{...}) */
10198 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10199 RExC_parse = RExC_start + cb->end;
10202 if (cb->src_regex) {
10203 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10204 RExC_rxi->data->data[n] =
10205 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10206 RExC_rxi->data->data[n+1] = (void*)o;
10209 n = add_data(pRExC_state,
10210 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10211 RExC_rxi->data->data[n] = (void*)o;
10214 pRExC_state->code_index++;
10215 nextchar(pRExC_state);
10219 ret = reg_node(pRExC_state, LOGICAL);
10221 eval = reg2Lanode(pRExC_state, EVAL,
10224 /* for later propagation into (??{})
10226 RExC_flags & RXf_PMf_COMPILETIME
10231 REGTAIL(pRExC_state, ret, eval);
10232 /* deal with the length of this later - MJD */
10235 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10236 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10237 Set_Node_Offset(ret, parse_start);
10240 case '(': /* (?(?{...})...) and (?(?=...)...) */
10243 const int DEFINE_len = sizeof("DEFINE") - 1;
10244 if (RExC_parse[0] == '?') { /* (?(?...)) */
10245 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10246 || RExC_parse[1] == '<'
10247 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10251 ret = reg_node(pRExC_state, LOGICAL);
10255 tail = reg(pRExC_state, 1, &flag, depth+1);
10256 if (flag & RESTART_UTF8) {
10257 *flagp = RESTART_UTF8;
10260 REGTAIL(pRExC_state, ret, tail);
10263 /* Fall through to ‘Unknown switch condition’ at the
10264 end of the if/else chain. */
10266 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10267 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10269 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10270 char *name_start= RExC_parse++;
10272 SV *sv_dat=reg_scan_name(pRExC_state,
10273 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10274 if (RExC_parse == name_start || *RExC_parse != ch)
10275 vFAIL2("Sequence (?(%c... not terminated",
10276 (ch == '>' ? '<' : ch));
10279 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10280 RExC_rxi->data->data[num]=(void*)sv_dat;
10281 SvREFCNT_inc_simple_void(sv_dat);
10283 ret = reganode(pRExC_state,NGROUPP,num);
10284 goto insert_if_check_paren;
10286 else if (strnEQ(RExC_parse, "DEFINE",
10287 MIN(DEFINE_len, RExC_end - RExC_parse)))
10289 ret = reganode(pRExC_state,DEFINEP,0);
10290 RExC_parse += DEFINE_len;
10292 goto insert_if_check_paren;
10294 else if (RExC_parse[0] == 'R') {
10297 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10298 parno = grok_atou(RExC_parse, &endptr);
10300 RExC_parse = (char*)endptr;
10301 } else if (RExC_parse[0] == '&') {
10304 sv_dat = reg_scan_name(pRExC_state,
10306 ? REG_RSN_RETURN_NULL
10307 : REG_RSN_RETURN_DATA);
10308 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10310 ret = reganode(pRExC_state,INSUBP,parno);
10311 goto insert_if_check_paren;
10313 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10317 parno = grok_atou(RExC_parse, &endptr);
10319 RExC_parse = (char*)endptr;
10320 ret = reganode(pRExC_state, GROUPP, parno);
10322 insert_if_check_paren:
10323 if (*(tmp = nextchar(pRExC_state)) != ')') {
10324 /* nextchar also skips comments, so undo its work
10325 * and skip over the the next character.
10328 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10329 vFAIL("Switch condition not recognized");
10332 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10333 br = regbranch(pRExC_state, &flags, 1,depth+1);
10335 if (flags & RESTART_UTF8) {
10336 *flagp = RESTART_UTF8;
10339 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10342 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10344 c = *nextchar(pRExC_state);
10345 if (flags&HASWIDTH)
10346 *flagp |= HASWIDTH;
10349 vFAIL("(?(DEFINE)....) does not allow branches");
10351 /* Fake one for optimizer. */
10352 lastbr = reganode(pRExC_state, IFTHEN, 0);
10354 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10355 if (flags & RESTART_UTF8) {
10356 *flagp = RESTART_UTF8;
10359 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10362 REGTAIL(pRExC_state, ret, lastbr);
10363 if (flags&HASWIDTH)
10364 *flagp |= HASWIDTH;
10365 c = *nextchar(pRExC_state);
10370 if (RExC_parse>RExC_end)
10371 vFAIL("Switch (?(condition)... not terminated");
10373 vFAIL("Switch (?(condition)... contains too many branches");
10375 ender = reg_node(pRExC_state, TAIL);
10376 REGTAIL(pRExC_state, br, ender);
10378 REGTAIL(pRExC_state, lastbr, ender);
10379 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10382 REGTAIL(pRExC_state, ret, ender);
10383 RExC_size++; /* XXX WHY do we need this?!!
10384 For large programs it seems to be required
10385 but I can't figure out why. -- dmq*/
10388 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10389 vFAIL("Unknown switch condition (?(...))");
10391 case '[': /* (?[ ... ]) */
10392 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10395 RExC_parse--; /* for vFAIL to print correctly */
10396 vFAIL("Sequence (? incomplete");
10398 default: /* e.g., (?i) */
10401 parse_lparen_question_flags(pRExC_state);
10402 if (UCHARAT(RExC_parse) != ':') {
10403 nextchar(pRExC_state);
10408 nextchar(pRExC_state);
10418 ret = reganode(pRExC_state, OPEN, parno);
10420 if (!RExC_nestroot)
10421 RExC_nestroot = parno;
10422 if (RExC_seen & REG_RECURSE_SEEN
10423 && !RExC_open_parens[parno-1])
10425 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10426 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10427 22, "| |", (int)(depth * 2 + 1), "",
10428 (IV)parno, REG_NODE_NUM(ret)));
10429 RExC_open_parens[parno-1]= ret;
10432 Set_Node_Length(ret, 1); /* MJD */
10433 Set_Node_Offset(ret, RExC_parse); /* MJD */
10441 /* Pick up the branches, linking them together. */
10442 parse_start = RExC_parse; /* MJD */
10443 br = regbranch(pRExC_state, &flags, 1,depth+1);
10445 /* branch_len = (paren != 0); */
10448 if (flags & RESTART_UTF8) {
10449 *flagp = RESTART_UTF8;
10452 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10454 if (*RExC_parse == '|') {
10455 if (!SIZE_ONLY && RExC_extralen) {
10456 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10459 reginsert(pRExC_state, BRANCH, br, depth+1);
10460 Set_Node_Length(br, paren != 0);
10461 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10465 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10467 else if (paren == ':') {
10468 *flagp |= flags&SIMPLE;
10470 if (is_open) { /* Starts with OPEN. */
10471 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10473 else if (paren != '?') /* Not Conditional */
10475 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10477 while (*RExC_parse == '|') {
10478 if (!SIZE_ONLY && RExC_extralen) {
10479 ender = reganode(pRExC_state, LONGJMP,0);
10481 /* Append to the previous. */
10482 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10485 RExC_extralen += 2; /* Account for LONGJMP. */
10486 nextchar(pRExC_state);
10487 if (freeze_paren) {
10488 if (RExC_npar > after_freeze)
10489 after_freeze = RExC_npar;
10490 RExC_npar = freeze_paren;
10492 br = regbranch(pRExC_state, &flags, 0, depth+1);
10495 if (flags & RESTART_UTF8) {
10496 *flagp = RESTART_UTF8;
10499 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10501 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10503 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10506 if (have_branch || paren != ':') {
10507 /* Make a closing node, and hook it on the end. */
10510 ender = reg_node(pRExC_state, TAIL);
10513 ender = reganode(pRExC_state, CLOSE, parno);
10514 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10515 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10516 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10517 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10518 RExC_close_parens[parno-1]= ender;
10519 if (RExC_nestroot == parno)
10522 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10523 Set_Node_Length(ender,1); /* MJD */
10529 *flagp &= ~HASWIDTH;
10532 ender = reg_node(pRExC_state, SUCCEED);
10535 ender = reg_node(pRExC_state, END);
10537 assert(!RExC_opend); /* there can only be one! */
10538 RExC_opend = ender;
10542 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10543 DEBUG_PARSE_MSG("lsbr");
10544 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10545 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10546 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10547 SvPV_nolen_const(RExC_mysv1),
10548 (IV)REG_NODE_NUM(lastbr),
10549 SvPV_nolen_const(RExC_mysv2),
10550 (IV)REG_NODE_NUM(ender),
10551 (IV)(ender - lastbr)
10554 REGTAIL(pRExC_state, lastbr, ender);
10556 if (have_branch && !SIZE_ONLY) {
10557 char is_nothing= 1;
10559 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10561 /* Hook the tails of the branches to the closing node. */
10562 for (br = ret; br; br = regnext(br)) {
10563 const U8 op = PL_regkind[OP(br)];
10564 if (op == BRANCH) {
10565 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10566 if ( OP(NEXTOPER(br)) != NOTHING
10567 || regnext(NEXTOPER(br)) != ender)
10570 else if (op == BRANCHJ) {
10571 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10572 /* for now we always disable this optimisation * /
10573 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10574 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10580 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10581 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10582 DEBUG_PARSE_MSG("NADA");
10583 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10584 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10585 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10586 SvPV_nolen_const(RExC_mysv1),
10587 (IV)REG_NODE_NUM(ret),
10588 SvPV_nolen_const(RExC_mysv2),
10589 (IV)REG_NODE_NUM(ender),
10594 if (OP(ender) == TAIL) {
10599 for ( opt= br + 1; opt < ender ; opt++ )
10600 OP(opt)= OPTIMIZED;
10601 NEXT_OFF(br)= ender - br;
10609 static const char parens[] = "=!<,>";
10611 if (paren && (p = strchr(parens, paren))) {
10612 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10613 int flag = (p - parens) > 1;
10616 node = SUSPEND, flag = 0;
10617 reginsert(pRExC_state, node,ret, depth+1);
10618 Set_Node_Cur_Length(ret, parse_start);
10619 Set_Node_Offset(ret, parse_start + 1);
10621 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10625 /* Check for proper termination. */
10627 /* restore original flags, but keep (?p) */
10628 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10629 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10630 RExC_parse = oregcomp_parse;
10631 vFAIL("Unmatched (");
10634 else if (!paren && RExC_parse < RExC_end) {
10635 if (*RExC_parse == ')') {
10637 vFAIL("Unmatched )");
10640 FAIL("Junk on end of regexp"); /* "Can't happen". */
10641 assert(0); /* NOTREACHED */
10644 if (RExC_in_lookbehind) {
10645 RExC_in_lookbehind--;
10647 if (after_freeze > RExC_npar)
10648 RExC_npar = after_freeze;
10653 - regbranch - one alternative of an | operator
10655 * Implements the concatenation operator.
10657 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10661 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10664 regnode *chain = NULL;
10666 I32 flags = 0, c = 0;
10667 GET_RE_DEBUG_FLAGS_DECL;
10669 PERL_ARGS_ASSERT_REGBRANCH;
10671 DEBUG_PARSE("brnc");
10676 if (!SIZE_ONLY && RExC_extralen)
10677 ret = reganode(pRExC_state, BRANCHJ,0);
10679 ret = reg_node(pRExC_state, BRANCH);
10680 Set_Node_Length(ret, 1);
10684 if (!first && SIZE_ONLY)
10685 RExC_extralen += 1; /* BRANCHJ */
10687 *flagp = WORST; /* Tentatively. */
10690 nextchar(pRExC_state);
10691 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10692 flags &= ~TRYAGAIN;
10693 latest = regpiece(pRExC_state, &flags,depth+1);
10694 if (latest == NULL) {
10695 if (flags & TRYAGAIN)
10697 if (flags & RESTART_UTF8) {
10698 *flagp = RESTART_UTF8;
10701 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10703 else if (ret == NULL)
10705 *flagp |= flags&(HASWIDTH|POSTPONED);
10706 if (chain == NULL) /* First piece. */
10707 *flagp |= flags&SPSTART;
10710 REGTAIL(pRExC_state, chain, latest);
10715 if (chain == NULL) { /* Loop ran zero times. */
10716 chain = reg_node(pRExC_state, NOTHING);
10721 *flagp |= flags&SIMPLE;
10728 - regpiece - something followed by possible [*+?]
10730 * Note that the branching code sequences used for ? and the general cases
10731 * of * and + are somewhat optimized: they use the same NOTHING node as
10732 * both the endmarker for their branch list and the body of the last branch.
10733 * It might seem that this node could be dispensed with entirely, but the
10734 * endmarker role is not redundant.
10736 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10738 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10742 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10748 const char * const origparse = RExC_parse;
10750 I32 max = REG_INFTY;
10751 #ifdef RE_TRACK_PATTERN_OFFSETS
10754 const char *maxpos = NULL;
10756 /* Save the original in case we change the emitted regop to a FAIL. */
10757 regnode * const orig_emit = RExC_emit;
10759 GET_RE_DEBUG_FLAGS_DECL;
10761 PERL_ARGS_ASSERT_REGPIECE;
10763 DEBUG_PARSE("piec");
10765 ret = regatom(pRExC_state, &flags,depth+1);
10767 if (flags & (TRYAGAIN|RESTART_UTF8))
10768 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10770 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10776 if (op == '{' && regcurly(RExC_parse)) {
10778 #ifdef RE_TRACK_PATTERN_OFFSETS
10779 parse_start = RExC_parse; /* MJD */
10781 next = RExC_parse + 1;
10782 while (isDIGIT(*next) || *next == ',') {
10783 if (*next == ',') {
10791 if (*next == '}') { /* got one */
10792 const char* endptr;
10796 min = grok_atou(RExC_parse, &endptr);
10797 if (*maxpos == ',')
10800 maxpos = RExC_parse;
10801 max = grok_atou(maxpos, &endptr);
10802 if (!max && *maxpos != '0')
10803 max = REG_INFTY; /* meaning "infinity" */
10804 else if (max >= REG_INFTY)
10805 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10807 nextchar(pRExC_state);
10808 if (max < min) { /* If can't match, warn and optimize to fail
10812 /* We can't back off the size because we have to reserve
10813 * enough space for all the things we are about to throw
10814 * away, but we can shrink it by the ammount we are about
10815 * to re-use here */
10816 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10819 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10820 RExC_emit = orig_emit;
10822 ret = reg_node(pRExC_state, OPFAIL);
10825 else if (min == max
10826 && RExC_parse < RExC_end
10827 && (*RExC_parse == '?' || *RExC_parse == '+'))
10830 ckWARN2reg(RExC_parse + 1,
10831 "Useless use of greediness modifier '%c'",
10834 /* Absorb the modifier, so later code doesn't see nor use
10836 nextchar(pRExC_state);
10840 if ((flags&SIMPLE)) {
10841 RExC_naughty += 2 + RExC_naughty / 2;
10842 reginsert(pRExC_state, CURLY, ret, depth+1);
10843 Set_Node_Offset(ret, parse_start+1); /* MJD */
10844 Set_Node_Cur_Length(ret, parse_start);
10847 regnode * const w = reg_node(pRExC_state, WHILEM);
10850 REGTAIL(pRExC_state, ret, w);
10851 if (!SIZE_ONLY && RExC_extralen) {
10852 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10853 reginsert(pRExC_state, NOTHING,ret, depth+1);
10854 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10856 reginsert(pRExC_state, CURLYX,ret, depth+1);
10858 Set_Node_Offset(ret, parse_start+1);
10859 Set_Node_Length(ret,
10860 op == '{' ? (RExC_parse - parse_start) : 1);
10862 if (!SIZE_ONLY && RExC_extralen)
10863 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10864 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10866 RExC_whilem_seen++, RExC_extralen += 3;
10867 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10874 *flagp |= HASWIDTH;
10876 ARG1_SET(ret, (U16)min);
10877 ARG2_SET(ret, (U16)max);
10879 if (max == REG_INFTY)
10880 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10886 if (!ISMULT1(op)) {
10891 #if 0 /* Now runtime fix should be reliable. */
10893 /* if this is reinstated, don't forget to put this back into perldiag:
10895 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10897 (F) The part of the regexp subject to either the * or + quantifier
10898 could match an empty string. The {#} shows in the regular
10899 expression about where the problem was discovered.
10903 if (!(flags&HASWIDTH) && op != '?')
10904 vFAIL("Regexp *+ operand could be empty");
10907 #ifdef RE_TRACK_PATTERN_OFFSETS
10908 parse_start = RExC_parse;
10910 nextchar(pRExC_state);
10912 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10914 if (op == '*' && (flags&SIMPLE)) {
10915 reginsert(pRExC_state, STAR, ret, depth+1);
10918 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10920 else if (op == '*') {
10924 else if (op == '+' && (flags&SIMPLE)) {
10925 reginsert(pRExC_state, PLUS, ret, depth+1);
10928 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10930 else if (op == '+') {
10934 else if (op == '?') {
10939 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10940 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10941 ckWARN2reg(RExC_parse,
10942 "%"UTF8f" matches null string many times",
10943 UTF8fARG(UTF, (RExC_parse >= origparse
10944 ? RExC_parse - origparse
10947 (void)ReREFCNT_inc(RExC_rx_sv);
10950 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10951 nextchar(pRExC_state);
10952 reginsert(pRExC_state, MINMOD, ret, depth+1);
10953 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10956 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10958 nextchar(pRExC_state);
10959 ender = reg_node(pRExC_state, SUCCEED);
10960 REGTAIL(pRExC_state, ret, ender);
10961 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10963 ender = reg_node(pRExC_state, TAIL);
10964 REGTAIL(pRExC_state, ret, ender);
10967 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10969 vFAIL("Nested quantifiers");
10976 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10977 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10981 /* This is expected to be called by a parser routine that has recognized '\N'
10982 and needs to handle the rest. RExC_parse is expected to point at the first
10983 char following the N at the time of the call. On successful return,
10984 RExC_parse has been updated to point to just after the sequence identified
10985 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10986 have been set appropriately.
10988 The typical case for this is \N{some character name}. This is usually
10989 called while parsing the input, filling in or ready to fill in an EXACTish
10990 node, and the code point for the character should be returned, so that it
10991 can be added to the node, and parsing continued with the next input
10992 character. But it may be that instead of a single character the \N{}
10993 expands to more than one, a named sequence. In this case any following
10994 quantifier applies to the whole sequence, and it is easier, given the code
10995 structure that calls this, to handle it from a different area of the code.
10996 For this reason, the input parameters can be set so that it returns valid
10997 only on one or the other of these cases.
10999 Another possibility is for the input to be an empty \N{}, which for
11000 backwards compatibility we accept, but generate a NOTHING node which should
11001 later get optimized out. This is handled from the area of code which can
11002 handle a named sequence, so if called with the parameters for the other, it
11005 Still another possibility is for the \N to mean [^\n], and not a single
11006 character or explicit sequence at all. This is determined by context.
11007 Again, this is handled from the area of code which can handle a named
11008 sequence, so if called with the parameters for the other, it also fails.
11010 And the final possibility is for the \N to be called from within a bracketed
11011 character class. In this case the [^\n] meaning makes no sense, and so is
11012 an error. Other anomalous situations are left to the calling code to handle.
11014 For non-single-quoted regexes, the tokenizer has attempted to decide which
11015 of the above applies, and in the case of a named sequence, has converted it
11016 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11017 where c1... are the characters in the sequence. For single-quoted regexes,
11018 the tokenizer passes the \N sequence through unchanged; this code will not
11019 attempt to determine this nor expand those, instead raising a syntax error.
11020 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11021 or there is no '}', it signals that this \N occurrence means to match a
11022 non-newline. (This mostly was done because of [perl #56444].)
11024 The API is somewhat convoluted due to historical and the above reasons.
11026 The function raises an error (via vFAIL), and doesn't return for various
11027 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
11028 it returns a count of how many characters were accounted for by it. (This
11029 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11030 points in the sequence. It sets <node_p>, <valuep>, and/or
11031 <substitute_parse> on success.
11033 If <valuep> is non-null, it means the caller can accept an input sequence
11034 consisting of a just a single code point; <*valuep> is set to the value
11035 of the only or first code point in the input.
11037 If <substitute_parse> is non-null, it means the caller can accept an input
11038 sequence consisting of one or more code points; <*substitute_parse> is a
11039 newly created mortal SV* in this case, containing \x{} escapes representing
11042 Both <valuep> and <substitute_parse> can be non-NULL.
11044 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
11045 that the caller can accept any legal sequence other than a single code
11046 point. To wit, <*node_p> is set as follows:
11047 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11048 2) \N{}: points to a new NOTHING node; return is 0
11049 3) otherwise: points to a new EXACT node containing the resolved
11050 string; return is the number of code points in the
11051 string. This will never be 1.
11052 Note that failure is returned for single code point sequences if <valuep> is
11053 null and <node_p> is not.
11056 char * endbrace; /* '}' following the name */
11058 char *endchar; /* Points to '.' or '}' ending cur char in the input
11060 bool has_multiple_chars; /* true if the input stream contains a sequence of
11061 more than one character */
11062 bool in_char_class = substitute_parse != NULL;
11063 STRLEN count = 0; /* Number of characters in this sequence */
11065 GET_RE_DEBUG_FLAGS_DECL;
11067 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11069 GET_RE_DEBUG_FLAGS;
11071 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
11072 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11074 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11075 * modifier. The other meaning does not, so use a temporary until we find
11076 * out which we are being called with */
11077 p = (RExC_flags & RXf_PMf_EXTENDED)
11078 ? regpatws(pRExC_state, RExC_parse,
11079 TRUE) /* means recognize comments */
11082 /* Disambiguate between \N meaning a named character versus \N meaning
11083 * [^\n]. The former is assumed when it can't be the latter. */
11084 if (*p != '{' || regcurly(p)) {
11087 /* no bare \N allowed in a charclass */
11088 if (in_char_class) {
11089 vFAIL("\\N in a character class must be a named character: \\N{...}");
11091 return (STRLEN) -1;
11093 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11095 nextchar(pRExC_state);
11096 *node_p = reg_node(pRExC_state, REG_ANY);
11097 *flagp |= HASWIDTH|SIMPLE;
11099 Set_Node_Length(*node_p, 1); /* MJD */
11103 /* Here, we have decided it should be a named character or sequence */
11105 /* The test above made sure that the next real character is a '{', but
11106 * under the /x modifier, it could be separated by space (or a comment and
11107 * \n) and this is not allowed (for consistency with \x{...} and the
11108 * tokenizer handling of \N{NAME}). */
11109 if (*RExC_parse != '{') {
11110 vFAIL("Missing braces on \\N{}");
11113 RExC_parse++; /* Skip past the '{' */
11115 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11116 || ! (endbrace == RExC_parse /* nothing between the {} */
11117 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
11119 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
11122 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11123 vFAIL("\\N{NAME} must be resolved by the lexer");
11126 if (endbrace == RExC_parse) { /* empty: \N{} */
11128 *node_p = reg_node(pRExC_state,NOTHING);
11130 else if (! in_char_class) {
11131 return (STRLEN) -1;
11133 nextchar(pRExC_state);
11137 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11138 RExC_parse += 2; /* Skip past the 'U+' */
11140 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11142 /* Code points are separated by dots. If none, there is only one code
11143 * point, and is terminated by the brace */
11144 has_multiple_chars = (endchar < endbrace);
11146 /* We get the first code point if we want it, and either there is only one,
11147 * or we can accept both cases of one and more than one */
11148 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11149 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11150 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11151 | PERL_SCAN_DISALLOW_PREFIX
11153 /* No errors in the first pass (See [perl
11154 * #122671].) We let the code below find the
11155 * errors when there are multiple chars. */
11156 | ((SIZE_ONLY || has_multiple_chars)
11157 ? PERL_SCAN_SILENT_ILLDIGIT
11160 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11162 /* The tokenizer should have guaranteed validity, but it's possible to
11163 * bypass it by using single quoting, so check. Don't do the check
11164 * here when there are multiple chars; we do it below anyway. */
11165 if (! has_multiple_chars) {
11166 if (length_of_hex == 0
11167 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11169 RExC_parse += length_of_hex; /* Includes all the valid */
11170 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11171 ? UTF8SKIP(RExC_parse)
11173 /* Guard against malformed utf8 */
11174 if (RExC_parse >= endchar) {
11175 RExC_parse = endchar;
11177 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11180 RExC_parse = endbrace + 1;
11185 /* Here, we should have already handled the case where a single character
11186 * is expected and found. So it is a failure if we aren't expecting
11187 * multiple chars and got them; or didn't get them but wanted them. We
11188 * fail without advancing the parse, so that the caller can try again with
11189 * different acceptance criteria */
11190 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11192 return (STRLEN) -1;
11197 /* What is done here is to convert this to a sub-pattern of the form
11198 * \x{char1}\x{char2}...
11199 * and then either return it in <*substitute_parse> if non-null; or
11200 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11201 * way, it retains its atomicness, while not having to worry about
11202 * special handling that some code points may have. toke.c has
11203 * converted the original Unicode values to native, so that we can just
11204 * pass on the hex values unchanged. We do have to set a flag to keep
11205 * recoding from happening in the recursion */
11209 char *orig_end = RExC_end;
11212 if (substitute_parse) {
11213 *substitute_parse = newSVpvs("");
11216 substitute_parse = &dummy;
11217 *substitute_parse = newSVpvs("?:");
11219 *substitute_parse = sv_2mortal(*substitute_parse);
11221 while (RExC_parse < endbrace) {
11223 /* Convert to notation the rest of the code understands */
11224 sv_catpv(*substitute_parse, "\\x{");
11225 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11226 sv_catpv(*substitute_parse, "}");
11228 /* Point to the beginning of the next character in the sequence. */
11229 RExC_parse = endchar + 1;
11230 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11234 if (! in_char_class) {
11235 sv_catpv(*substitute_parse, ")");
11238 RExC_parse = SvPV(*substitute_parse, len);
11240 /* Don't allow empty number */
11241 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11242 RExC_parse = endbrace;
11243 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11245 RExC_end = RExC_parse + len;
11247 /* The values are Unicode, and therefore not subject to recoding */
11248 RExC_override_recoding = 1;
11251 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11252 if (flags & RESTART_UTF8) {
11253 *flagp = RESTART_UTF8;
11254 return (STRLEN) -1;
11256 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11259 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11262 RExC_parse = endbrace;
11263 RExC_end = orig_end;
11264 RExC_override_recoding = 0;
11266 nextchar(pRExC_state);
11276 * It returns the code point in utf8 for the value in *encp.
11277 * value: a code value in the source encoding
11278 * encp: a pointer to an Encode object
11280 * If the result from Encode is not a single character,
11281 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11284 S_reg_recode(pTHX_ const char value, SV **encp)
11287 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11288 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11289 const STRLEN newlen = SvCUR(sv);
11290 UV uv = UNICODE_REPLACEMENT;
11292 PERL_ARGS_ASSERT_REG_RECODE;
11296 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11299 if (!newlen || numlen != newlen) {
11300 uv = UNICODE_REPLACEMENT;
11306 PERL_STATIC_INLINE U8
11307 S_compute_EXACTish(RExC_state_t *pRExC_state)
11311 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11317 op = get_regex_charset(RExC_flags);
11318 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11319 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11320 been, so there is no hole */
11323 return op + EXACTF;
11326 PERL_STATIC_INLINE void
11327 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11328 regnode *node, I32* flagp, STRLEN len, UV code_point,
11331 /* This knows the details about sizing an EXACTish node, setting flags for
11332 * it (by setting <*flagp>, and potentially populating it with a single
11335 * If <len> (the length in bytes) is non-zero, this function assumes that
11336 * the node has already been populated, and just does the sizing. In this
11337 * case <code_point> should be the final code point that has already been
11338 * placed into the node. This value will be ignored except that under some
11339 * circumstances <*flagp> is set based on it.
11341 * If <len> is zero, the function assumes that the node is to contain only
11342 * the single character given by <code_point> and calculates what <len>
11343 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11344 * additionally will populate the node's STRING with <code_point> or its
11347 * In both cases <*flagp> is appropriately set
11349 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11350 * 255, must be folded (the former only when the rules indicate it can
11353 * When it does the populating, it looks at the flag 'downgradable'. If
11354 * true with a node that folds, it checks if the single code point
11355 * participates in a fold, and if not downgrades the node to an EXACT.
11356 * This helps the optimizer */
11358 bool len_passed_in = cBOOL(len != 0);
11359 U8 character[UTF8_MAXBYTES_CASE+1];
11361 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11363 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11364 * sizing difference, and is extra work that is thrown away */
11365 if (downgradable && ! PASS2) {
11366 downgradable = FALSE;
11369 if (! len_passed_in) {
11371 if (UVCHR_IS_INVARIANT(code_point)) {
11372 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11373 *character = (U8) code_point;
11375 else { /* Here is /i and not /l. (toFOLD() is defined on just
11376 ASCII, which isn't the same thing as INVARIANT on
11377 EBCDIC, but it works there, as the extra invariants
11378 fold to themselves) */
11379 *character = toFOLD((U8) code_point);
11381 /* We can downgrade to an EXACT node if this character
11382 * isn't a folding one. Note that this assumes that
11383 * nothing above Latin1 folds to some other invariant than
11384 * one of these alphabetics; otherwise we would also have
11386 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11387 * || ASCII_FOLD_RESTRICTED))
11389 if (downgradable && PL_fold[code_point] == code_point) {
11395 else if (FOLD && (! LOC
11396 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11397 { /* Folding, and ok to do so now */
11398 UV folded = _to_uni_fold_flags(
11402 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11403 ? FOLD_FLAGS_NOMIX_ASCII
11406 && folded == code_point /* This quickly rules out many
11407 cases, avoiding the
11408 _invlist_contains_cp() overhead
11410 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11415 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11417 /* Not folding this cp, and can output it directly */
11418 *character = UTF8_TWO_BYTE_HI(code_point);
11419 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11423 uvchr_to_utf8( character, code_point);
11424 len = UTF8SKIP(character);
11426 } /* Else pattern isn't UTF8. */
11428 *character = (U8) code_point;
11430 } /* Else is folded non-UTF8 */
11431 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11433 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11434 * comments at join_exact()); */
11435 *character = (U8) code_point;
11438 /* Can turn into an EXACT node if we know the fold at compile time,
11439 * and it folds to itself and doesn't particpate in other folds */
11442 && PL_fold_latin1[code_point] == code_point
11443 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11444 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11448 } /* else is Sharp s. May need to fold it */
11449 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11451 *(character + 1) = 's';
11455 *character = LATIN_SMALL_LETTER_SHARP_S;
11461 RExC_size += STR_SZ(len);
11464 RExC_emit += STR_SZ(len);
11465 STR_LEN(node) = len;
11466 if (! len_passed_in) {
11467 Copy((char *) character, STRING(node), len, char);
11471 *flagp |= HASWIDTH;
11473 /* A single character node is SIMPLE, except for the special-cased SHARP S
11475 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11476 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11477 || ! FOLD || ! DEPENDS_SEMANTICS))
11482 /* The OP may not be well defined in PASS1 */
11483 if (PASS2 && OP(node) == EXACTFL) {
11484 RExC_contains_locale = 1;
11489 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11490 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11493 S_backref_value(char *p)
11495 const char* endptr;
11496 UV val = grok_atou(p, &endptr);
11497 if (endptr == p || endptr == NULL || val > I32_MAX)
11504 - regatom - the lowest level
11506 Try to identify anything special at the start of the pattern. If there
11507 is, then handle it as required. This may involve generating a single regop,
11508 such as for an assertion; or it may involve recursing, such as to
11509 handle a () structure.
11511 If the string doesn't start with something special then we gobble up
11512 as much literal text as we can.
11514 Once we have been able to handle whatever type of thing started the
11515 sequence, we return.
11517 Note: we have to be careful with escapes, as they can be both literal
11518 and special, and in the case of \10 and friends, context determines which.
11520 A summary of the code structure is:
11522 switch (first_byte) {
11523 cases for each special:
11524 handle this special;
11527 switch (2nd byte) {
11528 cases for each unambiguous special:
11529 handle this special;
11531 cases for each ambigous special/literal:
11533 if (special) handle here
11535 default: // unambiguously literal:
11538 default: // is a literal char
11541 create EXACTish node for literal;
11542 while (more input and node isn't full) {
11543 switch (input_byte) {
11544 cases for each special;
11545 make sure parse pointer is set so that the next call to
11546 regatom will see this special first
11547 goto loopdone; // EXACTish node terminated by prev. char
11549 append char to EXACTISH node;
11551 get next input byte;
11555 return the generated node;
11557 Specifically there are two separate switches for handling
11558 escape sequences, with the one for handling literal escapes requiring
11559 a dummy entry for all of the special escapes that are actually handled
11562 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11564 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11566 Otherwise does not return NULL.
11570 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11572 regnode *ret = NULL;
11574 char *parse_start = RExC_parse;
11579 GET_RE_DEBUG_FLAGS_DECL;
11581 *flagp = WORST; /* Tentatively. */
11583 DEBUG_PARSE("atom");
11585 PERL_ARGS_ASSERT_REGATOM;
11588 switch ((U8)*RExC_parse) {
11590 RExC_seen_zerolen++;
11591 nextchar(pRExC_state);
11592 if (RExC_flags & RXf_PMf_MULTILINE)
11593 ret = reg_node(pRExC_state, MBOL);
11595 ret = reg_node(pRExC_state, SBOL);
11596 Set_Node_Length(ret, 1); /* MJD */
11599 nextchar(pRExC_state);
11601 RExC_seen_zerolen++;
11602 if (RExC_flags & RXf_PMf_MULTILINE)
11603 ret = reg_node(pRExC_state, MEOL);
11605 ret = reg_node(pRExC_state, SEOL);
11606 Set_Node_Length(ret, 1); /* MJD */
11609 nextchar(pRExC_state);
11610 if (RExC_flags & RXf_PMf_SINGLELINE)
11611 ret = reg_node(pRExC_state, SANY);
11613 ret = reg_node(pRExC_state, REG_ANY);
11614 *flagp |= HASWIDTH|SIMPLE;
11616 Set_Node_Length(ret, 1); /* MJD */
11620 char * const oregcomp_parse = ++RExC_parse;
11621 ret = regclass(pRExC_state, flagp,depth+1,
11622 FALSE, /* means parse the whole char class */
11623 TRUE, /* allow multi-char folds */
11624 FALSE, /* don't silence non-portable warnings. */
11626 if (*RExC_parse != ']') {
11627 RExC_parse = oregcomp_parse;
11628 vFAIL("Unmatched [");
11631 if (*flagp & RESTART_UTF8)
11633 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11636 nextchar(pRExC_state);
11637 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11641 nextchar(pRExC_state);
11642 ret = reg(pRExC_state, 2, &flags,depth+1);
11644 if (flags & TRYAGAIN) {
11645 if (RExC_parse == RExC_end) {
11646 /* Make parent create an empty node if needed. */
11647 *flagp |= TRYAGAIN;
11652 if (flags & RESTART_UTF8) {
11653 *flagp = RESTART_UTF8;
11656 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11659 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11663 if (flags & TRYAGAIN) {
11664 *flagp |= TRYAGAIN;
11667 vFAIL("Internal urp");
11668 /* Supposed to be caught earlier. */
11674 vFAIL("Quantifier follows nothing");
11679 This switch handles escape sequences that resolve to some kind
11680 of special regop and not to literal text. Escape sequnces that
11681 resolve to literal text are handled below in the switch marked
11684 Every entry in this switch *must* have a corresponding entry
11685 in the literal escape switch. However, the opposite is not
11686 required, as the default for this switch is to jump to the
11687 literal text handling code.
11689 switch ((U8)*++RExC_parse) {
11690 /* Special Escapes */
11692 RExC_seen_zerolen++;
11693 ret = reg_node(pRExC_state, SBOL);
11694 /* SBOL is shared with /^/ so we set the flags so we can tell
11695 * /\A/ from /^/ in split. We check ret because first pass we
11696 * have no regop struct to set the flags on. */
11700 goto finish_meta_pat;
11702 ret = reg_node(pRExC_state, GPOS);
11703 RExC_seen |= REG_GPOS_SEEN;
11705 goto finish_meta_pat;
11707 RExC_seen_zerolen++;
11708 ret = reg_node(pRExC_state, KEEPS);
11710 /* XXX:dmq : disabling in-place substitution seems to
11711 * be necessary here to avoid cases of memory corruption, as
11712 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11714 RExC_seen |= REG_LOOKBEHIND_SEEN;
11715 goto finish_meta_pat;
11717 ret = reg_node(pRExC_state, SEOL);
11719 RExC_seen_zerolen++; /* Do not optimize RE away */
11720 goto finish_meta_pat;
11722 ret = reg_node(pRExC_state, EOS);
11724 RExC_seen_zerolen++; /* Do not optimize RE away */
11725 goto finish_meta_pat;
11727 ret = reg_node(pRExC_state, CANY);
11728 RExC_seen |= REG_CANY_SEEN;
11729 *flagp |= HASWIDTH|SIMPLE;
11731 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11733 goto finish_meta_pat;
11735 ret = reg_node(pRExC_state, CLUMP);
11736 *flagp |= HASWIDTH;
11737 goto finish_meta_pat;
11743 arg = ANYOF_WORDCHAR;
11747 RExC_seen_zerolen++;
11748 RExC_seen |= REG_LOOKBEHIND_SEEN;
11749 op = BOUND + get_regex_charset(RExC_flags);
11750 if (op > BOUNDA) { /* /aa is same as /a */
11753 else if (op == BOUNDL) {
11754 RExC_contains_locale = 1;
11756 ret = reg_node(pRExC_state, op);
11757 FLAGS(ret) = get_regex_charset(RExC_flags);
11759 if ((U8) *(RExC_parse + 1) == '{') {
11760 /* diag_listed_as: Use "%s" instead of "%s" */
11761 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11763 goto finish_meta_pat;
11765 RExC_seen_zerolen++;
11766 RExC_seen |= REG_LOOKBEHIND_SEEN;
11767 op = NBOUND + get_regex_charset(RExC_flags);
11768 if (op > NBOUNDA) { /* /aa is same as /a */
11771 else if (op == NBOUNDL) {
11772 RExC_contains_locale = 1;
11774 ret = reg_node(pRExC_state, op);
11775 FLAGS(ret) = get_regex_charset(RExC_flags);
11777 if ((U8) *(RExC_parse + 1) == '{') {
11778 /* diag_listed_as: Use "%s" instead of "%s" */
11779 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11781 goto finish_meta_pat;
11791 ret = reg_node(pRExC_state, LNBREAK);
11792 *flagp |= HASWIDTH|SIMPLE;
11793 goto finish_meta_pat;
11801 goto join_posix_op_known;
11807 arg = ANYOF_VERTWS;
11809 goto join_posix_op_known;
11819 op = POSIXD + get_regex_charset(RExC_flags);
11820 if (op > POSIXA) { /* /aa is same as /a */
11823 else if (op == POSIXL) {
11824 RExC_contains_locale = 1;
11827 join_posix_op_known:
11830 op += NPOSIXD - POSIXD;
11833 ret = reg_node(pRExC_state, op);
11835 FLAGS(ret) = namedclass_to_classnum(arg);
11838 *flagp |= HASWIDTH|SIMPLE;
11842 nextchar(pRExC_state);
11843 Set_Node_Length(ret, 2); /* MJD */
11849 char* parse_start = RExC_parse - 2;
11854 ret = regclass(pRExC_state, flagp,depth+1,
11855 TRUE, /* means just parse this element */
11856 FALSE, /* don't allow multi-char folds */
11857 FALSE, /* don't silence non-portable warnings.
11858 It would be a bug if these returned
11861 /* regclass() can only return RESTART_UTF8 if multi-char folds
11864 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11869 Set_Node_Offset(ret, parse_start + 2);
11870 Set_Node_Cur_Length(ret, parse_start);
11871 nextchar(pRExC_state);
11875 /* Handle \N and \N{NAME} with multiple code points here and not
11876 * below because it can be multicharacter. join_exact() will join
11877 * them up later on. Also this makes sure that things like
11878 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11879 * The options to the grok function call causes it to fail if the
11880 * sequence is just a single code point. We then go treat it as
11881 * just another character in the current EXACT node, and hence it
11882 * gets uniform treatment with all the other characters. The
11883 * special treatment for quantifiers is not needed for such single
11884 * character sequences */
11886 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11889 if (*flagp & RESTART_UTF8)
11895 case 'k': /* Handle \k<NAME> and \k'NAME' */
11898 char ch= RExC_parse[1];
11899 if (ch != '<' && ch != '\'' && ch != '{') {
11901 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11902 vFAIL2("Sequence %.2s... not terminated",parse_start);
11904 /* this pretty much dupes the code for (?P=...) in reg(), if
11905 you change this make sure you change that */
11906 char* name_start = (RExC_parse += 2);
11908 SV *sv_dat = reg_scan_name(pRExC_state,
11909 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11910 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11911 if (RExC_parse == name_start || *RExC_parse != ch)
11912 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11913 vFAIL2("Sequence %.3s... not terminated",parse_start);
11916 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11917 RExC_rxi->data->data[num]=(void*)sv_dat;
11918 SvREFCNT_inc_simple_void(sv_dat);
11922 ret = reganode(pRExC_state,
11925 : (ASCII_FOLD_RESTRICTED)
11927 : (AT_LEAST_UNI_SEMANTICS)
11933 *flagp |= HASWIDTH;
11935 /* override incorrect value set in reganode MJD */
11936 Set_Node_Offset(ret, parse_start+1);
11937 Set_Node_Cur_Length(ret, parse_start);
11938 nextchar(pRExC_state);
11944 case '1': case '2': case '3': case '4':
11945 case '5': case '6': case '7': case '8': case '9':
11950 if (*RExC_parse == 'g') {
11954 if (*RExC_parse == '{') {
11958 if (*RExC_parse == '-') {
11962 if (hasbrace && !isDIGIT(*RExC_parse)) {
11963 if (isrel) RExC_parse--;
11965 goto parse_named_seq;
11968 num = S_backref_value(RExC_parse);
11970 vFAIL("Reference to invalid group 0");
11971 else if (num == I32_MAX) {
11972 if (isDIGIT(*RExC_parse))
11973 vFAIL("Reference to nonexistent group");
11975 vFAIL("Unterminated \\g... pattern");
11979 num = RExC_npar - num;
11981 vFAIL("Reference to nonexistent or unclosed group");
11985 num = S_backref_value(RExC_parse);
11986 /* bare \NNN might be backref or octal - if it is larger than or equal
11987 * RExC_npar then it is assumed to be and octal escape.
11988 * Note RExC_npar is +1 from the actual number of parens*/
11989 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11990 && *RExC_parse != '8' && *RExC_parse != '9'))
11992 /* Probably a character specified in octal, e.g. \35 */
11997 /* at this point RExC_parse definitely points to a backref
12000 #ifdef RE_TRACK_PATTERN_OFFSETS
12001 char * const parse_start = RExC_parse - 1; /* MJD */
12003 while (isDIGIT(*RExC_parse))
12006 if (*RExC_parse != '}')
12007 vFAIL("Unterminated \\g{...} pattern");
12011 if (num > (I32)RExC_rx->nparens)
12012 vFAIL("Reference to nonexistent group");
12015 ret = reganode(pRExC_state,
12018 : (ASCII_FOLD_RESTRICTED)
12020 : (AT_LEAST_UNI_SEMANTICS)
12026 *flagp |= HASWIDTH;
12028 /* override incorrect value set in reganode MJD */
12029 Set_Node_Offset(ret, parse_start+1);
12030 Set_Node_Cur_Length(ret, parse_start);
12032 nextchar(pRExC_state);
12037 if (RExC_parse >= RExC_end)
12038 FAIL("Trailing \\");
12041 /* Do not generate "unrecognized" warnings here, we fall
12042 back into the quick-grab loop below */
12049 if (RExC_flags & RXf_PMf_EXTENDED) {
12050 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12051 if (RExC_parse < RExC_end)
12058 parse_start = RExC_parse - 1;
12067 #define MAX_NODE_STRING_SIZE 127
12068 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12070 U8 upper_parse = MAX_NODE_STRING_SIZE;
12071 U8 node_type = compute_EXACTish(pRExC_state);
12072 bool next_is_quantifier;
12073 char * oldp = NULL;
12075 /* We can convert EXACTF nodes to EXACTFU if they contain only
12076 * characters that match identically regardless of the target
12077 * string's UTF8ness. The reason to do this is that EXACTF is not
12078 * trie-able, EXACTFU is.
12080 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12081 * contain only above-Latin1 characters (hence must be in UTF8),
12082 * which don't participate in folds with Latin1-range characters,
12083 * as the latter's folds aren't known until runtime. (We don't
12084 * need to figure this out until pass 2) */
12085 bool maybe_exactfu = PASS2
12086 && (node_type == EXACTF || node_type == EXACTFL);
12088 /* If a folding node contains only code points that don't
12089 * participate in folds, it can be changed into an EXACT node,
12090 * which allows the optimizer more things to look for */
12093 ret = reg_node(pRExC_state, node_type);
12095 /* In pass1, folded, we use a temporary buffer instead of the
12096 * actual node, as the node doesn't exist yet */
12097 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12103 /* We do the EXACTFish to EXACT node only if folding. (And we
12104 * don't need to figure this out until pass 2) */
12105 maybe_exact = FOLD && PASS2;
12107 /* XXX The node can hold up to 255 bytes, yet this only goes to
12108 * 127. I (khw) do not know why. Keeping it somewhat less than
12109 * 255 allows us to not have to worry about overflow due to
12110 * converting to utf8 and fold expansion, but that value is
12111 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12112 * split up by this limit into a single one using the real max of
12113 * 255. Even at 127, this breaks under rare circumstances. If
12114 * folding, we do not want to split a node at a character that is a
12115 * non-final in a multi-char fold, as an input string could just
12116 * happen to want to match across the node boundary. The join
12117 * would solve that problem if the join actually happens. But a
12118 * series of more than two nodes in a row each of 127 would cause
12119 * the first join to succeed to get to 254, but then there wouldn't
12120 * be room for the next one, which could at be one of those split
12121 * multi-char folds. I don't know of any fool-proof solution. One
12122 * could back off to end with only a code point that isn't such a
12123 * non-final, but it is possible for there not to be any in the
12125 for (p = RExC_parse - 1;
12126 len < upper_parse && p < RExC_end;
12131 if (RExC_flags & RXf_PMf_EXTENDED)
12132 p = regpatws(pRExC_state, p,
12133 TRUE); /* means recognize comments */
12144 /* Literal Escapes Switch
12146 This switch is meant to handle escape sequences that
12147 resolve to a literal character.
12149 Every escape sequence that represents something
12150 else, like an assertion or a char class, is handled
12151 in the switch marked 'Special Escapes' above in this
12152 routine, but also has an entry here as anything that
12153 isn't explicitly mentioned here will be treated as
12154 an unescaped equivalent literal.
12157 switch ((U8)*++p) {
12158 /* These are all the special escapes. */
12159 case 'A': /* Start assertion */
12160 case 'b': case 'B': /* Word-boundary assertion*/
12161 case 'C': /* Single char !DANGEROUS! */
12162 case 'd': case 'D': /* digit class */
12163 case 'g': case 'G': /* generic-backref, pos assertion */
12164 case 'h': case 'H': /* HORIZWS */
12165 case 'k': case 'K': /* named backref, keep marker */
12166 case 'p': case 'P': /* Unicode property */
12167 case 'R': /* LNBREAK */
12168 case 's': case 'S': /* space class */
12169 case 'v': case 'V': /* VERTWS */
12170 case 'w': case 'W': /* word class */
12171 case 'X': /* eXtended Unicode "combining
12172 character sequence" */
12173 case 'z': case 'Z': /* End of line/string assertion */
12177 /* Anything after here is an escape that resolves to a
12178 literal. (Except digits, which may or may not)
12184 case 'N': /* Handle a single-code point named character. */
12185 /* The options cause it to fail if a multiple code
12186 * point sequence. Handle those in the switch() above
12188 RExC_parse = p + 1;
12189 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12195 if (*flagp & RESTART_UTF8)
12196 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12197 RExC_parse = p = oldp;
12201 if (ender > 0xff) {
12218 ender = ESC_NATIVE;
12228 const char* error_msg;
12230 bool valid = grok_bslash_o(&p,
12233 PASS2, /* out warnings */
12234 FALSE, /* not strict */
12235 TRUE, /* Output warnings
12240 RExC_parse = p; /* going to die anyway; point
12241 to exact spot of failure */
12245 if (IN_ENCODING && ender < 0x100) {
12246 goto recode_encoding;
12248 if (ender > 0xff) {
12255 UV result = UV_MAX; /* initialize to erroneous
12257 const char* error_msg;
12259 bool valid = grok_bslash_x(&p,
12262 PASS2, /* out warnings */
12263 FALSE, /* not strict */
12264 TRUE, /* Output warnings
12269 RExC_parse = p; /* going to die anyway; point
12270 to exact spot of failure */
12275 if (IN_ENCODING && ender < 0x100) {
12276 goto recode_encoding;
12278 if (ender > 0xff) {
12285 ender = grok_bslash_c(*p++, PASS2);
12287 case '8': case '9': /* must be a backreference */
12290 case '1': case '2': case '3':case '4':
12291 case '5': case '6': case '7':
12292 /* When we parse backslash escapes there is ambiguity
12293 * between backreferences and octal escapes. Any escape
12294 * from \1 - \9 is a backreference, any multi-digit
12295 * escape which does not start with 0 and which when
12296 * evaluated as decimal could refer to an already
12297 * parsed capture buffer is a backslash. Anything else
12300 * Note this implies that \118 could be interpreted as
12301 * 118 OR as "\11" . "8" depending on whether there
12302 * were 118 capture buffers defined already in the
12305 /* NOTE, RExC_npar is 1 more than the actual number of
12306 * parens we have seen so far, hence the < RExC_npar below. */
12308 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12309 { /* Not to be treated as an octal constant, go
12317 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12319 ender = grok_oct(p, &numlen, &flags, NULL);
12320 if (ender > 0xff) {
12324 if (PASS2 /* like \08, \178 */
12327 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12329 reg_warn_non_literal_string(
12331 form_short_octal_warning(p, numlen));
12334 if (IN_ENCODING && ender < 0x100)
12335 goto recode_encoding;
12338 if (! RExC_override_recoding) {
12339 SV* enc = _get_encoding();
12340 ender = reg_recode((const char)(U8)ender, &enc);
12342 ckWARNreg(p, "Invalid escape in the specified encoding");
12348 FAIL("Trailing \\");
12351 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12352 /* Include any { following the alpha to emphasize
12353 * that it could be part of an escape at some point
12355 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12356 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12358 goto normal_default;
12359 } /* End of switch on '\' */
12362 /* Currently we don't warn when the lbrace is at the start
12363 * of a construct. This catches it in the middle of a
12364 * literal string, or when its the first thing after
12365 * something like "\b" */
12367 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12369 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12372 default: /* A literal character */
12374 if (UTF8_IS_START(*p) && UTF) {
12376 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12377 &numlen, UTF8_ALLOW_DEFAULT);
12383 } /* End of switch on the literal */
12385 /* Here, have looked at the literal character and <ender>
12386 * contains its ordinal, <p> points to the character after it
12389 if ( RExC_flags & RXf_PMf_EXTENDED)
12390 p = regpatws(pRExC_state, p,
12391 TRUE); /* means recognize comments */
12393 /* If the next thing is a quantifier, it applies to this
12394 * character only, which means that this character has to be in
12395 * its own node and can't just be appended to the string in an
12396 * existing node, so if there are already other characters in
12397 * the node, close the node with just them, and set up to do
12398 * this character again next time through, when it will be the
12399 * only thing in its new node */
12400 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12406 if (! FOLD /* The simple case, just append the literal */
12407 || (LOC /* Also don't fold for tricky chars under /l */
12408 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12411 const STRLEN unilen = reguni(pRExC_state, ender, s);
12417 /* The loop increments <len> each time, as all but this
12418 * path (and one other) through it add a single byte to
12419 * the EXACTish node. But this one has changed len to
12420 * be the correct final value, so subtract one to
12421 * cancel out the increment that follows */
12425 REGC((char)ender, s++);
12428 /* Can get here if folding only if is one of the /l
12429 * characters whose fold depends on the locale. The
12430 * occurrence of any of these indicate that we can't
12431 * simplify things */
12433 maybe_exact = FALSE;
12434 maybe_exactfu = FALSE;
12439 /* See comments for join_exact() as to why we fold this
12440 * non-UTF at compile time */
12441 || (node_type == EXACTFU
12442 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12444 /* Here, are folding and are not UTF-8 encoded; therefore
12445 * the character must be in the range 0-255, and is not /l
12446 * (Not /l because we already handled these under /l in
12447 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12448 if (IS_IN_SOME_FOLD_L1(ender)) {
12449 maybe_exact = FALSE;
12451 /* See if the character's fold differs between /d and
12452 * /u. This includes the multi-char fold SHARP S to
12455 && (PL_fold[ender] != PL_fold_latin1[ender]
12456 || ender == LATIN_SMALL_LETTER_SHARP_S
12458 && isALPHA_FOLD_EQ(ender, 's')
12459 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12461 maybe_exactfu = FALSE;
12465 /* Even when folding, we store just the input character, as
12466 * we have an array that finds its fold quickly */
12467 *(s++) = (char) ender;
12469 else { /* FOLD and UTF */
12470 /* Unlike the non-fold case, we do actually have to
12471 * calculate the results here in pass 1. This is for two
12472 * reasons, the folded length may be longer than the
12473 * unfolded, and we have to calculate how many EXACTish
12474 * nodes it will take; and we may run out of room in a node
12475 * in the middle of a potential multi-char fold, and have
12476 * to back off accordingly. (Hence we can't use REGC for
12477 * the simple case just below.) */
12480 if (isASCII_uni(ender)) {
12481 folded = toFOLD(ender);
12482 *(s)++ = (U8) folded;
12487 folded = _to_uni_fold_flags(
12491 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12492 ? FOLD_FLAGS_NOMIX_ASCII
12496 /* The loop increments <len> each time, as all but this
12497 * path (and one other) through it add a single byte to
12498 * the EXACTish node. But this one has changed len to
12499 * be the correct final value, so subtract one to
12500 * cancel out the increment that follows */
12501 len += foldlen - 1;
12503 /* If this node only contains non-folding code points so
12504 * far, see if this new one is also non-folding */
12506 if (folded != ender) {
12507 maybe_exact = FALSE;
12510 /* Here the fold is the original; we have to check
12511 * further to see if anything folds to it */
12512 if (_invlist_contains_cp(PL_utf8_foldable,
12515 maybe_exact = FALSE;
12522 if (next_is_quantifier) {
12524 /* Here, the next input is a quantifier, and to get here,
12525 * the current character is the only one in the node.
12526 * Also, here <len> doesn't include the final byte for this
12532 } /* End of loop through literal characters */
12534 /* Here we have either exhausted the input or ran out of room in
12535 * the node. (If we encountered a character that can't be in the
12536 * node, transfer is made directly to <loopdone>, and so we
12537 * wouldn't have fallen off the end of the loop.) In the latter
12538 * case, we artificially have to split the node into two, because
12539 * we just don't have enough space to hold everything. This
12540 * creates a problem if the final character participates in a
12541 * multi-character fold in the non-final position, as a match that
12542 * should have occurred won't, due to the way nodes are matched,
12543 * and our artificial boundary. So back off until we find a non-
12544 * problematic character -- one that isn't at the beginning or
12545 * middle of such a fold. (Either it doesn't participate in any
12546 * folds, or appears only in the final position of all the folds it
12547 * does participate in.) A better solution with far fewer false
12548 * positives, and that would fill the nodes more completely, would
12549 * be to actually have available all the multi-character folds to
12550 * test against, and to back-off only far enough to be sure that
12551 * this node isn't ending with a partial one. <upper_parse> is set
12552 * further below (if we need to reparse the node) to include just
12553 * up through that final non-problematic character that this code
12554 * identifies, so when it is set to less than the full node, we can
12555 * skip the rest of this */
12556 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12558 const STRLEN full_len = len;
12560 assert(len >= MAX_NODE_STRING_SIZE);
12562 /* Here, <s> points to the final byte of the final character.
12563 * Look backwards through the string until find a non-
12564 * problematic character */
12568 /* This has no multi-char folds to non-UTF characters */
12569 if (ASCII_FOLD_RESTRICTED) {
12573 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12577 if (! PL_NonL1NonFinalFold) {
12578 PL_NonL1NonFinalFold = _new_invlist_C_array(
12579 NonL1_Perl_Non_Final_Folds_invlist);
12582 /* Point to the first byte of the final character */
12583 s = (char *) utf8_hop((U8 *) s, -1);
12585 while (s >= s0) { /* Search backwards until find
12586 non-problematic char */
12587 if (UTF8_IS_INVARIANT(*s)) {
12589 /* There are no ascii characters that participate
12590 * in multi-char folds under /aa. In EBCDIC, the
12591 * non-ascii invariants are all control characters,
12592 * so don't ever participate in any folds. */
12593 if (ASCII_FOLD_RESTRICTED
12594 || ! IS_NON_FINAL_FOLD(*s))
12599 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12600 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12606 else if (! _invlist_contains_cp(
12607 PL_NonL1NonFinalFold,
12608 valid_utf8_to_uvchr((U8 *) s, NULL)))
12613 /* Here, the current character is problematic in that
12614 * it does occur in the non-final position of some
12615 * fold, so try the character before it, but have to
12616 * special case the very first byte in the string, so
12617 * we don't read outside the string */
12618 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12619 } /* End of loop backwards through the string */
12621 /* If there were only problematic characters in the string,
12622 * <s> will point to before s0, in which case the length
12623 * should be 0, otherwise include the length of the
12624 * non-problematic character just found */
12625 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12628 /* Here, have found the final character, if any, that is
12629 * non-problematic as far as ending the node without splitting
12630 * it across a potential multi-char fold. <len> contains the
12631 * number of bytes in the node up-to and including that
12632 * character, or is 0 if there is no such character, meaning
12633 * the whole node contains only problematic characters. In
12634 * this case, give up and just take the node as-is. We can't
12639 /* If the node ends in an 's' we make sure it stays EXACTF,
12640 * as if it turns into an EXACTFU, it could later get
12641 * joined with another 's' that would then wrongly match
12643 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12645 maybe_exactfu = FALSE;
12649 /* Here, the node does contain some characters that aren't
12650 * problematic. If one such is the final character in the
12651 * node, we are done */
12652 if (len == full_len) {
12655 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12657 /* If the final character is problematic, but the
12658 * penultimate is not, back-off that last character to
12659 * later start a new node with it */
12664 /* Here, the final non-problematic character is earlier
12665 * in the input than the penultimate character. What we do
12666 * is reparse from the beginning, going up only as far as
12667 * this final ok one, thus guaranteeing that the node ends
12668 * in an acceptable character. The reason we reparse is
12669 * that we know how far in the character is, but we don't
12670 * know how to correlate its position with the input parse.
12671 * An alternate implementation would be to build that
12672 * correlation as we go along during the original parse,
12673 * but that would entail extra work for every node, whereas
12674 * this code gets executed only when the string is too
12675 * large for the node, and the final two characters are
12676 * problematic, an infrequent occurrence. Yet another
12677 * possible strategy would be to save the tail of the
12678 * string, and the next time regatom is called, initialize
12679 * with that. The problem with this is that unless you
12680 * back off one more character, you won't be guaranteed
12681 * regatom will get called again, unless regbranch,
12682 * regpiece ... are also changed. If you do back off that
12683 * extra character, so that there is input guaranteed to
12684 * force calling regatom, you can't handle the case where
12685 * just the first character in the node is acceptable. I
12686 * (khw) decided to try this method which doesn't have that
12687 * pitfall; if performance issues are found, we can do a
12688 * combination of the current approach plus that one */
12694 } /* End of verifying node ends with an appropriate char */
12696 loopdone: /* Jumped to when encounters something that shouldn't be in
12699 /* I (khw) don't know if you can get here with zero length, but the
12700 * old code handled this situation by creating a zero-length EXACT
12701 * node. Might as well be NOTHING instead */
12707 /* If 'maybe_exact' is still set here, means there are no
12708 * code points in the node that participate in folds;
12709 * similarly for 'maybe_exactfu' and code points that match
12710 * differently depending on UTF8ness of the target string
12711 * (for /u), or depending on locale for /l */
12715 else if (maybe_exactfu) {
12719 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12720 FALSE /* Don't look to see if could
12721 be turned into an EXACT
12722 node, as we have already
12727 RExC_parse = p - 1;
12728 Set_Node_Cur_Length(ret, parse_start);
12729 nextchar(pRExC_state);
12731 /* len is STRLEN which is unsigned, need to copy to signed */
12734 vFAIL("Internal disaster");
12737 } /* End of label 'defchar:' */
12739 } /* End of giant switch on input character */
12745 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12747 /* Returns the next non-pattern-white space, non-comment character (the
12748 * latter only if 'recognize_comment is true) in the string p, which is
12749 * ended by RExC_end. See also reg_skipcomment */
12750 const char *e = RExC_end;
12752 PERL_ARGS_ASSERT_REGPATWS;
12756 if ((len = is_PATWS_safe(p, e, UTF))) {
12759 else if (recognize_comment && *p == '#') {
12760 p = reg_skipcomment(pRExC_state, p);
12769 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12771 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12772 * sets up the bitmap and any flags, removing those code points from the
12773 * inversion list, setting it to NULL should it become completely empty */
12775 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12776 assert(PL_regkind[OP(node)] == ANYOF);
12778 ANYOF_BITMAP_ZERO(node);
12779 if (*invlist_ptr) {
12781 /* This gets set if we actually need to modify things */
12782 bool change_invlist = FALSE;
12786 /* Start looking through *invlist_ptr */
12787 invlist_iterinit(*invlist_ptr);
12788 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12792 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12793 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12795 else if (end >= NUM_ANYOF_CODE_POINTS) {
12796 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12799 /* Quit if are above what we should change */
12800 if (start >= NUM_ANYOF_CODE_POINTS) {
12804 change_invlist = TRUE;
12806 /* Set all the bits in the range, up to the max that we are doing */
12807 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12809 : NUM_ANYOF_CODE_POINTS - 1;
12810 for (i = start; i <= (int) high; i++) {
12811 if (! ANYOF_BITMAP_TEST(node, i)) {
12812 ANYOF_BITMAP_SET(node, i);
12816 invlist_iterfinish(*invlist_ptr);
12818 /* Done with loop; remove any code points that are in the bitmap from
12819 * *invlist_ptr; similarly for code points above the bitmap if we have
12820 * a flag to match all of them anyways */
12821 if (change_invlist) {
12822 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12824 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12825 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12828 /* If have completely emptied it, remove it completely */
12829 if (_invlist_len(*invlist_ptr) == 0) {
12830 SvREFCNT_dec_NN(*invlist_ptr);
12831 *invlist_ptr = NULL;
12836 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12837 Character classes ([:foo:]) can also be negated ([:^foo:]).
12838 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12839 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12840 but trigger failures because they are currently unimplemented. */
12842 #define POSIXCC_DONE(c) ((c) == ':')
12843 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12844 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12846 PERL_STATIC_INLINE I32
12847 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12849 I32 namedclass = OOB_NAMEDCLASS;
12851 PERL_ARGS_ASSERT_REGPPOSIXCC;
12853 if (value == '[' && RExC_parse + 1 < RExC_end &&
12854 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12855 POSIXCC(UCHARAT(RExC_parse)))
12857 const char c = UCHARAT(RExC_parse);
12858 char* const s = RExC_parse++;
12860 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12862 if (RExC_parse == RExC_end) {
12865 /* Try to give a better location for the error (than the end of
12866 * the string) by looking for the matching ']' */
12868 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12871 vFAIL2("Unmatched '%c' in POSIX class", c);
12873 /* Grandfather lone [:, [=, [. */
12877 const char* const t = RExC_parse++; /* skip over the c */
12880 if (UCHARAT(RExC_parse) == ']') {
12881 const char *posixcc = s + 1;
12882 RExC_parse++; /* skip over the ending ] */
12885 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12886 const I32 skip = t - posixcc;
12888 /* Initially switch on the length of the name. */
12891 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12892 this is the Perl \w
12894 namedclass = ANYOF_WORDCHAR;
12897 /* Names all of length 5. */
12898 /* alnum alpha ascii blank cntrl digit graph lower
12899 print punct space upper */
12900 /* Offset 4 gives the best switch position. */
12901 switch (posixcc[4]) {
12903 if (memEQ(posixcc, "alph", 4)) /* alpha */
12904 namedclass = ANYOF_ALPHA;
12907 if (memEQ(posixcc, "spac", 4)) /* space */
12908 namedclass = ANYOF_PSXSPC;
12911 if (memEQ(posixcc, "grap", 4)) /* graph */
12912 namedclass = ANYOF_GRAPH;
12915 if (memEQ(posixcc, "asci", 4)) /* ascii */
12916 namedclass = ANYOF_ASCII;
12919 if (memEQ(posixcc, "blan", 4)) /* blank */
12920 namedclass = ANYOF_BLANK;
12923 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12924 namedclass = ANYOF_CNTRL;
12927 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12928 namedclass = ANYOF_ALPHANUMERIC;
12931 if (memEQ(posixcc, "lowe", 4)) /* lower */
12932 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12933 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12934 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12937 if (memEQ(posixcc, "digi", 4)) /* digit */
12938 namedclass = ANYOF_DIGIT;
12939 else if (memEQ(posixcc, "prin", 4)) /* print */
12940 namedclass = ANYOF_PRINT;
12941 else if (memEQ(posixcc, "punc", 4)) /* punct */
12942 namedclass = ANYOF_PUNCT;
12947 if (memEQ(posixcc, "xdigit", 6))
12948 namedclass = ANYOF_XDIGIT;
12952 if (namedclass == OOB_NAMEDCLASS)
12954 "POSIX class [:%"UTF8f":] unknown",
12955 UTF8fARG(UTF, t - s - 1, s + 1));
12957 /* The #defines are structured so each complement is +1 to
12958 * the normal one */
12962 assert (posixcc[skip] == ':');
12963 assert (posixcc[skip+1] == ']');
12964 } else if (!SIZE_ONLY) {
12965 /* [[=foo=]] and [[.foo.]] are still future. */
12967 /* adjust RExC_parse so the warning shows after
12968 the class closes */
12969 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12971 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12974 /* Maternal grandfather:
12975 * "[:" ending in ":" but not in ":]" */
12977 vFAIL("Unmatched '[' in POSIX class");
12980 /* Grandfather lone [:, [=, [. */
12990 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12992 /* This applies some heuristics at the current parse position (which should
12993 * be at a '[') to see if what follows might be intended to be a [:posix:]
12994 * class. It returns true if it really is a posix class, of course, but it
12995 * also can return true if it thinks that what was intended was a posix
12996 * class that didn't quite make it.
12998 * It will return true for
13000 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13001 * ')' indicating the end of the (?[
13002 * [:any garbage including %^&$ punctuation:]
13004 * This is designed to be called only from S_handle_regex_sets; it could be
13005 * easily adapted to be called from the spot at the beginning of regclass()
13006 * that checks to see in a normal bracketed class if the surrounding []
13007 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13008 * change long-standing behavior, so I (khw) didn't do that */
13009 char* p = RExC_parse + 1;
13010 char first_char = *p;
13012 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13014 assert(*(p - 1) == '[');
13016 if (! POSIXCC(first_char)) {
13021 while (p < RExC_end && isWORDCHAR(*p)) p++;
13023 if (p >= RExC_end) {
13027 if (p - RExC_parse > 2 /* Got at least 1 word character */
13028 && (*p == first_char
13029 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13034 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13037 && p - RExC_parse > 2 /* [:] evaluates to colon;
13038 [::] is a bad posix class. */
13039 && first_char == *(p - 1));
13043 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13044 I32 *flagp, U32 depth,
13045 char * const oregcomp_parse)
13047 /* Handle the (?[...]) construct to do set operations */
13050 UV start, end; /* End points of code point ranges */
13052 char *save_end, *save_parse;
13057 const bool save_fold = FOLD;
13059 GET_RE_DEBUG_FLAGS_DECL;
13061 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13064 vFAIL("(?[...]) not valid in locale");
13066 RExC_uni_semantics = 1;
13068 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13069 * (such as EXACT). Thus we can skip most everything if just sizing. We
13070 * call regclass to handle '[]' so as to not have to reinvent its parsing
13071 * rules here (throwing away the size it computes each time). And, we exit
13072 * upon an unescaped ']' that isn't one ending a regclass. To do both
13073 * these things, we need to realize that something preceded by a backslash
13074 * is escaped, so we have to keep track of backslashes */
13076 Perl_ck_warner_d(aTHX_
13077 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13078 "The regex_sets feature is experimental" REPORT_LOCATION,
13079 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13081 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13082 RExC_precomp + (RExC_parse - RExC_precomp)));
13085 UV depth = 0; /* how many nested (?[...]) constructs */
13087 while (RExC_parse < RExC_end) {
13088 SV* current = NULL;
13089 RExC_parse = regpatws(pRExC_state, RExC_parse,
13090 TRUE); /* means recognize comments */
13091 switch (*RExC_parse) {
13093 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13098 /* Skip the next byte (which could cause us to end up in
13099 * the middle of a UTF-8 character, but since none of those
13100 * are confusable with anything we currently handle in this
13101 * switch (invariants all), it's safe. We'll just hit the
13102 * default: case next time and keep on incrementing until
13103 * we find one of the invariants we do handle. */
13108 /* If this looks like it is a [:posix:] class, leave the
13109 * parse pointer at the '[' to fool regclass() into
13110 * thinking it is part of a '[[:posix:]]'. That function
13111 * will use strict checking to force a syntax error if it
13112 * doesn't work out to a legitimate class */
13113 bool is_posix_class
13114 = could_it_be_a_POSIX_class(pRExC_state);
13115 if (! is_posix_class) {
13119 /* regclass() can only return RESTART_UTF8 if multi-char
13120 folds are allowed. */
13121 if (!regclass(pRExC_state, flagp,depth+1,
13122 is_posix_class, /* parse the whole char
13123 class only if not a
13125 FALSE, /* don't allow multi-char folds */
13126 TRUE, /* silence non-portable warnings. */
13128 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13131 /* function call leaves parse pointing to the ']', except
13132 * if we faked it */
13133 if (is_posix_class) {
13137 SvREFCNT_dec(current); /* In case it returned something */
13142 if (depth--) break;
13144 if (RExC_parse < RExC_end
13145 && *RExC_parse == ')')
13147 node = reganode(pRExC_state, ANYOF, 0);
13148 RExC_size += ANYOF_SKIP;
13149 nextchar(pRExC_state);
13150 Set_Node_Length(node,
13151 RExC_parse - oregcomp_parse + 1); /* MJD */
13160 FAIL("Syntax error in (?[...])");
13163 /* Pass 2 only after this. Everything in this construct is a
13164 * metacharacter. Operands begin with either a '\' (for an escape
13165 * sequence), or a '[' for a bracketed character class. Any other
13166 * character should be an operator, or parenthesis for grouping. Both
13167 * types of operands are handled by calling regclass() to parse them. It
13168 * is called with a parameter to indicate to return the computed inversion
13169 * list. The parsing here is implemented via a stack. Each entry on the
13170 * stack is a single character representing one of the operators, or the
13171 * '('; or else a pointer to an operand inversion list. */
13173 #define IS_OPERAND(a) (! SvIOK(a))
13175 /* The stack starts empty. It is a syntax error if the first thing parsed
13176 * is a binary operator; everything else is pushed on the stack. When an
13177 * operand is parsed, the top of the stack is examined. If it is a binary
13178 * operator, the item before it should be an operand, and both are replaced
13179 * by the result of doing that operation on the new operand and the one on
13180 * the stack. Thus a sequence of binary operands is reduced to a single
13181 * one before the next one is parsed.
13183 * A unary operator may immediately follow a binary in the input, for
13186 * When an operand is parsed and the top of the stack is a unary operator,
13187 * the operation is performed, and then the stack is rechecked to see if
13188 * this new operand is part of a binary operation; if so, it is handled as
13191 * A '(' is simply pushed on the stack; it is valid only if the stack is
13192 * empty, or the top element of the stack is an operator or another '('
13193 * (for which the parenthesized expression will become an operand). By the
13194 * time the corresponding ')' is parsed everything in between should have
13195 * been parsed and evaluated to a single operand (or else is a syntax
13196 * error), and is handled as a regular operand */
13198 sv_2mortal((SV *)(stack = newAV()));
13200 while (RExC_parse < RExC_end) {
13201 I32 top_index = av_tindex(stack);
13203 SV* current = NULL;
13205 /* Skip white space */
13206 RExC_parse = regpatws(pRExC_state, RExC_parse,
13207 TRUE /* means recognize comments */ );
13208 if (RExC_parse >= RExC_end) {
13209 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13211 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13218 if (av_tindex(stack) >= 0 /* This makes sure that we can
13219 safely subtract 1 from
13220 RExC_parse in the next clause.
13221 If we have something on the
13222 stack, we have parsed something
13224 && UCHARAT(RExC_parse - 1) == '('
13225 && RExC_parse < RExC_end)
13227 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13228 * This happens when we have some thing like
13230 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13232 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13234 * Here we would be handling the interpolated
13235 * '$thai_or_lao'. We handle this by a recursive call to
13236 * ourselves which returns the inversion list the
13237 * interpolated expression evaluates to. We use the flags
13238 * from the interpolated pattern. */
13239 U32 save_flags = RExC_flags;
13240 const char * const save_parse = ++RExC_parse;
13242 parse_lparen_question_flags(pRExC_state);
13244 if (RExC_parse == save_parse /* Makes sure there was at
13245 least one flag (or this
13246 embedding wasn't compiled)
13248 || RExC_parse >= RExC_end - 4
13249 || UCHARAT(RExC_parse) != ':'
13250 || UCHARAT(++RExC_parse) != '('
13251 || UCHARAT(++RExC_parse) != '?'
13252 || UCHARAT(++RExC_parse) != '[')
13255 /* In combination with the above, this moves the
13256 * pointer to the point just after the first erroneous
13257 * character (or if there are no flags, to where they
13258 * should have been) */
13259 if (RExC_parse >= RExC_end - 4) {
13260 RExC_parse = RExC_end;
13262 else if (RExC_parse != save_parse) {
13263 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13265 vFAIL("Expecting '(?flags:(?[...'");
13268 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13269 depth+1, oregcomp_parse);
13271 /* Here, 'current' contains the embedded expression's
13272 * inversion list, and RExC_parse points to the trailing
13273 * ']'; the next character should be the ')' which will be
13274 * paired with the '(' that has been put on the stack, so
13275 * the whole embedded expression reduces to '(operand)' */
13278 RExC_flags = save_flags;
13279 goto handle_operand;
13284 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13285 vFAIL("Unexpected character");
13288 /* regclass() can only return RESTART_UTF8 if multi-char
13289 folds are allowed. */
13290 if (!regclass(pRExC_state, flagp,depth+1,
13291 TRUE, /* means parse just the next thing */
13292 FALSE, /* don't allow multi-char folds */
13293 FALSE, /* don't silence non-portable warnings. */
13295 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13297 /* regclass() will return with parsing just the \ sequence,
13298 * leaving the parse pointer at the next thing to parse */
13300 goto handle_operand;
13302 case '[': /* Is a bracketed character class */
13304 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13306 if (! is_posix_class) {
13310 /* regclass() can only return RESTART_UTF8 if multi-char
13311 folds are allowed. */
13312 if(!regclass(pRExC_state, flagp,depth+1,
13313 is_posix_class, /* parse the whole char class
13314 only if not a posix class */
13315 FALSE, /* don't allow multi-char folds */
13316 FALSE, /* don't silence non-portable warnings. */
13318 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13320 /* function call leaves parse pointing to the ']', except if we
13322 if (is_posix_class) {
13326 goto handle_operand;
13335 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13336 || ! IS_OPERAND(*top_ptr))
13339 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13341 av_push(stack, newSVuv(curchar));
13345 av_push(stack, newSVuv(curchar));
13349 if (top_index >= 0) {
13350 top_ptr = av_fetch(stack, top_index, FALSE);
13352 if (IS_OPERAND(*top_ptr)) {
13354 vFAIL("Unexpected '(' with no preceding operator");
13357 av_push(stack, newSVuv(curchar));
13364 || ! (current = av_pop(stack))
13365 || ! IS_OPERAND(current)
13366 || ! (lparen = av_pop(stack))
13367 || IS_OPERAND(lparen)
13368 || SvUV(lparen) != '(')
13370 SvREFCNT_dec(current);
13372 vFAIL("Unexpected ')'");
13375 SvREFCNT_dec_NN(lparen);
13382 /* Here, we have an operand to process, in 'current' */
13384 if (top_index < 0) { /* Just push if stack is empty */
13385 av_push(stack, current);
13388 SV* top = av_pop(stack);
13390 char current_operator;
13392 if (IS_OPERAND(top)) {
13393 SvREFCNT_dec_NN(top);
13394 SvREFCNT_dec_NN(current);
13395 vFAIL("Operand with no preceding operator");
13397 current_operator = (char) SvUV(top);
13398 switch (current_operator) {
13399 case '(': /* Push the '(' back on followed by the new
13401 av_push(stack, top);
13402 av_push(stack, current);
13403 SvREFCNT_inc(top); /* Counters the '_dec' done
13404 just after the 'break', so
13405 it doesn't get wrongly freed
13410 _invlist_invert(current);
13412 /* Unlike binary operators, the top of the stack,
13413 * now that this unary one has been popped off, may
13414 * legally be an operator, and we now have operand
13417 SvREFCNT_dec_NN(top);
13418 goto handle_operand;
13421 prev = av_pop(stack);
13422 _invlist_intersection(prev,
13425 av_push(stack, current);
13430 prev = av_pop(stack);
13431 _invlist_union(prev, current, ¤t);
13432 av_push(stack, current);
13436 prev = av_pop(stack);;
13437 _invlist_subtract(prev, current, ¤t);
13438 av_push(stack, current);
13441 case '^': /* The union minus the intersection */
13447 prev = av_pop(stack);
13448 _invlist_union(prev, current, &u);
13449 _invlist_intersection(prev, current, &i);
13450 /* _invlist_subtract will overwrite current
13451 without freeing what it already contains */
13453 _invlist_subtract(u, i, ¤t);
13454 av_push(stack, current);
13455 SvREFCNT_dec_NN(i);
13456 SvREFCNT_dec_NN(u);
13457 SvREFCNT_dec_NN(element);
13462 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13464 SvREFCNT_dec_NN(top);
13465 SvREFCNT_dec(prev);
13469 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13472 if (av_tindex(stack) < 0 /* Was empty */
13473 || ((final = av_pop(stack)) == NULL)
13474 || ! IS_OPERAND(final)
13475 || av_tindex(stack) >= 0) /* More left on stack */
13477 vFAIL("Incomplete expression within '(?[ ])'");
13480 /* Here, 'final' is the resultant inversion list from evaluating the
13481 * expression. Return it if so requested */
13482 if (return_invlist) {
13483 *return_invlist = final;
13487 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13488 * expecting a string of ranges and individual code points */
13489 invlist_iterinit(final);
13490 result_string = newSVpvs("");
13491 while (invlist_iternext(final, &start, &end)) {
13492 if (start == end) {
13493 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13496 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13501 save_parse = RExC_parse;
13502 RExC_parse = SvPV(result_string, len);
13503 save_end = RExC_end;
13504 RExC_end = RExC_parse + len;
13506 /* We turn off folding around the call, as the class we have constructed
13507 * already has all folding taken into consideration, and we don't want
13508 * regclass() to add to that */
13509 RExC_flags &= ~RXf_PMf_FOLD;
13510 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13512 node = regclass(pRExC_state, flagp,depth+1,
13513 FALSE, /* means parse the whole char class */
13514 FALSE, /* don't allow multi-char folds */
13515 TRUE, /* silence non-portable warnings. The above may very
13516 well have generated non-portable code points, but
13517 they're valid on this machine */
13520 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13523 RExC_flags |= RXf_PMf_FOLD;
13525 RExC_parse = save_parse + 1;
13526 RExC_end = save_end;
13527 SvREFCNT_dec_NN(final);
13528 SvREFCNT_dec_NN(result_string);
13530 nextchar(pRExC_state);
13531 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13537 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13539 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13540 * innocent-looking character class, like /[ks]/i won't have to go out to
13541 * disk to find the possible matches.
13543 * This should be called only for a Latin1-range code points, cp, which is
13544 * known to be involved in a simple fold with other code points above
13545 * Latin1. It would give false results if /aa has been specified.
13546 * Multi-char folds are outside the scope of this, and must be handled
13549 * XXX It would be better to generate these via regen, in case a new
13550 * version of the Unicode standard adds new mappings, though that is not
13551 * really likely, and may be caught by the default: case of the switch
13554 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13556 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13562 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13566 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13569 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13570 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13572 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13573 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13574 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13576 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13577 *invlist = add_cp_to_invlist(*invlist,
13578 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13580 case LATIN_SMALL_LETTER_SHARP_S:
13581 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13584 /* Use deprecated warning to increase the chances of this being
13587 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13594 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13596 /* This adds the string scalar <multi_string> to the array
13597 * <multi_char_matches>. <multi_string> is known to have exactly
13598 * <cp_count> code points in it. This is used when constructing a
13599 * bracketed character class and we find something that needs to match more
13600 * than a single character.
13602 * <multi_char_matches> is actually an array of arrays. Each top-level
13603 * element is an array that contains all the strings known so far that are
13604 * the same length. And that length (in number of code points) is the same
13605 * as the index of the top-level array. Hence, the [2] element is an
13606 * array, each element thereof is a string containing TWO code points;
13607 * while element [3] is for strings of THREE characters, and so on. Since
13608 * this is for multi-char strings there can never be a [0] nor [1] element.
13610 * When we rewrite the character class below, we will do so such that the
13611 * longest strings are written first, so that it prefers the longest
13612 * matching strings first. This is done even if it turns out that any
13613 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13614 * Christiansen has agreed that this is ok. This makes the test for the
13615 * ligature 'ffi' come before the test for 'ff', for example */
13618 AV** this_array_ptr;
13620 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13622 if (! multi_char_matches) {
13623 multi_char_matches = newAV();
13626 if (av_exists(multi_char_matches, cp_count)) {
13627 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13628 this_array = *this_array_ptr;
13631 this_array = newAV();
13632 av_store(multi_char_matches, cp_count,
13635 av_push(this_array, multi_string);
13637 return multi_char_matches;
13640 /* The names of properties whose definitions are not known at compile time are
13641 * stored in this SV, after a constant heading. So if the length has been
13642 * changed since initialization, then there is a run-time definition. */
13643 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13644 (SvCUR(listsv) != initial_listsv_len)
13647 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13648 const bool stop_at_1, /* Just parse the next thing, don't
13649 look for a full character class */
13650 bool allow_multi_folds,
13651 const bool silence_non_portable, /* Don't output warnings
13654 SV** ret_invlist) /* Return an inversion list, not a node */
13656 /* parse a bracketed class specification. Most of these will produce an
13657 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13658 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13659 * under /i with multi-character folds: it will be rewritten following the
13660 * paradigm of this example, where the <multi-fold>s are characters which
13661 * fold to multiple character sequences:
13662 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13663 * gets effectively rewritten as:
13664 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13665 * reg() gets called (recursively) on the rewritten version, and this
13666 * function will return what it constructs. (Actually the <multi-fold>s
13667 * aren't physically removed from the [abcdefghi], it's just that they are
13668 * ignored in the recursion by means of a flag:
13669 * <RExC_in_multi_char_class>.)
13671 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13672 * characters, with the corresponding bit set if that character is in the
13673 * list. For characters above this, a range list or swash is used. There
13674 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13675 * determinable at compile time
13677 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13678 * to be restarted. This can only happen if ret_invlist is non-NULL.
13681 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13683 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13686 IV namedclass = OOB_NAMEDCLASS;
13687 char *rangebegin = NULL;
13688 bool need_class = 0;
13690 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13691 than just initialized. */
13692 SV* properties = NULL; /* Code points that match \p{} \P{} */
13693 SV* posixes = NULL; /* Code points that match classes like [:word:],
13694 extended beyond the Latin1 range. These have to
13695 be kept separate from other code points for much
13696 of this function because their handling is
13697 different under /i, and for most classes under
13699 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13700 separate for a while from the non-complemented
13701 versions because of complications with /d
13703 UV element_count = 0; /* Number of distinct elements in the class.
13704 Optimizations may be possible if this is tiny */
13705 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13706 character; used under /i */
13708 char * stop_ptr = RExC_end; /* where to stop parsing */
13709 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13711 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13713 /* Unicode properties are stored in a swash; this holds the current one
13714 * being parsed. If this swash is the only above-latin1 component of the
13715 * character class, an optimization is to pass it directly on to the
13716 * execution engine. Otherwise, it is set to NULL to indicate that there
13717 * are other things in the class that have to be dealt with at execution
13719 SV* swash = NULL; /* Code points that match \p{} \P{} */
13721 /* Set if a component of this character class is user-defined; just passed
13722 * on to the engine */
13723 bool has_user_defined_property = FALSE;
13725 /* inversion list of code points this node matches only when the target
13726 * string is in UTF-8. (Because is under /d) */
13727 SV* depends_list = NULL;
13729 /* Inversion list of code points this node matches regardless of things
13730 * like locale, folding, utf8ness of the target string */
13731 SV* cp_list = NULL;
13733 /* Like cp_list, but code points on this list need to be checked for things
13734 * that fold to/from them under /i */
13735 SV* cp_foldable_list = NULL;
13737 /* Like cp_list, but code points on this list are valid only when the
13738 * runtime locale is UTF-8 */
13739 SV* only_utf8_locale_list = NULL;
13742 /* In a range, counts how many 0-2 of the ends of it came from literals,
13743 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13744 UV literal_endpoint = 0;
13746 bool invert = FALSE; /* Is this class to be complemented */
13748 bool warn_super = ALWAYS_WARN_SUPER;
13750 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13751 case we need to change the emitted regop to an EXACT. */
13752 const char * orig_parse = RExC_parse;
13753 const SSize_t orig_size = RExC_size;
13754 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13755 GET_RE_DEBUG_FLAGS_DECL;
13757 PERL_ARGS_ASSERT_REGCLASS;
13759 PERL_UNUSED_ARG(depth);
13762 DEBUG_PARSE("clas");
13764 /* Assume we are going to generate an ANYOF node. */
13765 ret = reganode(pRExC_state, ANYOF, 0);
13768 RExC_size += ANYOF_SKIP;
13769 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13772 ANYOF_FLAGS(ret) = 0;
13774 RExC_emit += ANYOF_SKIP;
13775 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13776 initial_listsv_len = SvCUR(listsv);
13777 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13781 RExC_parse = regpatws(pRExC_state, RExC_parse,
13782 FALSE /* means don't recognize comments */ );
13785 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13788 allow_multi_folds = FALSE;
13791 RExC_parse = regpatws(pRExC_state, RExC_parse,
13792 FALSE /* means don't recognize comments */ );
13796 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13797 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13798 const char *s = RExC_parse;
13799 const char c = *s++;
13801 while (isWORDCHAR(*s))
13803 if (*s && c == *s && s[1] == ']') {
13804 SAVEFREESV(RExC_rx_sv);
13806 "POSIX syntax [%c %c] belongs inside character classes",
13808 (void)ReREFCNT_inc(RExC_rx_sv);
13812 /* If the caller wants us to just parse a single element, accomplish this
13813 * by faking the loop ending condition */
13814 if (stop_at_1 && RExC_end > RExC_parse) {
13815 stop_ptr = RExC_parse + 1;
13818 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13819 if (UCHARAT(RExC_parse) == ']')
13820 goto charclassloop;
13823 if (RExC_parse >= stop_ptr) {
13828 RExC_parse = regpatws(pRExC_state, RExC_parse,
13829 FALSE /* means don't recognize comments */ );
13832 if (UCHARAT(RExC_parse) == ']') {
13838 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13839 save_value = value;
13840 save_prevvalue = prevvalue;
13843 rangebegin = RExC_parse;
13847 value = utf8n_to_uvchr((U8*)RExC_parse,
13848 RExC_end - RExC_parse,
13849 &numlen, UTF8_ALLOW_DEFAULT);
13850 RExC_parse += numlen;
13853 value = UCHARAT(RExC_parse++);
13856 && RExC_parse < RExC_end
13857 && POSIXCC(UCHARAT(RExC_parse)))
13859 namedclass = regpposixcc(pRExC_state, value, strict);
13861 else if (value != '\\') {
13863 literal_endpoint++;
13867 /* Is a backslash; get the code point of the char after it */
13868 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13869 value = utf8n_to_uvchr((U8*)RExC_parse,
13870 RExC_end - RExC_parse,
13871 &numlen, UTF8_ALLOW_DEFAULT);
13872 RExC_parse += numlen;
13875 value = UCHARAT(RExC_parse++);
13877 /* Some compilers cannot handle switching on 64-bit integer
13878 * values, therefore value cannot be an UV. Yes, this will
13879 * be a problem later if we want switch on Unicode.
13880 * A similar issue a little bit later when switching on
13881 * namedclass. --jhi */
13883 /* If the \ is escaping white space when white space is being
13884 * skipped, it means that that white space is wanted literally, and
13885 * is already in 'value'. Otherwise, need to translate the escape
13886 * into what it signifies. */
13887 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13889 case 'w': namedclass = ANYOF_WORDCHAR; break;
13890 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13891 case 's': namedclass = ANYOF_SPACE; break;
13892 case 'S': namedclass = ANYOF_NSPACE; break;
13893 case 'd': namedclass = ANYOF_DIGIT; break;
13894 case 'D': namedclass = ANYOF_NDIGIT; break;
13895 case 'v': namedclass = ANYOF_VERTWS; break;
13896 case 'V': namedclass = ANYOF_NVERTWS; break;
13897 case 'h': namedclass = ANYOF_HORIZWS; break;
13898 case 'H': namedclass = ANYOF_NHORIZWS; break;
13899 case 'N': /* Handle \N{NAME} in class */
13902 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13903 flagp, depth, &as_text);
13904 if (*flagp & RESTART_UTF8)
13905 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13906 if (cp_count != 1) { /* The typical case drops through */
13907 assert(cp_count != (STRLEN) -1);
13908 if (cp_count == 0) {
13910 RExC_parse++; /* Position after the "}" */
13911 vFAIL("Zero length \\N{}");
13914 ckWARNreg(RExC_parse,
13915 "Ignoring zero length \\N{} in character class");
13918 else { /* cp_count > 1 */
13919 if (! RExC_in_multi_char_class) {
13920 if (invert || range || *RExC_parse == '-') {
13923 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13926 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13931 = add_multi_match(multi_char_matches,
13935 break; /* <value> contains the first code
13936 point. Drop out of the switch to
13939 } /* End of cp_count != 1 */
13941 /* This element should not be processed further in this
13944 value = save_value;
13945 prevvalue = save_prevvalue;
13946 continue; /* Back to top of loop to get next char */
13948 /* Here, is a single code point, and <value> contains it */
13950 /* We consider named characters to be literal characters */
13951 literal_endpoint++;
13960 /* We will handle any undefined properties ourselves */
13961 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13962 /* And we actually would prefer to get
13963 * the straight inversion list of the
13964 * swash, since we will be accessing it
13965 * anyway, to save a little time */
13966 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13968 if (RExC_parse >= RExC_end)
13969 vFAIL2("Empty \\%c{}", (U8)value);
13970 if (*RExC_parse == '{') {
13971 const U8 c = (U8)value;
13972 e = strchr(RExC_parse++, '}');
13974 vFAIL2("Missing right brace on \\%c{}", c);
13975 while (isSPACE(*RExC_parse))
13977 if (e == RExC_parse)
13978 vFAIL2("Empty \\%c{}", c);
13979 n = e - RExC_parse;
13980 while (isSPACE(*(RExC_parse + n - 1)))
13991 if (UCHARAT(RExC_parse) == '^') {
13994 /* toggle. (The rhs xor gets the single bit that
13995 * differs between P and p; the other xor inverts just
13997 value ^= 'P' ^ 'p';
13999 while (isSPACE(*RExC_parse)) {
14004 /* Try to get the definition of the property into
14005 * <invlist>. If /i is in effect, the effective property
14006 * will have its name be <__NAME_i>. The design is
14007 * discussed in commit
14008 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14009 name = savepv(Perl_form(aTHX_
14011 (FOLD) ? "__" : "",
14017 /* Look up the property name, and get its swash and
14018 * inversion list, if the property is found */
14020 SvREFCNT_dec_NN(swash);
14022 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14025 NULL, /* No inversion list */
14028 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14029 HV* curpkg = (IN_PERL_COMPILETIME)
14031 : CopSTASH(PL_curcop);
14033 SvREFCNT_dec_NN(swash);
14037 /* Here didn't find it. It could be a user-defined
14038 * property that will be available at run-time. If we
14039 * accept only compile-time properties, is an error;
14040 * otherwise add it to the list for run-time look up */
14042 RExC_parse = e + 1;
14044 "Property '%"UTF8f"' is unknown",
14045 UTF8fARG(UTF, n, name));
14048 /* If the property name doesn't already have a package
14049 * name, add the current one to it so that it can be
14050 * referred to outside it. [perl #121777] */
14051 if (curpkg && ! instr(name, "::")) {
14052 char* pkgname = HvNAME(curpkg);
14053 if (strNE(pkgname, "main")) {
14054 char* full_name = Perl_form(aTHX_
14058 n = strlen(full_name);
14060 name = savepvn(full_name, n);
14063 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14064 (value == 'p' ? '+' : '!'),
14065 UTF8fARG(UTF, n, name));
14066 has_user_defined_property = TRUE;
14068 /* We don't know yet, so have to assume that the
14069 * property could match something in the Latin1 range,
14070 * hence something that isn't utf8. Note that this
14071 * would cause things in <depends_list> to match
14072 * inappropriately, except that any \p{}, including
14073 * this one forces Unicode semantics, which means there
14074 * is no <depends_list> */
14076 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14080 /* Here, did get the swash and its inversion list. If
14081 * the swash is from a user-defined property, then this
14082 * whole character class should be regarded as such */
14083 if (swash_init_flags
14084 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14086 has_user_defined_property = TRUE;
14089 /* We warn on matching an above-Unicode code point
14090 * if the match would return true, except don't
14091 * warn for \p{All}, which has exactly one element
14093 (_invlist_contains_cp(invlist, 0x110000)
14094 && (! (_invlist_len(invlist) == 1
14095 && *invlist_array(invlist) == 0)))
14101 /* Invert if asking for the complement */
14102 if (value == 'P') {
14103 _invlist_union_complement_2nd(properties,
14107 /* The swash can't be used as-is, because we've
14108 * inverted things; delay removing it to here after
14109 * have copied its invlist above */
14110 SvREFCNT_dec_NN(swash);
14114 _invlist_union(properties, invlist, &properties);
14119 RExC_parse = e + 1;
14120 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14123 /* \p means they want Unicode semantics */
14124 RExC_uni_semantics = 1;
14127 case 'n': value = '\n'; break;
14128 case 'r': value = '\r'; break;
14129 case 't': value = '\t'; break;
14130 case 'f': value = '\f'; break;
14131 case 'b': value = '\b'; break;
14132 case 'e': value = ESC_NATIVE; break;
14133 case 'a': value = '\a'; break;
14135 RExC_parse--; /* function expects to be pointed at the 'o' */
14137 const char* error_msg;
14138 bool valid = grok_bslash_o(&RExC_parse,
14141 PASS2, /* warnings only in
14144 silence_non_portable,
14150 if (IN_ENCODING && value < 0x100) {
14151 goto recode_encoding;
14155 RExC_parse--; /* function expects to be pointed at the 'x' */
14157 const char* error_msg;
14158 bool valid = grok_bslash_x(&RExC_parse,
14161 PASS2, /* Output warnings */
14163 silence_non_portable,
14169 if (IN_ENCODING && value < 0x100)
14170 goto recode_encoding;
14173 value = grok_bslash_c(*RExC_parse++, PASS2);
14175 case '0': case '1': case '2': case '3': case '4':
14176 case '5': case '6': case '7':
14178 /* Take 1-3 octal digits */
14179 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14180 numlen = (strict) ? 4 : 3;
14181 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14182 RExC_parse += numlen;
14185 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14186 vFAIL("Need exactly 3 octal digits");
14188 else if (! SIZE_ONLY /* like \08, \178 */
14190 && RExC_parse < RExC_end
14191 && isDIGIT(*RExC_parse)
14192 && ckWARN(WARN_REGEXP))
14194 SAVEFREESV(RExC_rx_sv);
14195 reg_warn_non_literal_string(
14197 form_short_octal_warning(RExC_parse, numlen));
14198 (void)ReREFCNT_inc(RExC_rx_sv);
14201 if (IN_ENCODING && value < 0x100)
14202 goto recode_encoding;
14206 if (! RExC_override_recoding) {
14207 SV* enc = _get_encoding();
14208 value = reg_recode((const char)(U8)value, &enc);
14211 vFAIL("Invalid escape in the specified encoding");
14214 ckWARNreg(RExC_parse,
14215 "Invalid escape in the specified encoding");
14221 /* Allow \_ to not give an error */
14222 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14224 vFAIL2("Unrecognized escape \\%c in character class",
14228 SAVEFREESV(RExC_rx_sv);
14229 ckWARN2reg(RExC_parse,
14230 "Unrecognized escape \\%c in character class passed through",
14232 (void)ReREFCNT_inc(RExC_rx_sv);
14236 } /* End of switch on char following backslash */
14237 } /* end of handling backslash escape sequences */
14239 /* Here, we have the current token in 'value' */
14241 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14244 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14245 * literal, as is the character that began the false range, i.e.
14246 * the 'a' in the examples */
14249 const int w = (RExC_parse >= rangebegin)
14250 ? RExC_parse - rangebegin
14254 "False [] range \"%"UTF8f"\"",
14255 UTF8fARG(UTF, w, rangebegin));
14258 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14259 ckWARN2reg(RExC_parse,
14260 "False [] range \"%"UTF8f"\"",
14261 UTF8fARG(UTF, w, rangebegin));
14262 (void)ReREFCNT_inc(RExC_rx_sv);
14263 cp_list = add_cp_to_invlist(cp_list, '-');
14264 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14269 range = 0; /* this was not a true range */
14270 element_count += 2; /* So counts for three values */
14273 classnum = namedclass_to_classnum(namedclass);
14275 if (LOC && namedclass < ANYOF_POSIXL_MAX
14276 #ifndef HAS_ISASCII
14277 && classnum != _CC_ASCII
14280 /* What the Posix classes (like \w, [:space:]) match in locale
14281 * isn't knowable under locale until actual match time. Room
14282 * must be reserved (one time per outer bracketed class) to
14283 * store such classes. The space will contain a bit for each
14284 * named class that is to be matched against. This isn't
14285 * needed for \p{} and pseudo-classes, as they are not affected
14286 * by locale, and hence are dealt with separately */
14287 if (! need_class) {
14290 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14293 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14295 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14296 ANYOF_POSIXL_ZERO(ret);
14299 /* Coverity thinks it is possible for this to be negative; both
14300 * jhi and khw think it's not, but be safer */
14301 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14302 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14304 /* See if it already matches the complement of this POSIX
14306 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14307 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14311 posixl_matches_all = TRUE;
14312 break; /* No need to continue. Since it matches both
14313 e.g., \w and \W, it matches everything, and the
14314 bracketed class can be optimized into qr/./s */
14317 /* Add this class to those that should be checked at runtime */
14318 ANYOF_POSIXL_SET(ret, namedclass);
14320 /* The above-Latin1 characters are not subject to locale rules.
14321 * Just add them, in the second pass, to the
14322 * unconditionally-matched list */
14324 SV* scratch_list = NULL;
14326 /* Get the list of the above-Latin1 code points this
14328 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14329 PL_XPosix_ptrs[classnum],
14331 /* Odd numbers are complements, like
14332 * NDIGIT, NASCII, ... */
14333 namedclass % 2 != 0,
14335 /* Checking if 'cp_list' is NULL first saves an extra
14336 * clone. Its reference count will be decremented at the
14337 * next union, etc, or if this is the only instance, at the
14338 * end of the routine */
14340 cp_list = scratch_list;
14343 _invlist_union(cp_list, scratch_list, &cp_list);
14344 SvREFCNT_dec_NN(scratch_list);
14346 continue; /* Go get next character */
14349 else if (! SIZE_ONLY) {
14351 /* Here, not in pass1 (in that pass we skip calculating the
14352 * contents of this class), and is /l, or is a POSIX class for
14353 * which /l doesn't matter (or is a Unicode property, which is
14354 * skipped here). */
14355 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14356 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14358 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14359 * nor /l make a difference in what these match,
14360 * therefore we just add what they match to cp_list. */
14361 if (classnum != _CC_VERTSPACE) {
14362 assert( namedclass == ANYOF_HORIZWS
14363 || namedclass == ANYOF_NHORIZWS);
14365 /* It turns out that \h is just a synonym for
14367 classnum = _CC_BLANK;
14370 _invlist_union_maybe_complement_2nd(
14372 PL_XPosix_ptrs[classnum],
14373 namedclass % 2 != 0, /* Complement if odd
14374 (NHORIZWS, NVERTWS)
14379 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14380 complement and use nposixes */
14381 SV** posixes_ptr = namedclass % 2 == 0
14384 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14385 _invlist_union_maybe_complement_2nd(
14388 namedclass % 2 != 0,
14392 } /* end of namedclass \blah */
14395 RExC_parse = regpatws(pRExC_state, RExC_parse,
14396 FALSE /* means don't recognize comments */ );
14399 /* If 'range' is set, 'value' is the ending of a range--check its
14400 * validity. (If value isn't a single code point in the case of a
14401 * range, we should have figured that out above in the code that
14402 * catches false ranges). Later, we will handle each individual code
14403 * point in the range. If 'range' isn't set, this could be the
14404 * beginning of a range, so check for that by looking ahead to see if
14405 * the next real character to be processed is the range indicator--the
14409 if (prevvalue > value) /* b-a */ {
14410 const int w = RExC_parse - rangebegin;
14412 "Invalid [] range \"%"UTF8f"\"",
14413 UTF8fARG(UTF, w, rangebegin));
14414 range = 0; /* not a valid range */
14418 prevvalue = value; /* save the beginning of the potential range */
14419 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14420 && *RExC_parse == '-')
14422 char* next_char_ptr = RExC_parse + 1;
14423 if (skip_white) { /* Get the next real char after the '-' */
14424 next_char_ptr = regpatws(pRExC_state,
14426 FALSE); /* means don't recognize
14430 /* If the '-' is at the end of the class (just before the ']',
14431 * it is a literal minus; otherwise it is a range */
14432 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14433 RExC_parse = next_char_ptr;
14435 /* a bad range like \w-, [:word:]- ? */
14436 if (namedclass > OOB_NAMEDCLASS) {
14437 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14438 const int w = RExC_parse >= rangebegin
14439 ? RExC_parse - rangebegin
14442 vFAIL4("False [] range \"%*.*s\"",
14447 "False [] range \"%*.*s\"",
14452 cp_list = add_cp_to_invlist(cp_list, '-');
14456 range = 1; /* yeah, it's a range! */
14457 continue; /* but do it the next time */
14462 if (namedclass > OOB_NAMEDCLASS) {
14466 /* Here, we have a single value this time through the loop, and
14467 * <prevvalue> is the beginning of the range, if any; or <value> if
14470 /* non-Latin1 code point implies unicode semantics. Must be set in
14471 * pass1 so is there for the whole of pass 2 */
14473 RExC_uni_semantics = 1;
14476 /* Ready to process either the single value, or the completed range.
14477 * For single-valued non-inverted ranges, we consider the possibility
14478 * of multi-char folds. (We made a conscious decision to not do this
14479 * for the other cases because it can often lead to non-intuitive
14480 * results. For example, you have the peculiar case that:
14481 * "s s" =~ /^[^\xDF]+$/i => Y
14482 * "ss" =~ /^[^\xDF]+$/i => N
14484 * See [perl #89750] */
14485 if (FOLD && allow_multi_folds && value == prevvalue) {
14486 if (value == LATIN_SMALL_LETTER_SHARP_S
14487 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14490 /* Here <value> is indeed a multi-char fold. Get what it is */
14492 U8 foldbuf[UTF8_MAXBYTES_CASE];
14495 UV folded = _to_uni_fold_flags(
14499 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14500 ? FOLD_FLAGS_NOMIX_ASCII
14504 /* Here, <folded> should be the first character of the
14505 * multi-char fold of <value>, with <foldbuf> containing the
14506 * whole thing. But, if this fold is not allowed (because of
14507 * the flags), <fold> will be the same as <value>, and should
14508 * be processed like any other character, so skip the special
14510 if (folded != value) {
14512 /* Skip if we are recursed, currently parsing the class
14513 * again. Otherwise add this character to the list of
14514 * multi-char folds. */
14515 if (! RExC_in_multi_char_class) {
14516 STRLEN cp_count = utf8_length(foldbuf,
14517 foldbuf + foldlen);
14518 SV* multi_fold = sv_2mortal(newSVpvs(""));
14520 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14523 = add_multi_match(multi_char_matches,
14529 /* This element should not be processed further in this
14532 value = save_value;
14533 prevvalue = save_prevvalue;
14539 /* Deal with this element of the class */
14542 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14545 SV* this_range = _new_invlist(1);
14546 _append_range_to_invlist(this_range, prevvalue, value);
14548 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14549 * If this range was specified using something like 'i-j', we want
14550 * to include only the 'i' and the 'j', and not anything in
14551 * between, so exclude non-ASCII, non-alphabetics from it.
14552 * However, if the range was specified with something like
14553 * [\x89-\x91] or [\x89-j], all code points within it should be
14554 * included. literal_endpoint==2 means both ends of the range used
14555 * a literal character, not \x{foo} */
14556 if (literal_endpoint == 2
14557 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14558 || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14560 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14563 /* Since 'this_range' now only contains ascii, the intersection
14564 * of it with anything will still yield only ascii */
14565 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14568 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14569 literal_endpoint = 0;
14570 SvREFCNT_dec_NN(this_range);
14574 range = 0; /* this range (if it was one) is done now */
14575 } /* End of loop through all the text within the brackets */
14577 /* If anything in the class expands to more than one character, we have to
14578 * deal with them by building up a substitute parse string, and recursively
14579 * calling reg() on it, instead of proceeding */
14580 if (multi_char_matches) {
14581 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14584 char *save_end = RExC_end;
14585 char *save_parse = RExC_parse;
14586 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14591 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14592 because too confusing */
14594 sv_catpv(substitute_parse, "(?:");
14598 /* Look at the longest folds first */
14599 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14601 if (av_exists(multi_char_matches, cp_count)) {
14602 AV** this_array_ptr;
14605 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14607 while ((this_sequence = av_pop(*this_array_ptr)) !=
14610 if (! first_time) {
14611 sv_catpv(substitute_parse, "|");
14613 first_time = FALSE;
14615 sv_catpv(substitute_parse, SvPVX(this_sequence));
14620 /* If the character class contains anything else besides these
14621 * multi-character folds, have to include it in recursive parsing */
14622 if (element_count) {
14623 sv_catpv(substitute_parse, "|[");
14624 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14625 sv_catpv(substitute_parse, "]");
14628 sv_catpv(substitute_parse, ")");
14631 /* This is a way to get the parse to skip forward a whole named
14632 * sequence instead of matching the 2nd character when it fails the
14634 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14638 RExC_parse = SvPV(substitute_parse, len);
14639 RExC_end = RExC_parse + len;
14640 RExC_in_multi_char_class = 1;
14641 RExC_override_recoding = 1;
14642 RExC_emit = (regnode *)orig_emit;
14644 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14646 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14648 RExC_parse = save_parse;
14649 RExC_end = save_end;
14650 RExC_in_multi_char_class = 0;
14651 RExC_override_recoding = 0;
14652 SvREFCNT_dec_NN(multi_char_matches);
14656 /* Here, we've gone through the entire class and dealt with multi-char
14657 * folds. We are now in a position that we can do some checks to see if we
14658 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14659 * Currently we only do two checks:
14660 * 1) is in the unlikely event that the user has specified both, eg. \w and
14661 * \W under /l, then the class matches everything. (This optimization
14662 * is done only to make the optimizer code run later work.)
14663 * 2) if the character class contains only a single element (including a
14664 * single range), we see if there is an equivalent node for it.
14665 * Other checks are possible */
14666 if (! ret_invlist /* Can't optimize if returning the constructed
14668 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14673 if (UNLIKELY(posixl_matches_all)) {
14676 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14677 \w or [:digit:] or \p{foo}
14680 /* All named classes are mapped into POSIXish nodes, with its FLAG
14681 * argument giving which class it is */
14682 switch ((I32)namedclass) {
14683 case ANYOF_UNIPROP:
14686 /* These don't depend on the charset modifiers. They always
14687 * match under /u rules */
14688 case ANYOF_NHORIZWS:
14689 case ANYOF_HORIZWS:
14690 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14693 case ANYOF_NVERTWS:
14698 /* The actual POSIXish node for all the rest depends on the
14699 * charset modifier. The ones in the first set depend only on
14700 * ASCII or, if available on this platform, locale */
14704 op = (LOC) ? POSIXL : POSIXA;
14715 /* under /a could be alpha */
14717 if (ASCII_RESTRICTED) {
14718 namedclass = ANYOF_ALPHA + (namedclass % 2);
14726 /* The rest have more possibilities depending on the charset.
14727 * We take advantage of the enum ordering of the charset
14728 * modifiers to get the exact node type, */
14730 op = POSIXD + get_regex_charset(RExC_flags);
14731 if (op > POSIXA) { /* /aa is same as /a */
14736 /* The odd numbered ones are the complements of the
14737 * next-lower even number one */
14738 if (namedclass % 2 == 1) {
14742 arg = namedclass_to_classnum(namedclass);
14746 else if (value == prevvalue) {
14748 /* Here, the class consists of just a single code point */
14751 if (! LOC && value == '\n') {
14752 op = REG_ANY; /* Optimize [^\n] */
14753 *flagp |= HASWIDTH|SIMPLE;
14757 else if (value < 256 || UTF) {
14759 /* Optimize a single value into an EXACTish node, but not if it
14760 * would require converting the pattern to UTF-8. */
14761 op = compute_EXACTish(pRExC_state);
14763 } /* Otherwise is a range */
14764 else if (! LOC) { /* locale could vary these */
14765 if (prevvalue == '0') {
14766 if (value == '9') {
14771 else if (prevvalue == 'A') {
14774 && literal_endpoint == 2
14777 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14781 else if (prevvalue == 'a') {
14784 && literal_endpoint == 2
14787 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14793 /* Here, we have changed <op> away from its initial value iff we found
14794 * an optimization */
14797 /* Throw away this ANYOF regnode, and emit the calculated one,
14798 * which should correspond to the beginning, not current, state of
14800 const char * cur_parse = RExC_parse;
14801 RExC_parse = (char *)orig_parse;
14805 /* To get locale nodes to not use the full ANYOF size would
14806 * require moving the code above that writes the portions
14807 * of it that aren't in other nodes to after this point.
14808 * e.g. ANYOF_POSIXL_SET */
14809 RExC_size = orig_size;
14813 RExC_emit = (regnode *)orig_emit;
14814 if (PL_regkind[op] == POSIXD) {
14815 if (op == POSIXL) {
14816 RExC_contains_locale = 1;
14819 op += NPOSIXD - POSIXD;
14824 ret = reg_node(pRExC_state, op);
14826 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14830 *flagp |= HASWIDTH|SIMPLE;
14832 else if (PL_regkind[op] == EXACT) {
14833 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14834 TRUE /* downgradable to EXACT */
14838 RExC_parse = (char *) cur_parse;
14840 SvREFCNT_dec(posixes);
14841 SvREFCNT_dec(nposixes);
14842 SvREFCNT_dec(cp_list);
14843 SvREFCNT_dec(cp_foldable_list);
14850 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14852 /* If folding, we calculate all characters that could fold to or from the
14853 * ones already on the list */
14854 if (cp_foldable_list) {
14856 UV start, end; /* End points of code point ranges */
14858 SV* fold_intersection = NULL;
14861 /* Our calculated list will be for Unicode rules. For locale
14862 * matching, we have to keep a separate list that is consulted at
14863 * runtime only when the locale indicates Unicode rules. For
14864 * non-locale, we just use to the general list */
14866 use_list = &only_utf8_locale_list;
14869 use_list = &cp_list;
14872 /* Only the characters in this class that participate in folds need
14873 * be checked. Get the intersection of this class and all the
14874 * possible characters that are foldable. This can quickly narrow
14875 * down a large class */
14876 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14877 &fold_intersection);
14879 /* The folds for all the Latin1 characters are hard-coded into this
14880 * program, but we have to go out to disk to get the others. */
14881 if (invlist_highest(cp_foldable_list) >= 256) {
14883 /* This is a hash that for a particular fold gives all
14884 * characters that are involved in it */
14885 if (! PL_utf8_foldclosures) {
14886 _load_PL_utf8_foldclosures();
14890 /* Now look at the foldable characters in this class individually */
14891 invlist_iterinit(fold_intersection);
14892 while (invlist_iternext(fold_intersection, &start, &end)) {
14895 /* Look at every character in the range */
14896 for (j = start; j <= end; j++) {
14897 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14903 if (IS_IN_SOME_FOLD_L1(j)) {
14905 /* ASCII is always matched; non-ASCII is matched
14906 * only under Unicode rules (which could happen
14907 * under /l if the locale is a UTF-8 one */
14908 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14909 *use_list = add_cp_to_invlist(*use_list,
14910 PL_fold_latin1[j]);
14914 add_cp_to_invlist(depends_list,
14915 PL_fold_latin1[j]);
14919 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14920 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14922 add_above_Latin1_folds(pRExC_state,
14929 /* Here is an above Latin1 character. We don't have the
14930 * rules hard-coded for it. First, get its fold. This is
14931 * the simple fold, as the multi-character folds have been
14932 * handled earlier and separated out */
14933 _to_uni_fold_flags(j, foldbuf, &foldlen,
14934 (ASCII_FOLD_RESTRICTED)
14935 ? FOLD_FLAGS_NOMIX_ASCII
14938 /* Single character fold of above Latin1. Add everything in
14939 * its fold closure to the list that this node should match.
14940 * The fold closures data structure is a hash with the keys
14941 * being the UTF-8 of every character that is folded to, like
14942 * 'k', and the values each an array of all code points that
14943 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14944 * Multi-character folds are not included */
14945 if ((listp = hv_fetch(PL_utf8_foldclosures,
14946 (char *) foldbuf, foldlen, FALSE)))
14948 AV* list = (AV*) *listp;
14950 for (k = 0; k <= av_tindex(list); k++) {
14951 SV** c_p = av_fetch(list, k, FALSE);
14957 /* /aa doesn't allow folds between ASCII and non- */
14958 if ((ASCII_FOLD_RESTRICTED
14959 && (isASCII(c) != isASCII(j))))
14964 /* Folds under /l which cross the 255/256 boundary
14965 * are added to a separate list. (These are valid
14966 * only when the locale is UTF-8.) */
14967 if (c < 256 && LOC) {
14968 *use_list = add_cp_to_invlist(*use_list, c);
14972 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14974 cp_list = add_cp_to_invlist(cp_list, c);
14977 /* Similarly folds involving non-ascii Latin1
14978 * characters under /d are added to their list */
14979 depends_list = add_cp_to_invlist(depends_list,
14986 SvREFCNT_dec_NN(fold_intersection);
14989 /* Now that we have finished adding all the folds, there is no reason
14990 * to keep the foldable list separate */
14991 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14992 SvREFCNT_dec_NN(cp_foldable_list);
14995 /* And combine the result (if any) with any inversion list from posix
14996 * classes. The lists are kept separate up to now because we don't want to
14997 * fold the classes (folding of those is automatically handled by the swash
14998 * fetching code) */
14999 if (posixes || nposixes) {
15000 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15001 /* Under /a and /aa, nothing above ASCII matches these */
15002 _invlist_intersection(posixes,
15003 PL_XPosix_ptrs[_CC_ASCII],
15007 if (DEPENDS_SEMANTICS) {
15008 /* Under /d, everything in the upper half of the Latin1 range
15009 * matches these complements */
15010 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15012 else if (AT_LEAST_ASCII_RESTRICTED) {
15013 /* Under /a and /aa, everything above ASCII matches these
15015 _invlist_union_complement_2nd(nposixes,
15016 PL_XPosix_ptrs[_CC_ASCII],
15020 _invlist_union(posixes, nposixes, &posixes);
15021 SvREFCNT_dec_NN(nposixes);
15024 posixes = nposixes;
15027 if (! DEPENDS_SEMANTICS) {
15029 _invlist_union(cp_list, posixes, &cp_list);
15030 SvREFCNT_dec_NN(posixes);
15037 /* Under /d, we put into a separate list the Latin1 things that
15038 * match only when the target string is utf8 */
15039 SV* nonascii_but_latin1_properties = NULL;
15040 _invlist_intersection(posixes, PL_UpperLatin1,
15041 &nonascii_but_latin1_properties);
15042 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15045 _invlist_union(cp_list, posixes, &cp_list);
15046 SvREFCNT_dec_NN(posixes);
15052 if (depends_list) {
15053 _invlist_union(depends_list, nonascii_but_latin1_properties,
15055 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15058 depends_list = nonascii_but_latin1_properties;
15063 /* And combine the result (if any) with any inversion list from properties.
15064 * The lists are kept separate up to now so that we can distinguish the two
15065 * in regards to matching above-Unicode. A run-time warning is generated
15066 * if a Unicode property is matched against a non-Unicode code point. But,
15067 * we allow user-defined properties to match anything, without any warning,
15068 * and we also suppress the warning if there is a portion of the character
15069 * class that isn't a Unicode property, and which matches above Unicode, \W
15070 * or [\x{110000}] for example.
15071 * (Note that in this case, unlike the Posix one above, there is no
15072 * <depends_list>, because having a Unicode property forces Unicode
15077 /* If it matters to the final outcome, see if a non-property
15078 * component of the class matches above Unicode. If so, the
15079 * warning gets suppressed. This is true even if just a single
15080 * such code point is specified, as though not strictly correct if
15081 * another such code point is matched against, the fact that they
15082 * are using above-Unicode code points indicates they should know
15083 * the issues involved */
15085 warn_super = ! (invert
15086 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15089 _invlist_union(properties, cp_list, &cp_list);
15090 SvREFCNT_dec_NN(properties);
15093 cp_list = properties;
15097 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15101 /* Here, we have calculated what code points should be in the character
15104 * Now we can see about various optimizations. Fold calculation (which we
15105 * did above) needs to take place before inversion. Otherwise /[^k]/i
15106 * would invert to include K, which under /i would match k, which it
15107 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15108 * folded until runtime */
15110 /* If we didn't do folding, it's because some information isn't available
15111 * until runtime; set the run-time fold flag for these. (We don't have to
15112 * worry about properties folding, as that is taken care of by the swash
15113 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15114 * locales, or the class matches at least one 0-255 range code point */
15116 if (only_utf8_locale_list) {
15117 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15119 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15122 invlist_iterinit(cp_list);
15123 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15124 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15126 invlist_iterfinish(cp_list);
15130 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15131 * at compile time. Besides not inverting folded locale now, we can't
15132 * invert if there are things such as \w, which aren't known until runtime
15136 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15138 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15140 _invlist_invert(cp_list);
15142 /* Any swash can't be used as-is, because we've inverted things */
15144 SvREFCNT_dec_NN(swash);
15148 /* Clear the invert flag since have just done it here */
15153 *ret_invlist = cp_list;
15154 SvREFCNT_dec(swash);
15156 /* Discard the generated node */
15158 RExC_size = orig_size;
15161 RExC_emit = orig_emit;
15166 /* Some character classes are equivalent to other nodes. Such nodes take
15167 * up less room and generally fewer operations to execute than ANYOF nodes.
15168 * Above, we checked for and optimized into some such equivalents for
15169 * certain common classes that are easy to test. Getting to this point in
15170 * the code means that the class didn't get optimized there. Since this
15171 * code is only executed in Pass 2, it is too late to save space--it has
15172 * been allocated in Pass 1, and currently isn't given back. But turning
15173 * things into an EXACTish node can allow the optimizer to join it to any
15174 * adjacent such nodes. And if the class is equivalent to things like /./,
15175 * expensive run-time swashes can be avoided. Now that we have more
15176 * complete information, we can find things necessarily missed by the
15177 * earlier code. I (khw) am not sure how much to look for here. It would
15178 * be easy, but perhaps too slow, to check any candidates against all the
15179 * node types they could possibly match using _invlistEQ(). */
15184 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15185 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15187 /* We don't optimize if we are supposed to make sure all non-Unicode
15188 * code points raise a warning, as only ANYOF nodes have this check.
15190 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15193 U8 op = END; /* The optimzation node-type */
15194 const char * cur_parse= RExC_parse;
15196 invlist_iterinit(cp_list);
15197 if (! invlist_iternext(cp_list, &start, &end)) {
15199 /* Here, the list is empty. This happens, for example, when a
15200 * Unicode property is the only thing in the character class, and
15201 * it doesn't match anything. (perluniprops.pod notes such
15204 *flagp |= HASWIDTH|SIMPLE;
15206 else if (start == end) { /* The range is a single code point */
15207 if (! invlist_iternext(cp_list, &start, &end)
15209 /* Don't do this optimization if it would require changing
15210 * the pattern to UTF-8 */
15211 && (start < 256 || UTF))
15213 /* Here, the list contains a single code point. Can optimize
15214 * into an EXACTish node */
15223 /* A locale node under folding with one code point can be
15224 * an EXACTFL, as its fold won't be calculated until
15230 /* Here, we are generally folding, but there is only one
15231 * code point to match. If we have to, we use an EXACT
15232 * node, but it would be better for joining with adjacent
15233 * nodes in the optimization pass if we used the same
15234 * EXACTFish node that any such are likely to be. We can
15235 * do this iff the code point doesn't participate in any
15236 * folds. For example, an EXACTF of a colon is the same as
15237 * an EXACT one, since nothing folds to or from a colon. */
15239 if (IS_IN_SOME_FOLD_L1(value)) {
15244 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15249 /* If we haven't found the node type, above, it means we
15250 * can use the prevailing one */
15252 op = compute_EXACTish(pRExC_state);
15257 else if (start == 0) {
15258 if (end == UV_MAX) {
15260 *flagp |= HASWIDTH|SIMPLE;
15263 else if (end == '\n' - 1
15264 && invlist_iternext(cp_list, &start, &end)
15265 && start == '\n' + 1 && end == UV_MAX)
15268 *flagp |= HASWIDTH|SIMPLE;
15272 invlist_iterfinish(cp_list);
15275 RExC_parse = (char *)orig_parse;
15276 RExC_emit = (regnode *)orig_emit;
15278 ret = reg_node(pRExC_state, op);
15280 RExC_parse = (char *)cur_parse;
15282 if (PL_regkind[op] == EXACT) {
15283 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15284 TRUE /* downgradable to EXACT */
15288 SvREFCNT_dec_NN(cp_list);
15293 /* Here, <cp_list> contains all the code points we can determine at
15294 * compile time that match under all conditions. Go through it, and
15295 * for things that belong in the bitmap, put them there, and delete from
15296 * <cp_list>. While we are at it, see if everything above 255 is in the
15297 * list, and if so, set a flag to speed up execution */
15299 populate_ANYOF_from_invlist(ret, &cp_list);
15302 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15305 /* Here, the bitmap has been populated with all the Latin1 code points that
15306 * always match. Can now add to the overall list those that match only
15307 * when the target string is UTF-8 (<depends_list>). */
15308 if (depends_list) {
15310 _invlist_union(cp_list, depends_list, &cp_list);
15311 SvREFCNT_dec_NN(depends_list);
15314 cp_list = depends_list;
15316 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15319 /* If there is a swash and more than one element, we can't use the swash in
15320 * the optimization below. */
15321 if (swash && element_count > 1) {
15322 SvREFCNT_dec_NN(swash);
15326 /* Note that the optimization of using 'swash' if it is the only thing in
15327 * the class doesn't have us change swash at all, so it can include things
15328 * that are also in the bitmap; otherwise we have purposely deleted that
15329 * duplicate information */
15330 set_ANYOF_arg(pRExC_state, ret, cp_list,
15331 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15333 only_utf8_locale_list,
15334 swash, has_user_defined_property);
15336 *flagp |= HASWIDTH|SIMPLE;
15338 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15339 RExC_contains_locale = 1;
15345 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15348 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15349 regnode* const node,
15351 SV* const runtime_defns,
15352 SV* const only_utf8_locale_list,
15354 const bool has_user_defined_property)
15356 /* Sets the arg field of an ANYOF-type node 'node', using information about
15357 * the node passed-in. If there is nothing outside the node's bitmap, the
15358 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15359 * the count returned by add_data(), having allocated and stored an array,
15360 * av, that that count references, as follows:
15361 * av[0] stores the character class description in its textual form.
15362 * This is used later (regexec.c:Perl_regclass_swash()) to
15363 * initialize the appropriate swash, and is also useful for dumping
15364 * the regnode. This is set to &PL_sv_undef if the textual
15365 * description is not needed at run-time (as happens if the other
15366 * elements completely define the class)
15367 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15368 * computed from av[0]. But if no further computation need be done,
15369 * the swash is stored here now (and av[0] is &PL_sv_undef).
15370 * av[2] stores the inversion list of code points that match only if the
15371 * current locale is UTF-8
15372 * av[3] stores the cp_list inversion list for use in addition or instead
15373 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15374 * (Otherwise everything needed is already in av[0] and av[1])
15375 * av[4] is set if any component of the class is from a user-defined
15376 * property; used only if av[3] exists */
15380 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15382 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15383 assert(! (ANYOF_FLAGS(node)
15384 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15385 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15386 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15389 AV * const av = newAV();
15392 assert(ANYOF_FLAGS(node)
15393 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15394 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15396 av_store(av, 0, (runtime_defns)
15397 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15400 av_store(av, 1, swash);
15401 SvREFCNT_dec_NN(cp_list);
15404 av_store(av, 1, &PL_sv_undef);
15406 av_store(av, 3, cp_list);
15407 av_store(av, 4, newSVuv(has_user_defined_property));
15411 if (only_utf8_locale_list) {
15412 av_store(av, 2, only_utf8_locale_list);
15415 av_store(av, 2, &PL_sv_undef);
15418 rv = newRV_noinc(MUTABLE_SV(av));
15419 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15420 RExC_rxi->data->data[n] = (void*)rv;
15425 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15427 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15428 const regnode* node,
15431 SV** only_utf8_locale_ptr,
15435 /* For internal core use only.
15436 * Returns the swash for the input 'node' in the regex 'prog'.
15437 * If <doinit> is 'true', will attempt to create the swash if not already
15439 * If <listsvp> is non-null, will return the printable contents of the
15440 * swash. This can be used to get debugging information even before the
15441 * swash exists, by calling this function with 'doinit' set to false, in
15442 * which case the components that will be used to eventually create the
15443 * swash are returned (in a printable form).
15444 * If <exclude_list> is not NULL, it is an inversion list of things to
15445 * exclude from what's returned in <listsvp>.
15446 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15447 * that, in spite of this function's name, the swash it returns may include
15448 * the bitmap data as well */
15451 SV *si = NULL; /* Input swash initialization string */
15452 SV* invlist = NULL;
15454 RXi_GET_DECL(prog,progi);
15455 const struct reg_data * const data = prog ? progi->data : NULL;
15457 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15459 assert(ANYOF_FLAGS(node)
15460 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15461 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15463 if (data && data->count) {
15464 const U32 n = ARG(node);
15466 if (data->what[n] == 's') {
15467 SV * const rv = MUTABLE_SV(data->data[n]);
15468 AV * const av = MUTABLE_AV(SvRV(rv));
15469 SV **const ary = AvARRAY(av);
15470 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15472 si = *ary; /* ary[0] = the string to initialize the swash with */
15474 /* Elements 3 and 4 are either both present or both absent. [3] is
15475 * any inversion list generated at compile time; [4] indicates if
15476 * that inversion list has any user-defined properties in it. */
15477 if (av_tindex(av) >= 2) {
15478 if (only_utf8_locale_ptr
15480 && ary[2] != &PL_sv_undef)
15482 *only_utf8_locale_ptr = ary[2];
15485 assert(only_utf8_locale_ptr);
15486 *only_utf8_locale_ptr = NULL;
15489 if (av_tindex(av) >= 3) {
15491 if (SvUV(ary[4])) {
15492 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15500 /* Element [1] is reserved for the set-up swash. If already there,
15501 * return it; if not, create it and store it there */
15502 if (ary[1] && SvROK(ary[1])) {
15505 else if (doinit && ((si && si != &PL_sv_undef)
15506 || (invlist && invlist != &PL_sv_undef))) {
15508 sw = _core_swash_init("utf8", /* the utf8 package */
15512 0, /* not from tr/// */
15514 &swash_init_flags);
15515 (void)av_store(av, 1, sw);
15520 /* If requested, return a printable version of what this swash matches */
15522 SV* matches_string = newSVpvs("");
15524 /* The swash should be used, if possible, to get the data, as it
15525 * contains the resolved data. But this function can be called at
15526 * compile-time, before everything gets resolved, in which case we
15527 * return the currently best available information, which is the string
15528 * that will eventually be used to do that resolving, 'si' */
15529 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15530 && (si && si != &PL_sv_undef))
15532 sv_catsv(matches_string, si);
15535 /* Add the inversion list to whatever we have. This may have come from
15536 * the swash, or from an input parameter */
15538 if (exclude_list) {
15539 SV* clone = invlist_clone(invlist);
15540 _invlist_subtract(clone, exclude_list, &clone);
15541 sv_catsv(matches_string, _invlist_contents(clone));
15542 SvREFCNT_dec_NN(clone);
15545 sv_catsv(matches_string, _invlist_contents(invlist));
15548 *listsvp = matches_string;
15553 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15555 /* reg_skipcomment()
15557 Absorbs an /x style # comment from the input stream,
15558 returning a pointer to the first character beyond the comment, or if the
15559 comment terminates the pattern without anything following it, this returns
15560 one past the final character of the pattern (in other words, RExC_end) and
15561 sets the REG_RUN_ON_COMMENT_SEEN flag.
15563 Note it's the callers responsibility to ensure that we are
15564 actually in /x mode
15568 PERL_STATIC_INLINE char*
15569 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15571 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15575 while (p < RExC_end) {
15576 if (*(++p) == '\n') {
15581 /* we ran off the end of the pattern without ending the comment, so we have
15582 * to add an \n when wrapping */
15583 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15589 Advances the parse position, and optionally absorbs
15590 "whitespace" from the inputstream.
15592 Without /x "whitespace" means (?#...) style comments only,
15593 with /x this means (?#...) and # comments and whitespace proper.
15595 Returns the RExC_parse point from BEFORE the scan occurs.
15597 This is the /x friendly way of saying RExC_parse++.
15601 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15603 char* const retval = RExC_parse++;
15605 PERL_ARGS_ASSERT_NEXTCHAR;
15608 if (RExC_end - RExC_parse >= 3
15609 && *RExC_parse == '('
15610 && RExC_parse[1] == '?'
15611 && RExC_parse[2] == '#')
15613 while (*RExC_parse != ')') {
15614 if (RExC_parse == RExC_end)
15615 FAIL("Sequence (?#... not terminated");
15621 if (RExC_flags & RXf_PMf_EXTENDED) {
15622 char * p = regpatws(pRExC_state, RExC_parse,
15623 TRUE); /* means recognize comments */
15624 if (p != RExC_parse) {
15634 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15636 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15637 * space. In pass1, it aligns and increments RExC_size; in pass2,
15640 regnode * const ret = RExC_emit;
15641 GET_RE_DEBUG_FLAGS_DECL;
15643 PERL_ARGS_ASSERT_REGNODE_GUTS;
15645 assert(extra_size >= regarglen[op]);
15648 SIZE_ALIGN(RExC_size);
15649 RExC_size += 1 + extra_size;
15652 if (RExC_emit >= RExC_emit_bound)
15653 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15654 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15656 NODE_ALIGN_FILL(ret);
15657 #ifndef RE_TRACK_PATTERN_OFFSETS
15658 PERL_UNUSED_ARG(name);
15660 if (RExC_offsets) { /* MJD */
15662 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15665 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15666 ? "Overwriting end of array!\n" : "OK",
15667 (UV)(RExC_emit - RExC_emit_start),
15668 (UV)(RExC_parse - RExC_start),
15669 (UV)RExC_offsets[0]));
15670 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15677 - reg_node - emit a node
15679 STATIC regnode * /* Location. */
15680 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15682 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15684 PERL_ARGS_ASSERT_REG_NODE;
15686 assert(regarglen[op] == 0);
15689 regnode *ptr = ret;
15690 FILL_ADVANCE_NODE(ptr, op);
15697 - reganode - emit a node with an argument
15699 STATIC regnode * /* Location. */
15700 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15702 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15704 PERL_ARGS_ASSERT_REGANODE;
15706 assert(regarglen[op] == 1);
15709 regnode *ptr = ret;
15710 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15717 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15719 /* emit a node with U32 and I32 arguments */
15721 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15723 PERL_ARGS_ASSERT_REG2LANODE;
15725 assert(regarglen[op] == 2);
15728 regnode *ptr = ret;
15729 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15736 - reguni - emit (if appropriate) a Unicode character
15738 PERL_STATIC_INLINE STRLEN
15739 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15741 PERL_ARGS_ASSERT_REGUNI;
15743 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15747 - reginsert - insert an operator in front of already-emitted operand
15749 * Means relocating the operand.
15752 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15757 const int offset = regarglen[(U8)op];
15758 const int size = NODE_STEP_REGNODE + offset;
15759 GET_RE_DEBUG_FLAGS_DECL;
15761 PERL_ARGS_ASSERT_REGINSERT;
15762 PERL_UNUSED_CONTEXT;
15763 PERL_UNUSED_ARG(depth);
15764 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15765 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15774 if (RExC_open_parens) {
15776 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15777 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15778 if ( RExC_open_parens[paren] >= opnd ) {
15779 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15780 RExC_open_parens[paren] += size;
15782 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15784 if ( RExC_close_parens[paren] >= opnd ) {
15785 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15786 RExC_close_parens[paren] += size;
15788 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15793 while (src > opnd) {
15794 StructCopy(--src, --dst, regnode);
15795 #ifdef RE_TRACK_PATTERN_OFFSETS
15796 if (RExC_offsets) { /* MJD 20010112 */
15798 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15802 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15803 ? "Overwriting end of array!\n" : "OK",
15804 (UV)(src - RExC_emit_start),
15805 (UV)(dst - RExC_emit_start),
15806 (UV)RExC_offsets[0]));
15807 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15808 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15814 place = opnd; /* Op node, where operand used to be. */
15815 #ifdef RE_TRACK_PATTERN_OFFSETS
15816 if (RExC_offsets) { /* MJD */
15818 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15822 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15823 ? "Overwriting end of array!\n" : "OK",
15824 (UV)(place - RExC_emit_start),
15825 (UV)(RExC_parse - RExC_start),
15826 (UV)RExC_offsets[0]));
15827 Set_Node_Offset(place, RExC_parse);
15828 Set_Node_Length(place, 1);
15831 src = NEXTOPER(place);
15832 FILL_ADVANCE_NODE(place, op);
15833 Zero(src, offset, regnode);
15837 - regtail - set the next-pointer at the end of a node chain of p to val.
15838 - SEE ALSO: regtail_study
15840 /* TODO: All three parms should be const */
15842 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15843 const regnode *val,U32 depth)
15846 GET_RE_DEBUG_FLAGS_DECL;
15848 PERL_ARGS_ASSERT_REGTAIL;
15850 PERL_UNUSED_ARG(depth);
15856 /* Find last node. */
15859 regnode * const temp = regnext(scan);
15861 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15862 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15863 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15864 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15865 (temp == NULL ? "->" : ""),
15866 (temp == NULL ? PL_reg_name[OP(val)] : "")
15874 if (reg_off_by_arg[OP(scan)]) {
15875 ARG_SET(scan, val - scan);
15878 NEXT_OFF(scan) = val - scan;
15884 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15885 - Look for optimizable sequences at the same time.
15886 - currently only looks for EXACT chains.
15888 This is experimental code. The idea is to use this routine to perform
15889 in place optimizations on branches and groups as they are constructed,
15890 with the long term intention of removing optimization from study_chunk so
15891 that it is purely analytical.
15893 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15894 to control which is which.
15897 /* TODO: All four parms should be const */
15900 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15901 const regnode *val,U32 depth)
15905 #ifdef EXPERIMENTAL_INPLACESCAN
15908 GET_RE_DEBUG_FLAGS_DECL;
15910 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15916 /* Find last node. */
15920 regnode * const temp = regnext(scan);
15921 #ifdef EXPERIMENTAL_INPLACESCAN
15922 if (PL_regkind[OP(scan)] == EXACT) {
15923 bool unfolded_multi_char; /* Unexamined in this routine */
15924 if (join_exact(pRExC_state, scan, &min,
15925 &unfolded_multi_char, 1, val, depth+1))
15930 switch (OP(scan)) {
15933 case EXACTFA_NO_TRIE:
15938 if( exact == PSEUDO )
15940 else if ( exact != OP(scan) )
15949 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15950 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15951 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15952 SvPV_nolen_const(RExC_mysv),
15953 REG_NODE_NUM(scan),
15954 PL_reg_name[exact]);
15961 DEBUG_PARSE_MSG("");
15962 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
15963 PerlIO_printf(Perl_debug_log,
15964 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15965 SvPV_nolen_const(RExC_mysv),
15966 (IV)REG_NODE_NUM(val),
15970 if (reg_off_by_arg[OP(scan)]) {
15971 ARG_SET(scan, val - scan);
15974 NEXT_OFF(scan) = val - scan;
15982 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15987 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15992 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15994 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15995 if (flags & (1<<bit)) {
15996 if (!set++ && lead)
15997 PerlIO_printf(Perl_debug_log, "%s",lead);
15998 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16003 PerlIO_printf(Perl_debug_log, "\n");
16005 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16010 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16016 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16018 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16019 if (flags & (1<<bit)) {
16020 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16023 if (!set++ && lead)
16024 PerlIO_printf(Perl_debug_log, "%s",lead);
16025 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16028 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16029 if (!set++ && lead) {
16030 PerlIO_printf(Perl_debug_log, "%s",lead);
16033 case REGEX_UNICODE_CHARSET:
16034 PerlIO_printf(Perl_debug_log, "UNICODE");
16036 case REGEX_LOCALE_CHARSET:
16037 PerlIO_printf(Perl_debug_log, "LOCALE");
16039 case REGEX_ASCII_RESTRICTED_CHARSET:
16040 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16042 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16043 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16046 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16052 PerlIO_printf(Perl_debug_log, "\n");
16054 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16060 Perl_regdump(pTHX_ const regexp *r)
16063 SV * const sv = sv_newmortal();
16064 SV *dsv= sv_newmortal();
16065 RXi_GET_DECL(r,ri);
16066 GET_RE_DEBUG_FLAGS_DECL;
16068 PERL_ARGS_ASSERT_REGDUMP;
16070 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16072 /* Header fields of interest. */
16073 if (r->anchored_substr) {
16074 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16075 RE_SV_DUMPLEN(r->anchored_substr), 30);
16076 PerlIO_printf(Perl_debug_log,
16077 "anchored %s%s at %"IVdf" ",
16078 s, RE_SV_TAIL(r->anchored_substr),
16079 (IV)r->anchored_offset);
16080 } else if (r->anchored_utf8) {
16081 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16082 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16083 PerlIO_printf(Perl_debug_log,
16084 "anchored utf8 %s%s at %"IVdf" ",
16085 s, RE_SV_TAIL(r->anchored_utf8),
16086 (IV)r->anchored_offset);
16088 if (r->float_substr) {
16089 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16090 RE_SV_DUMPLEN(r->float_substr), 30);
16091 PerlIO_printf(Perl_debug_log,
16092 "floating %s%s at %"IVdf"..%"UVuf" ",
16093 s, RE_SV_TAIL(r->float_substr),
16094 (IV)r->float_min_offset, (UV)r->float_max_offset);
16095 } else if (r->float_utf8) {
16096 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16097 RE_SV_DUMPLEN(r->float_utf8), 30);
16098 PerlIO_printf(Perl_debug_log,
16099 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16100 s, RE_SV_TAIL(r->float_utf8),
16101 (IV)r->float_min_offset, (UV)r->float_max_offset);
16103 if (r->check_substr || r->check_utf8)
16104 PerlIO_printf(Perl_debug_log,
16106 (r->check_substr == r->float_substr
16107 && r->check_utf8 == r->float_utf8
16108 ? "(checking floating" : "(checking anchored"));
16109 if (r->intflags & PREGf_NOSCAN)
16110 PerlIO_printf(Perl_debug_log, " noscan");
16111 if (r->extflags & RXf_CHECK_ALL)
16112 PerlIO_printf(Perl_debug_log, " isall");
16113 if (r->check_substr || r->check_utf8)
16114 PerlIO_printf(Perl_debug_log, ") ");
16116 if (ri->regstclass) {
16117 regprop(r, sv, ri->regstclass, NULL, NULL);
16118 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16120 if (r->intflags & PREGf_ANCH) {
16121 PerlIO_printf(Perl_debug_log, "anchored");
16122 if (r->intflags & PREGf_ANCH_MBOL)
16123 PerlIO_printf(Perl_debug_log, "(MBOL)");
16124 if (r->intflags & PREGf_ANCH_SBOL)
16125 PerlIO_printf(Perl_debug_log, "(SBOL)");
16126 if (r->intflags & PREGf_ANCH_GPOS)
16127 PerlIO_printf(Perl_debug_log, "(GPOS)");
16128 PerlIO_putc(Perl_debug_log, ' ');
16130 if (r->intflags & PREGf_GPOS_SEEN)
16131 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16132 if (r->intflags & PREGf_SKIP)
16133 PerlIO_printf(Perl_debug_log, "plus ");
16134 if (r->intflags & PREGf_IMPLICIT)
16135 PerlIO_printf(Perl_debug_log, "implicit ");
16136 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16137 if (r->extflags & RXf_EVAL_SEEN)
16138 PerlIO_printf(Perl_debug_log, "with eval ");
16139 PerlIO_printf(Perl_debug_log, "\n");
16141 regdump_extflags("r->extflags: ",r->extflags);
16142 regdump_intflags("r->intflags: ",r->intflags);
16145 PERL_ARGS_ASSERT_REGDUMP;
16146 PERL_UNUSED_CONTEXT;
16147 PERL_UNUSED_ARG(r);
16148 #endif /* DEBUGGING */
16152 - regprop - printable representation of opcode, with run time support
16156 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16161 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16162 static const char * const anyofs[] = {
16163 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16164 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16165 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16166 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16167 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16168 || _CC_VERTSPACE != 16
16169 #error Need to adjust order of anyofs[]
16206 RXi_GET_DECL(prog,progi);
16207 GET_RE_DEBUG_FLAGS_DECL;
16209 PERL_ARGS_ASSERT_REGPROP;
16211 sv_setpvn(sv, "", 0);
16213 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16214 /* It would be nice to FAIL() here, but this may be called from
16215 regexec.c, and it would be hard to supply pRExC_state. */
16216 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16217 (int)OP(o), (int)REGNODE_MAX);
16218 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16220 k = PL_regkind[OP(o)];
16223 sv_catpvs(sv, " ");
16224 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16225 * is a crude hack but it may be the best for now since
16226 * we have no flag "this EXACTish node was UTF-8"
16228 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16229 PERL_PV_ESCAPE_UNI_DETECT |
16230 PERL_PV_ESCAPE_NONASCII |
16231 PERL_PV_PRETTY_ELLIPSES |
16232 PERL_PV_PRETTY_LTGT |
16233 PERL_PV_PRETTY_NOCLEAR
16235 } else if (k == TRIE) {
16236 /* print the details of the trie in dumpuntil instead, as
16237 * progi->data isn't available here */
16238 const char op = OP(o);
16239 const U32 n = ARG(o);
16240 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16241 (reg_ac_data *)progi->data->data[n] :
16243 const reg_trie_data * const trie
16244 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16246 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16247 DEBUG_TRIE_COMPILE_r(
16248 Perl_sv_catpvf(aTHX_ sv,
16249 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16250 (UV)trie->startstate,
16251 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16252 (UV)trie->wordcount,
16255 (UV)TRIE_CHARCOUNT(trie),
16256 (UV)trie->uniquecharcount
16259 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16260 sv_catpvs(sv, "[");
16261 (void) put_charclass_bitmap_innards(sv,
16262 (IS_ANYOF_TRIE(op))
16264 : TRIE_BITMAP(trie),
16266 sv_catpvs(sv, "]");
16269 } else if (k == CURLY) {
16270 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16271 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16272 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16274 else if (k == WHILEM && o->flags) /* Ordinal/of */
16275 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16276 else if (k == REF || k == OPEN || k == CLOSE
16277 || k == GROUPP || OP(o)==ACCEPT)
16279 AV *name_list= NULL;
16280 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16281 if ( RXp_PAREN_NAMES(prog) ) {
16282 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16283 } else if ( pRExC_state ) {
16284 name_list= RExC_paren_name_list;
16287 if ( k != REF || (OP(o) < NREF)) {
16288 SV **name= av_fetch(name_list, ARG(o), 0 );
16290 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16293 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16294 I32 *nums=(I32*)SvPVX(sv_dat);
16295 SV **name= av_fetch(name_list, nums[0], 0 );
16298 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16299 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16300 (n ? "," : ""), (IV)nums[n]);
16302 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16306 if ( k == REF && reginfo) {
16307 U32 n = ARG(o); /* which paren pair */
16308 I32 ln = prog->offs[n].start;
16309 if (prog->lastparen < n || ln == -1)
16310 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16311 else if (ln == prog->offs[n].end)
16312 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16314 const char *s = reginfo->strbeg + ln;
16315 Perl_sv_catpvf(aTHX_ sv, ": ");
16316 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16317 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16320 } else if (k == GOSUB) {
16321 AV *name_list= NULL;
16322 if ( RXp_PAREN_NAMES(prog) ) {
16323 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16324 } else if ( pRExC_state ) {
16325 name_list= RExC_paren_name_list;
16328 /* Paren and offset */
16329 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16331 SV **name= av_fetch(name_list, ARG(o), 0 );
16333 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16336 else if (k == VERB) {
16338 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16339 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16340 } else if (k == LOGICAL)
16341 /* 2: embedded, otherwise 1 */
16342 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16343 else if (k == ANYOF) {
16344 const U8 flags = ANYOF_FLAGS(o);
16346 SV* bitmap_invlist; /* Will hold what the bit map contains */
16349 if (flags & ANYOF_LOCALE_FLAGS)
16350 sv_catpvs(sv, "{loc}");
16351 if (flags & ANYOF_LOC_FOLD)
16352 sv_catpvs(sv, "{i}");
16353 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16354 if (flags & ANYOF_INVERT)
16355 sv_catpvs(sv, "^");
16357 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16359 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16362 /* output any special charclass tests (used entirely under use
16364 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16366 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16367 if (ANYOF_POSIXL_TEST(o,i)) {
16368 sv_catpv(sv, anyofs[i]);
16374 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16375 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16376 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16380 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16381 if (flags & ANYOF_INVERT)
16382 /*make sure the invert info is in each */
16383 sv_catpvs(sv, "^");
16386 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16387 sv_catpvs(sv, "{non-utf8-latin1-all}");
16390 /* output information about the unicode matching */
16391 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16392 sv_catpvs(sv, "{above_bitmap_all}");
16393 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16394 SV *lv; /* Set if there is something outside the bit map. */
16395 bool byte_output = FALSE; /* If something in the bitmap has
16397 SV *only_utf8_locale;
16399 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16400 * is used to guarantee that nothing in the bitmap gets
16402 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16403 &lv, &only_utf8_locale,
16405 if (lv && lv != &PL_sv_undef) {
16406 char *s = savesvpv(lv);
16407 char * const origs = s;
16409 while (*s && *s != '\n')
16413 const char * const t = ++s;
16415 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16416 sv_catpvs(sv, "{outside bitmap}");
16419 sv_catpvs(sv, "{utf8}");
16423 sv_catpvs(sv, " ");
16429 /* Truncate very long output */
16430 if (s - origs > 256) {
16431 Perl_sv_catpvf(aTHX_ sv,
16433 (int) (s - origs - 1),
16439 else if (*s == '\t') {
16453 SvREFCNT_dec_NN(lv);
16456 if ((flags & ANYOF_LOC_FOLD)
16457 && only_utf8_locale
16458 && only_utf8_locale != &PL_sv_undef)
16461 int max_entries = 256;
16463 sv_catpvs(sv, "{utf8 locale}");
16464 invlist_iterinit(only_utf8_locale);
16465 while (invlist_iternext(only_utf8_locale,
16467 put_range(sv, start, end, FALSE);
16469 if (max_entries < 0) {
16470 sv_catpvs(sv, "...");
16474 invlist_iterfinish(only_utf8_locale);
16478 SvREFCNT_dec(bitmap_invlist);
16481 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16483 else if (k == POSIXD || k == NPOSIXD) {
16484 U8 index = FLAGS(o) * 2;
16485 if (index < C_ARRAY_LENGTH(anyofs)) {
16486 if (*anyofs[index] != '[') {
16489 sv_catpv(sv, anyofs[index]);
16490 if (*anyofs[index] != '[') {
16495 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16498 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16499 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16500 else if (OP(o) == SBOL)
16501 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16503 PERL_UNUSED_CONTEXT;
16504 PERL_UNUSED_ARG(sv);
16505 PERL_UNUSED_ARG(o);
16506 PERL_UNUSED_ARG(prog);
16507 PERL_UNUSED_ARG(reginfo);
16508 PERL_UNUSED_ARG(pRExC_state);
16509 #endif /* DEBUGGING */
16515 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16516 { /* Assume that RE_INTUIT is set */
16517 struct regexp *const prog = ReANY(r);
16518 GET_RE_DEBUG_FLAGS_DECL;
16520 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16521 PERL_UNUSED_CONTEXT;
16525 const char * const s = SvPV_nolen_const(prog->check_substr
16526 ? prog->check_substr : prog->check_utf8);
16528 if (!PL_colorset) reginitcolors();
16529 PerlIO_printf(Perl_debug_log,
16530 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16532 prog->check_substr ? "" : "utf8 ",
16533 PL_colors[5],PL_colors[0],
16536 (strlen(s) > 60 ? "..." : ""));
16539 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16545 handles refcounting and freeing the perl core regexp structure. When
16546 it is necessary to actually free the structure the first thing it
16547 does is call the 'free' method of the regexp_engine associated to
16548 the regexp, allowing the handling of the void *pprivate; member
16549 first. (This routine is not overridable by extensions, which is why
16550 the extensions free is called first.)
16552 See regdupe and regdupe_internal if you change anything here.
16554 #ifndef PERL_IN_XSUB_RE
16556 Perl_pregfree(pTHX_ REGEXP *r)
16562 Perl_pregfree2(pTHX_ REGEXP *rx)
16564 struct regexp *const r = ReANY(rx);
16565 GET_RE_DEBUG_FLAGS_DECL;
16567 PERL_ARGS_ASSERT_PREGFREE2;
16569 if (r->mother_re) {
16570 ReREFCNT_dec(r->mother_re);
16572 CALLREGFREE_PVT(rx); /* free the private data */
16573 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16574 Safefree(r->xpv_len_u.xpvlenu_pv);
16577 SvREFCNT_dec(r->anchored_substr);
16578 SvREFCNT_dec(r->anchored_utf8);
16579 SvREFCNT_dec(r->float_substr);
16580 SvREFCNT_dec(r->float_utf8);
16581 Safefree(r->substrs);
16583 RX_MATCH_COPY_FREE(rx);
16584 #ifdef PERL_ANY_COW
16585 SvREFCNT_dec(r->saved_copy);
16588 SvREFCNT_dec(r->qr_anoncv);
16589 rx->sv_u.svu_rx = 0;
16594 This is a hacky workaround to the structural issue of match results
16595 being stored in the regexp structure which is in turn stored in
16596 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16597 could be PL_curpm in multiple contexts, and could require multiple
16598 result sets being associated with the pattern simultaneously, such
16599 as when doing a recursive match with (??{$qr})
16601 The solution is to make a lightweight copy of the regexp structure
16602 when a qr// is returned from the code executed by (??{$qr}) this
16603 lightweight copy doesn't actually own any of its data except for
16604 the starp/end and the actual regexp structure itself.
16610 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16612 struct regexp *ret;
16613 struct regexp *const r = ReANY(rx);
16614 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16616 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16619 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16621 SvOK_off((SV *)ret_x);
16623 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16624 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16625 made both spots point to the same regexp body.) */
16626 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16627 assert(!SvPVX(ret_x));
16628 ret_x->sv_u.svu_rx = temp->sv_any;
16629 temp->sv_any = NULL;
16630 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16631 SvREFCNT_dec_NN(temp);
16632 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16633 ing below will not set it. */
16634 SvCUR_set(ret_x, SvCUR(rx));
16637 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16638 sv_force_normal(sv) is called. */
16640 ret = ReANY(ret_x);
16642 SvFLAGS(ret_x) |= SvUTF8(rx);
16643 /* We share the same string buffer as the original regexp, on which we
16644 hold a reference count, incremented when mother_re is set below.
16645 The string pointer is copied here, being part of the regexp struct.
16647 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16648 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16650 const I32 npar = r->nparens+1;
16651 Newx(ret->offs, npar, regexp_paren_pair);
16652 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16655 Newx(ret->substrs, 1, struct reg_substr_data);
16656 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16658 SvREFCNT_inc_void(ret->anchored_substr);
16659 SvREFCNT_inc_void(ret->anchored_utf8);
16660 SvREFCNT_inc_void(ret->float_substr);
16661 SvREFCNT_inc_void(ret->float_utf8);
16663 /* check_substr and check_utf8, if non-NULL, point to either their
16664 anchored or float namesakes, and don't hold a second reference. */
16666 RX_MATCH_COPIED_off(ret_x);
16667 #ifdef PERL_ANY_COW
16668 ret->saved_copy = NULL;
16670 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16671 SvREFCNT_inc_void(ret->qr_anoncv);
16677 /* regfree_internal()
16679 Free the private data in a regexp. This is overloadable by
16680 extensions. Perl takes care of the regexp structure in pregfree(),
16681 this covers the *pprivate pointer which technically perl doesn't
16682 know about, however of course we have to handle the
16683 regexp_internal structure when no extension is in use.
16685 Note this is called before freeing anything in the regexp
16690 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16692 struct regexp *const r = ReANY(rx);
16693 RXi_GET_DECL(r,ri);
16694 GET_RE_DEBUG_FLAGS_DECL;
16696 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16702 SV *dsv= sv_newmortal();
16703 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16704 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16705 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16706 PL_colors[4],PL_colors[5],s);
16709 #ifdef RE_TRACK_PATTERN_OFFSETS
16711 Safefree(ri->u.offsets); /* 20010421 MJD */
16713 if (ri->code_blocks) {
16715 for (n = 0; n < ri->num_code_blocks; n++)
16716 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16717 Safefree(ri->code_blocks);
16721 int n = ri->data->count;
16724 /* If you add a ->what type here, update the comment in regcomp.h */
16725 switch (ri->data->what[n]) {
16731 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16734 Safefree(ri->data->data[n]);
16740 { /* Aho Corasick add-on structure for a trie node.
16741 Used in stclass optimization only */
16743 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16744 #ifdef USE_ITHREADS
16748 refcount = --aho->refcount;
16751 PerlMemShared_free(aho->states);
16752 PerlMemShared_free(aho->fail);
16753 /* do this last!!!! */
16754 PerlMemShared_free(ri->data->data[n]);
16755 /* we should only ever get called once, so
16756 * assert as much, and also guard the free
16757 * which /might/ happen twice. At the least
16758 * it will make code anlyzers happy and it
16759 * doesn't cost much. - Yves */
16760 assert(ri->regstclass);
16761 if (ri->regstclass) {
16762 PerlMemShared_free(ri->regstclass);
16763 ri->regstclass = 0;
16770 /* trie structure. */
16772 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16773 #ifdef USE_ITHREADS
16777 refcount = --trie->refcount;
16780 PerlMemShared_free(trie->charmap);
16781 PerlMemShared_free(trie->states);
16782 PerlMemShared_free(trie->trans);
16784 PerlMemShared_free(trie->bitmap);
16786 PerlMemShared_free(trie->jump);
16787 PerlMemShared_free(trie->wordinfo);
16788 /* do this last!!!! */
16789 PerlMemShared_free(ri->data->data[n]);
16794 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16795 ri->data->what[n]);
16798 Safefree(ri->data->what);
16799 Safefree(ri->data);
16805 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16806 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16807 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16810 re_dup - duplicate a regexp.
16812 This routine is expected to clone a given regexp structure. It is only
16813 compiled under USE_ITHREADS.
16815 After all of the core data stored in struct regexp is duplicated
16816 the regexp_engine.dupe method is used to copy any private data
16817 stored in the *pprivate pointer. This allows extensions to handle
16818 any duplication it needs to do.
16820 See pregfree() and regfree_internal() if you change anything here.
16822 #if defined(USE_ITHREADS)
16823 #ifndef PERL_IN_XSUB_RE
16825 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16829 const struct regexp *r = ReANY(sstr);
16830 struct regexp *ret = ReANY(dstr);
16832 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16834 npar = r->nparens+1;
16835 Newx(ret->offs, npar, regexp_paren_pair);
16836 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16838 if (ret->substrs) {
16839 /* Do it this way to avoid reading from *r after the StructCopy().
16840 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16841 cache, it doesn't matter. */
16842 const bool anchored = r->check_substr
16843 ? r->check_substr == r->anchored_substr
16844 : r->check_utf8 == r->anchored_utf8;
16845 Newx(ret->substrs, 1, struct reg_substr_data);
16846 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16848 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16849 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16850 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16851 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16853 /* check_substr and check_utf8, if non-NULL, point to either their
16854 anchored or float namesakes, and don't hold a second reference. */
16856 if (ret->check_substr) {
16858 assert(r->check_utf8 == r->anchored_utf8);
16859 ret->check_substr = ret->anchored_substr;
16860 ret->check_utf8 = ret->anchored_utf8;
16862 assert(r->check_substr == r->float_substr);
16863 assert(r->check_utf8 == r->float_utf8);
16864 ret->check_substr = ret->float_substr;
16865 ret->check_utf8 = ret->float_utf8;
16867 } else if (ret->check_utf8) {
16869 ret->check_utf8 = ret->anchored_utf8;
16871 ret->check_utf8 = ret->float_utf8;
16876 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16877 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16880 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16882 if (RX_MATCH_COPIED(dstr))
16883 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16885 ret->subbeg = NULL;
16886 #ifdef PERL_ANY_COW
16887 ret->saved_copy = NULL;
16890 /* Whether mother_re be set or no, we need to copy the string. We
16891 cannot refrain from copying it when the storage points directly to
16892 our mother regexp, because that's
16893 1: a buffer in a different thread
16894 2: something we no longer hold a reference on
16895 so we need to copy it locally. */
16896 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16897 ret->mother_re = NULL;
16899 #endif /* PERL_IN_XSUB_RE */
16904 This is the internal complement to regdupe() which is used to copy
16905 the structure pointed to by the *pprivate pointer in the regexp.
16906 This is the core version of the extension overridable cloning hook.
16907 The regexp structure being duplicated will be copied by perl prior
16908 to this and will be provided as the regexp *r argument, however
16909 with the /old/ structures pprivate pointer value. Thus this routine
16910 may override any copying normally done by perl.
16912 It returns a pointer to the new regexp_internal structure.
16916 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16919 struct regexp *const r = ReANY(rx);
16920 regexp_internal *reti;
16922 RXi_GET_DECL(r,ri);
16924 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16928 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16929 char, regexp_internal);
16930 Copy(ri->program, reti->program, len+1, regnode);
16932 reti->num_code_blocks = ri->num_code_blocks;
16933 if (ri->code_blocks) {
16935 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16936 struct reg_code_block);
16937 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16938 struct reg_code_block);
16939 for (n = 0; n < ri->num_code_blocks; n++)
16940 reti->code_blocks[n].src_regex = (REGEXP*)
16941 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16944 reti->code_blocks = NULL;
16946 reti->regstclass = NULL;
16949 struct reg_data *d;
16950 const int count = ri->data->count;
16953 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16954 char, struct reg_data);
16955 Newx(d->what, count, U8);
16958 for (i = 0; i < count; i++) {
16959 d->what[i] = ri->data->what[i];
16960 switch (d->what[i]) {
16961 /* see also regcomp.h and regfree_internal() */
16962 case 'a': /* actually an AV, but the dup function is identical. */
16966 case 'u': /* actually an HV, but the dup function is identical. */
16967 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16970 /* This is cheating. */
16971 Newx(d->data[i], 1, regnode_ssc);
16972 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16973 reti->regstclass = (regnode*)d->data[i];
16976 /* Trie stclasses are readonly and can thus be shared
16977 * without duplication. We free the stclass in pregfree
16978 * when the corresponding reg_ac_data struct is freed.
16980 reti->regstclass= ri->regstclass;
16984 ((reg_trie_data*)ri->data->data[i])->refcount++;
16989 d->data[i] = ri->data->data[i];
16992 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16993 ri->data->what[i]);
17002 reti->name_list_idx = ri->name_list_idx;
17004 #ifdef RE_TRACK_PATTERN_OFFSETS
17005 if (ri->u.offsets) {
17006 Newx(reti->u.offsets, 2*len+1, U32);
17007 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17010 SetProgLen(reti,len);
17013 return (void*)reti;
17016 #endif /* USE_ITHREADS */
17018 #ifndef PERL_IN_XSUB_RE
17021 - regnext - dig the "next" pointer out of a node
17024 Perl_regnext(pTHX_ regnode *p)
17031 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17032 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17033 (int)OP(p), (int)REGNODE_MAX);
17036 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17045 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17048 STRLEN l1 = strlen(pat1);
17049 STRLEN l2 = strlen(pat2);
17052 const char *message;
17054 PERL_ARGS_ASSERT_RE_CROAK2;
17060 Copy(pat1, buf, l1 , char);
17061 Copy(pat2, buf + l1, l2 , char);
17062 buf[l1 + l2] = '\n';
17063 buf[l1 + l2 + 1] = '\0';
17064 va_start(args, pat2);
17065 msv = vmess(buf, &args);
17067 message = SvPV_const(msv,l1);
17070 Copy(message, buf, l1 , char);
17071 /* l1-1 to avoid \n */
17072 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17076 /* Certain characters are output as a sequence with the first being a
17078 #define isBACKSLASHED_PUNCT(c) \
17079 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17082 S_put_code_point(pTHX_ SV *sv, UV c)
17084 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17087 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17089 else if (isPRINT(c)) {
17090 const char string = (char) c;
17091 if (isBACKSLASHED_PUNCT(c))
17092 sv_catpvs(sv, "\\");
17093 sv_catpvn(sv, &string, 1);
17096 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17098 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17101 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17106 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17109 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17111 /* Appends to 'sv' a displayable version of the range of code points from
17112 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17113 * as-is (though some of these will be escaped by put_code_point()). */
17115 const unsigned int min_range_count = 3;
17117 assert(start <= end);
17119 PERL_ARGS_ASSERT_PUT_RANGE;
17121 while (start <= end) {
17123 const char * format;
17125 if (end - start < min_range_count) {
17127 /* Individual chars in short ranges */
17128 for (; start <= end; start++) {
17129 put_code_point(sv, start);
17134 /* If permitted by the input options, and there is a possibility that
17135 * this range contains a printable literal, look to see if there is
17137 if (allow_literals && start <= MAX_PRINT_A) {
17139 /* If the range begin isn't an ASCII printable, effectively split
17140 * the range into two parts:
17141 * 1) the portion before the first such printable,
17143 * and output them separately. */
17144 if (! isPRINT_A(start)) {
17145 UV temp_end = start + 1;
17147 /* There is no point looking beyond the final possible
17148 * printable, in MAX_PRINT_A */
17149 UV max = MIN(end, MAX_PRINT_A);
17151 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17155 /* Here, temp_end points to one beyond the first printable if
17156 * found, or to one beyond 'max' if not. If none found, make
17157 * sure that we use the entire range */
17158 if (temp_end > MAX_PRINT_A) {
17159 temp_end = end + 1;
17162 /* Output the first part of the split range, the part that
17163 * doesn't have printables, with no looking for literals
17164 * (otherwise we would infinitely recurse) */
17165 put_range(sv, start, temp_end - 1, FALSE);
17167 /* The 2nd part of the range (if any) starts here. */
17170 /* We continue instead of dropping down because even if the 2nd
17171 * part is non-empty, it could be so short that we want to
17172 * output it specially, as tested for at the top of this loop.
17177 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17178 * output a sub-range of just the digits or letters, then process
17179 * the remaining portion as usual. */
17180 if (isALPHANUMERIC_A(start)) {
17181 UV mask = (isDIGIT_A(start))
17186 UV temp_end = start + 1;
17188 /* Find the end of the sub-range that includes just the
17189 * characters in the same class as the first character in it */
17190 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17195 /* For short ranges, don't duplicate the code above to output
17196 * them; just call recursively */
17197 if (temp_end - start < min_range_count) {
17198 put_range(sv, start, temp_end, FALSE);
17200 else { /* Output as a range */
17201 put_code_point(sv, start);
17202 sv_catpvs(sv, "-");
17203 put_code_point(sv, temp_end);
17205 start = temp_end + 1;
17209 /* We output any other printables as individual characters */
17210 if (isPUNCT_A(start) || isSPACE_A(start)) {
17211 while (start <= end && (isPUNCT_A(start)
17212 || isSPACE_A(start)))
17214 put_code_point(sv, start);
17219 } /* End of looking for literals */
17221 /* Here is not to output as a literal. Some control characters have
17222 * mnemonic names. Split off any of those at the beginning and end of
17223 * the range to print mnemonically. It isn't possible for many of
17224 * these to be in a row, so this won't overwhelm with output */
17225 while (isMNEMONIC_CNTRL(start) && start <= end) {
17226 put_code_point(sv, start);
17229 if (start < end && isMNEMONIC_CNTRL(end)) {
17231 /* Here, the final character in the range has a mnemonic name.
17232 * Work backwards from the end to find the final non-mnemonic */
17233 UV temp_end = end - 1;
17234 while (isMNEMONIC_CNTRL(temp_end)) {
17238 /* And separately output the range that doesn't have mnemonics */
17239 put_range(sv, start, temp_end, FALSE);
17241 /* Then output the mnemonic trailing controls */
17242 start = temp_end + 1;
17243 while (start <= end) {
17244 put_code_point(sv, start);
17250 /* As a final resort, output the range or subrange as hex. */
17252 this_end = (end < NUM_ANYOF_CODE_POINTS)
17254 : NUM_ANYOF_CODE_POINTS - 1;
17255 format = (this_end < 256)
17256 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17257 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17258 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17264 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17266 /* Appends to 'sv' a displayable version of the innards of the bracketed
17267 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17268 * output anything, and bitmap_invlist, if not NULL, will point to an
17269 * inversion list of what is in the bit map */
17273 unsigned int punct_count = 0;
17274 SV* invlist = NULL;
17275 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17276 bool allow_literals = TRUE;
17278 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17280 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17282 /* Worst case is exactly every-other code point is in the list */
17283 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17285 /* Convert the bit map to an inversion list, keeping track of how many
17286 * ASCII puncts are set, including an extra amount for the backslashed
17288 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17289 if (BITMAP_TEST(bitmap, i)) {
17290 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17291 if (isPUNCT_A(i)) {
17293 if isBACKSLASHED_PUNCT(i) {
17300 /* Nothing to output */
17301 if (_invlist_len(*invlist_ptr) == 0) {
17302 SvREFCNT_dec(invlist);
17306 /* Generally, it is more readable if printable characters are output as
17307 * literals, but if a range (nearly) spans all of them, it's best to output
17308 * it as a single range. This code will use a single range if all but 2
17309 * printables are in it */
17310 invlist_iterinit(*invlist_ptr);
17311 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17313 /* If range starts beyond final printable, it doesn't have any in it */
17314 if (start > MAX_PRINT_A) {
17318 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17319 * all but two, the range must start and end no later than 2 from
17321 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17322 if (end > MAX_PRINT_A) {
17328 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17329 allow_literals = FALSE;
17334 invlist_iterfinish(*invlist_ptr);
17336 /* The legibility of the output depends mostly on how many punctuation
17337 * characters are output. There are 32 possible ASCII ones, and some have
17338 * an additional backslash, bringing it to currently 36, so if any more
17339 * than 18 are to be output, we can instead output it as its complement,
17340 * yielding fewer puncts, and making it more legible. But give some weight
17341 * to the fact that outputting it as a complement is less legible than a
17342 * straight output, so don't complement unless we are somewhat over the 18
17344 if (allow_literals && punct_count > 22) {
17345 sv_catpvs(sv, "^");
17347 /* Add everything remaining to the list, so when we invert it just
17348 * below, it will be excluded */
17349 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17350 _invlist_invert(*invlist_ptr);
17353 /* Here we have figured things out. Output each range */
17354 invlist_iterinit(*invlist_ptr);
17355 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17356 if (start >= NUM_ANYOF_CODE_POINTS) {
17359 put_range(sv, start, end, allow_literals);
17361 invlist_iterfinish(*invlist_ptr);
17366 #define CLEAR_OPTSTART \
17367 if (optstart) STMT_START { \
17368 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17369 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17373 #define DUMPUNTIL(b,e) \
17375 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17377 STATIC const regnode *
17378 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17379 const regnode *last, const regnode *plast,
17380 SV* sv, I32 indent, U32 depth)
17382 U8 op = PSEUDO; /* Arbitrary non-END op. */
17383 const regnode *next;
17384 const regnode *optstart= NULL;
17386 RXi_GET_DECL(r,ri);
17387 GET_RE_DEBUG_FLAGS_DECL;
17389 PERL_ARGS_ASSERT_DUMPUNTIL;
17391 #ifdef DEBUG_DUMPUNTIL
17392 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17393 last ? last-start : 0,plast ? plast-start : 0);
17396 if (plast && plast < last)
17399 while (PL_regkind[op] != END && (!last || node < last)) {
17401 /* While that wasn't END last time... */
17404 if (op == CLOSE || op == WHILEM)
17406 next = regnext((regnode *)node);
17409 if (OP(node) == OPTIMIZED) {
17410 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17417 regprop(r, sv, node, NULL, NULL);
17418 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17419 (int)(2*indent + 1), "", SvPVX_const(sv));
17421 if (OP(node) != OPTIMIZED) {
17422 if (next == NULL) /* Next ptr. */
17423 PerlIO_printf(Perl_debug_log, " (0)");
17424 else if (PL_regkind[(U8)op] == BRANCH
17425 && PL_regkind[OP(next)] != BRANCH )
17426 PerlIO_printf(Perl_debug_log, " (FAIL)");
17428 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17429 (void)PerlIO_putc(Perl_debug_log, '\n');
17433 if (PL_regkind[(U8)op] == BRANCHJ) {
17436 const regnode *nnode = (OP(next) == LONGJMP
17437 ? regnext((regnode *)next)
17439 if (last && nnode > last)
17441 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17444 else if (PL_regkind[(U8)op] == BRANCH) {
17446 DUMPUNTIL(NEXTOPER(node), next);
17448 else if ( PL_regkind[(U8)op] == TRIE ) {
17449 const regnode *this_trie = node;
17450 const char op = OP(node);
17451 const U32 n = ARG(node);
17452 const reg_ac_data * const ac = op>=AHOCORASICK ?
17453 (reg_ac_data *)ri->data->data[n] :
17455 const reg_trie_data * const trie =
17456 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17458 AV *const trie_words
17459 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17461 const regnode *nextbranch= NULL;
17464 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17465 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17467 PerlIO_printf(Perl_debug_log, "%*s%s ",
17468 (int)(2*(indent+3)), "",
17470 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17471 SvCUR(*elem_ptr), 60,
17472 PL_colors[0], PL_colors[1],
17474 ? PERL_PV_ESCAPE_UNI
17476 | PERL_PV_PRETTY_ELLIPSES
17477 | PERL_PV_PRETTY_LTGT
17482 U16 dist= trie->jump[word_idx+1];
17483 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17484 (UV)((dist ? this_trie + dist : next) - start));
17487 nextbranch= this_trie + trie->jump[0];
17488 DUMPUNTIL(this_trie + dist, nextbranch);
17490 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17491 nextbranch= regnext((regnode *)nextbranch);
17493 PerlIO_printf(Perl_debug_log, "\n");
17496 if (last && next > last)
17501 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17502 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17503 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17505 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17507 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17509 else if ( op == PLUS || op == STAR) {
17510 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17512 else if (PL_regkind[(U8)op] == ANYOF) {
17513 /* arglen 1 + class block */
17514 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17515 ? ANYOF_POSIXL_SKIP
17517 node = NEXTOPER(node);
17519 else if (PL_regkind[(U8)op] == EXACT) {
17520 /* Literal string, where present. */
17521 node += NODE_SZ_STR(node) - 1;
17522 node = NEXTOPER(node);
17525 node = NEXTOPER(node);
17526 node += regarglen[(U8)op];
17528 if (op == CURLYX || op == OPEN)
17532 #ifdef DEBUG_DUMPUNTIL
17533 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17538 #endif /* DEBUGGING */
17542 * c-indentation-style: bsd
17543 * c-basic-offset: 4
17544 * indent-tabs-mode: nil
17547 * ex: set ts=8 sts=4 sw=4 et: