5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
109 struct RExC_state_t {
110 U32 flags; /* RXf_* are we folding, multilining? */
111 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
112 char *precomp; /* uncompiled string. */
113 REGEXP *rx_sv; /* The SV that is the regexp. */
114 regexp *rx; /* perl core regexp structure */
115 regexp_internal *rxi; /* internal data for regexp object
117 char *start; /* Start of input for compile */
118 char *end; /* End of input for compile */
119 char *parse; /* Input-scan pointer. */
120 SSize_t whilem_seen; /* number of WHILEM in this expr */
121 regnode *emit_start; /* Start of emitted-code area */
122 regnode *emit_bound; /* First regnode outside of the
124 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
125 implies compiling, so don't emit */
126 regnode_ssc emit_dummy; /* placeholder for emit to point to;
127 large enough for the largest
128 non-EXACTish node, so can use it as
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 SSize_t size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN) plus
135 one. ("par" 0 is the whole
137 I32 nestroot; /* root parens we are in - used by
141 regnode **open_parens; /* pointers to open parens */
142 regnode **close_parens; /* pointers to close parens */
143 regnode *opend; /* END node in program */
144 I32 utf8; /* whether the pattern is utf8 or not */
145 I32 orig_utf8; /* whether the pattern was originally in utf8 */
146 /* XXX use this for future optimisation of case
147 * where pattern must be upgraded to utf8. */
148 I32 uni_semantics; /* If a d charset modifier should use unicode
149 rules, even if the pattern is not in
151 HV *paren_names; /* Paren names */
153 regnode **recurse; /* Recurse regops */
154 I32 recurse_count; /* Number of recurse regops */
155 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
157 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
167 SSize_t maxlen; /* mininum possible number of chars in string to match */
168 #ifdef ADD_TO_REGEXEC
169 char *starttry; /* -Dr: where regtry was called. */
170 #define RExC_starttry (pRExC_state->starttry)
172 SV *runtime_code_qr; /* qr with the runtime code blocks */
174 const char *lastparse;
176 AV *paren_name_list; /* idx -> name */
177 U32 study_chunk_recursed_count;
178 #define RExC_lastparse (pRExC_state->lastparse)
179 #define RExC_lastnum (pRExC_state->lastnum)
180 #define RExC_paren_name_list (pRExC_state->paren_name_list)
181 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
185 #define RExC_flags (pRExC_state->flags)
186 #define RExC_pm_flags (pRExC_state->pm_flags)
187 #define RExC_precomp (pRExC_state->precomp)
188 #define RExC_rx_sv (pRExC_state->rx_sv)
189 #define RExC_rx (pRExC_state->rx)
190 #define RExC_rxi (pRExC_state->rxi)
191 #define RExC_start (pRExC_state->start)
192 #define RExC_end (pRExC_state->end)
193 #define RExC_parse (pRExC_state->parse)
194 #define RExC_whilem_seen (pRExC_state->whilem_seen)
195 #ifdef RE_TRACK_PATTERN_OFFSETS
196 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
199 #define RExC_emit (pRExC_state->emit)
200 #define RExC_emit_dummy (pRExC_state->emit_dummy)
201 #define RExC_emit_start (pRExC_state->emit_start)
202 #define RExC_emit_bound (pRExC_state->emit_bound)
203 #define RExC_naughty (pRExC_state->naughty)
204 #define RExC_sawback (pRExC_state->sawback)
205 #define RExC_seen (pRExC_state->seen)
206 #define RExC_size (pRExC_state->size)
207 #define RExC_maxlen (pRExC_state->maxlen)
208 #define RExC_npar (pRExC_state->npar)
209 #define RExC_nestroot (pRExC_state->nestroot)
210 #define RExC_extralen (pRExC_state->extralen)
211 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
212 #define RExC_utf8 (pRExC_state->utf8)
213 #define RExC_uni_semantics (pRExC_state->uni_semantics)
214 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
215 #define RExC_open_parens (pRExC_state->open_parens)
216 #define RExC_close_parens (pRExC_state->close_parens)
217 #define RExC_opend (pRExC_state->opend)
218 #define RExC_paren_names (pRExC_state->paren_names)
219 #define RExC_recurse (pRExC_state->recurse)
220 #define RExC_recurse_count (pRExC_state->recurse_count)
221 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
222 #define RExC_study_chunk_recursed_bytes \
223 (pRExC_state->study_chunk_recursed_bytes)
224 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
225 #define RExC_contains_locale (pRExC_state->contains_locale)
226 #define RExC_contains_i (pRExC_state->contains_i)
227 #define RExC_override_recoding (pRExC_state->override_recoding)
228 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
231 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
232 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
233 ((*s) == '{' && regcurly(s)))
236 * Flags to be passed up and down.
238 #define WORST 0 /* Worst case. */
239 #define HASWIDTH 0x01 /* Known to match non-null strings. */
241 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
242 * character. (There needs to be a case: in the switch statement in regexec.c
243 * for any node marked SIMPLE.) Note that this is not the same thing as
246 #define SPSTART 0x04 /* Starts with * or + */
247 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
248 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
249 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
251 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
253 /* whether trie related optimizations are enabled */
254 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
255 #define TRIE_STUDY_OPT
256 #define FULL_TRIE_STUDY
262 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
263 #define PBITVAL(paren) (1 << ((paren) & 7))
264 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
265 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
266 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
268 #define REQUIRE_UTF8 STMT_START { \
270 *flagp = RESTART_UTF8; \
275 /* This converts the named class defined in regcomp.h to its equivalent class
276 * number defined in handy.h. */
277 #define namedclass_to_classnum(class) ((int) ((class) / 2))
278 #define classnum_to_namedclass(classnum) ((classnum) * 2)
280 #define _invlist_union_complement_2nd(a, b, output) \
281 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
282 #define _invlist_intersection_complement_2nd(a, b, output) \
283 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
285 /* About scan_data_t.
287 During optimisation we recurse through the regexp program performing
288 various inplace (keyhole style) optimisations. In addition study_chunk
289 and scan_commit populate this data structure with information about
290 what strings MUST appear in the pattern. We look for the longest
291 string that must appear at a fixed location, and we look for the
292 longest string that may appear at a floating location. So for instance
297 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
298 strings (because they follow a .* construct). study_chunk will identify
299 both FOO and BAR as being the longest fixed and floating strings respectively.
301 The strings can be composites, for instance
305 will result in a composite fixed substring 'foo'.
307 For each string some basic information is maintained:
309 - offset or min_offset
310 This is the position the string must appear at, or not before.
311 It also implicitly (when combined with minlenp) tells us how many
312 characters must match before the string we are searching for.
313 Likewise when combined with minlenp and the length of the string it
314 tells us how many characters must appear after the string we have
318 Only used for floating strings. This is the rightmost point that
319 the string can appear at. If set to SSize_t_MAX it indicates that the
320 string can occur infinitely far to the right.
323 A pointer to the minimum number of characters of the pattern that the
324 string was found inside. This is important as in the case of positive
325 lookahead or positive lookbehind we can have multiple patterns
330 The minimum length of the pattern overall is 3, the minimum length
331 of the lookahead part is 3, but the minimum length of the part that
332 will actually match is 1. So 'FOO's minimum length is 3, but the
333 minimum length for the F is 1. This is important as the minimum length
334 is used to determine offsets in front of and behind the string being
335 looked for. Since strings can be composites this is the length of the
336 pattern at the time it was committed with a scan_commit. Note that
337 the length is calculated by study_chunk, so that the minimum lengths
338 are not known until the full pattern has been compiled, thus the
339 pointer to the value.
343 In the case of lookbehind the string being searched for can be
344 offset past the start point of the final matching string.
345 If this value was just blithely removed from the min_offset it would
346 invalidate some of the calculations for how many chars must match
347 before or after (as they are derived from min_offset and minlen and
348 the length of the string being searched for).
349 When the final pattern is compiled and the data is moved from the
350 scan_data_t structure into the regexp structure the information
351 about lookbehind is factored in, with the information that would
352 have been lost precalculated in the end_shift field for the
355 The fields pos_min and pos_delta are used to store the minimum offset
356 and the delta to the maximum offset at the current point in the pattern.
360 typedef struct scan_data_t {
361 /*I32 len_min; unused */
362 /*I32 len_delta; unused */
366 SSize_t last_end; /* min value, <0 unless valid. */
367 SSize_t last_start_min;
368 SSize_t last_start_max;
369 SV **longest; /* Either &l_fixed, or &l_float. */
370 SV *longest_fixed; /* longest fixed string found in pattern */
371 SSize_t offset_fixed; /* offset where it starts */
372 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
373 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
374 SV *longest_float; /* longest floating string found in pattern */
375 SSize_t offset_float_min; /* earliest point in string it can appear */
376 SSize_t offset_float_max; /* latest point in string it can appear */
377 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
378 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
381 SSize_t *last_closep;
382 regnode_ssc *start_class;
386 * Forward declarations for pregcomp()'s friends.
389 static const scan_data_t zero_scan_data =
390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
392 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
393 #define SF_BEFORE_SEOL 0x0001
394 #define SF_BEFORE_MEOL 0x0002
395 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
398 #define SF_FIX_SHIFT_EOL (+2)
399 #define SF_FL_SHIFT_EOL (+4)
401 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
404 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
406 #define SF_IS_INF 0x0040
407 #define SF_HAS_PAR 0x0080
408 #define SF_IN_PAR 0x0100
409 #define SF_HAS_EVAL 0x0200
410 #define SCF_DO_SUBSTR 0x0400
411 #define SCF_DO_STCLASS_AND 0x0800
412 #define SCF_DO_STCLASS_OR 0x1000
413 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
414 #define SCF_WHILEM_VISITED_POS 0x2000
416 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
417 #define SCF_SEEN_ACCEPT 0x8000
418 #define SCF_TRIE_DOING_RESTUDY 0x10000
420 #define UTF cBOOL(RExC_utf8)
422 /* The enums for all these are ordered so things work out correctly */
423 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
424 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
425 == REGEX_DEPENDS_CHARSET)
426 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
427 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
428 >= REGEX_UNICODE_CHARSET)
429 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
430 == REGEX_ASCII_RESTRICTED_CHARSET)
431 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
432 >= REGEX_ASCII_RESTRICTED_CHARSET)
433 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
434 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
436 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
438 /* For programs that want to be strictly Unicode compatible by dying if any
439 * attempt is made to match a non-Unicode code point against a Unicode
441 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
443 #define OOB_NAMEDCLASS -1
445 /* There is no code point that is out-of-bounds, so this is problematic. But
446 * its only current use is to initialize a variable that is always set before
448 #define OOB_UNICODE 0xDEADBEEF
450 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
451 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
454 /* length of regex to show in messages that don't mark a position within */
455 #define RegexLengthToShowInErrorMessages 127
458 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
459 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
460 * op/pragma/warn/regcomp.
462 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
463 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
465 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
466 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
468 #define REPORT_LOCATION_ARGS(offset) \
469 UTF8fARG(UTF, offset, RExC_precomp), \
470 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
473 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
474 * arg. Show regex, up to a maximum length. If it's too long, chop and add
477 #define _FAIL(code) STMT_START { \
478 const char *ellipses = ""; \
479 IV len = RExC_end - RExC_precomp; \
482 SAVEFREESV(RExC_rx_sv); \
483 if (len > RegexLengthToShowInErrorMessages) { \
484 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
485 len = RegexLengthToShowInErrorMessages - 10; \
491 #define FAIL(msg) _FAIL( \
492 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
493 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
495 #define FAIL2(msg,arg) _FAIL( \
496 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
497 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
500 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
502 #define Simple_vFAIL(m) STMT_START { \
504 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
505 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
506 m, REPORT_LOCATION_ARGS(offset)); \
510 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
512 #define vFAIL(m) STMT_START { \
514 SAVEFREESV(RExC_rx_sv); \
519 * Like Simple_vFAIL(), but accepts two arguments.
521 #define Simple_vFAIL2(m,a1) STMT_START { \
522 const IV offset = RExC_parse - RExC_precomp; \
523 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
524 REPORT_LOCATION_ARGS(offset)); \
528 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
530 #define vFAIL2(m,a1) STMT_START { \
532 SAVEFREESV(RExC_rx_sv); \
533 Simple_vFAIL2(m, a1); \
538 * Like Simple_vFAIL(), but accepts three arguments.
540 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
541 const IV offset = RExC_parse - RExC_precomp; \
542 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
543 REPORT_LOCATION_ARGS(offset)); \
547 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
549 #define vFAIL3(m,a1,a2) STMT_START { \
551 SAVEFREESV(RExC_rx_sv); \
552 Simple_vFAIL3(m, a1, a2); \
556 * Like Simple_vFAIL(), but accepts four arguments.
558 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
559 const IV offset = RExC_parse - RExC_precomp; \
560 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
561 REPORT_LOCATION_ARGS(offset)); \
564 #define vFAIL4(m,a1,a2,a3) STMT_START { \
566 SAVEFREESV(RExC_rx_sv); \
567 Simple_vFAIL4(m, a1, a2, a3); \
570 /* A specialized version of vFAIL2 that works with UTF8f */
571 #define vFAIL2utf8f(m, a1) STMT_START { \
572 const IV offset = RExC_parse - RExC_precomp; \
574 SAVEFREESV(RExC_rx_sv); \
575 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
576 REPORT_LOCATION_ARGS(offset)); \
579 /* These have asserts in them because of [perl #122671] Many warnings in
580 * regcomp.c can occur twice. If they get output in pass1 and later in that
581 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
582 * would get output again. So they should be output in pass2, and these
583 * asserts make sure new warnings follow that paradigm. */
585 /* m is not necessarily a "literal string", in this macro */
586 #define reg_warn_non_literal_string(loc, m) STMT_START { \
587 const IV offset = loc - RExC_precomp; \
588 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
589 m, REPORT_LOCATION_ARGS(offset)); \
592 #define ckWARNreg(loc,m) STMT_START { \
593 const IV offset = loc - RExC_precomp; \
594 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
595 REPORT_LOCATION_ARGS(offset)); \
598 #define vWARN_dep(loc, m) STMT_START { \
599 const IV offset = loc - RExC_precomp; \
600 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
601 REPORT_LOCATION_ARGS(offset)); \
604 #define ckWARNdep(loc,m) STMT_START { \
605 const IV offset = loc - RExC_precomp; \
606 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
608 REPORT_LOCATION_ARGS(offset)); \
611 #define ckWARNregdep(loc,m) STMT_START { \
612 const IV offset = loc - RExC_precomp; \
613 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
615 REPORT_LOCATION_ARGS(offset)); \
618 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
622 a1, REPORT_LOCATION_ARGS(offset)); \
625 #define ckWARN2reg(loc, m, a1) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
628 a1, REPORT_LOCATION_ARGS(offset)); \
631 #define vWARN3(loc, m, a1, a2) STMT_START { \
632 const IV offset = loc - RExC_precomp; \
633 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
634 a1, a2, REPORT_LOCATION_ARGS(offset)); \
637 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
638 const IV offset = loc - RExC_precomp; \
639 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
640 a1, a2, REPORT_LOCATION_ARGS(offset)); \
643 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
646 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
649 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
650 const IV offset = loc - RExC_precomp; \
651 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
652 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
655 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
656 const IV offset = loc - RExC_precomp; \
657 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
658 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
662 /* Allow for side effects in s */
663 #define REGC(c,s) STMT_START { \
664 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
667 /* Macros for recording node offsets. 20001227 mjd@plover.com
668 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
669 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
670 * Element 0 holds the number n.
671 * Position is 1 indexed.
673 #ifndef RE_TRACK_PATTERN_OFFSETS
674 #define Set_Node_Offset_To_R(node,byte)
675 #define Set_Node_Offset(node,byte)
676 #define Set_Cur_Node_Offset
677 #define Set_Node_Length_To_R(node,len)
678 #define Set_Node_Length(node,len)
679 #define Set_Node_Cur_Length(node,start)
680 #define Node_Offset(n)
681 #define Node_Length(n)
682 #define Set_Node_Offset_Length(node,offset,len)
683 #define ProgLen(ri) ri->u.proglen
684 #define SetProgLen(ri,x) ri->u.proglen = x
686 #define ProgLen(ri) ri->u.offsets[0]
687 #define SetProgLen(ri,x) ri->u.offsets[0] = x
688 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
690 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
691 __LINE__, (int)(node), (int)(byte))); \
693 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
696 RExC_offsets[2*(node)-1] = (byte); \
701 #define Set_Node_Offset(node,byte) \
702 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
703 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
705 #define Set_Node_Length_To_R(node,len) STMT_START { \
707 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
708 __LINE__, (int)(node), (int)(len))); \
710 Perl_croak(aTHX_ "value of node is %d in Length macro", \
713 RExC_offsets[2*(node)] = (len); \
718 #define Set_Node_Length(node,len) \
719 Set_Node_Length_To_R((node)-RExC_emit_start, len)
720 #define Set_Node_Cur_Length(node, start) \
721 Set_Node_Length(node, RExC_parse - start)
723 /* Get offsets and lengths */
724 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
725 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
727 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
728 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
729 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
733 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
734 #define EXPERIMENTAL_INPLACESCAN
735 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
737 #define DEBUG_RExC_seen() \
738 DEBUG_OPTIMISE_MORE_r({ \
739 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
741 if (RExC_seen & REG_ZERO_LEN_SEEN) \
742 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
744 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
745 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
747 if (RExC_seen & REG_GPOS_SEEN) \
748 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
750 if (RExC_seen & REG_CANY_SEEN) \
751 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
753 if (RExC_seen & REG_RECURSE_SEEN) \
754 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
756 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
757 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
759 if (RExC_seen & REG_VERBARG_SEEN) \
760 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
762 if (RExC_seen & REG_CUTGROUP_SEEN) \
763 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
765 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
766 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
768 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
769 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
771 if (RExC_seen & REG_GOSTART_SEEN) \
772 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
774 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
775 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
777 PerlIO_printf(Perl_debug_log,"\n"); \
780 #define DEBUG_STUDYDATA(str,data,depth) \
781 DEBUG_OPTIMISE_MORE_r(if(data){ \
782 PerlIO_printf(Perl_debug_log, \
783 "%*s" str "Pos:%"IVdf"/%"IVdf \
784 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
785 (int)(depth)*2, "", \
786 (IV)((data)->pos_min), \
787 (IV)((data)->pos_delta), \
788 (UV)((data)->flags), \
789 (IV)((data)->whilem_c), \
790 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
791 is_inf ? "INF " : "" \
793 if ((data)->last_found) \
794 PerlIO_printf(Perl_debug_log, \
795 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
796 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
797 SvPVX_const((data)->last_found), \
798 (IV)((data)->last_end), \
799 (IV)((data)->last_start_min), \
800 (IV)((data)->last_start_max), \
801 ((data)->longest && \
802 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
803 SvPVX_const((data)->longest_fixed), \
804 (IV)((data)->offset_fixed), \
805 ((data)->longest && \
806 (data)->longest==&((data)->longest_float)) ? "*" : "", \
807 SvPVX_const((data)->longest_float), \
808 (IV)((data)->offset_float_min), \
809 (IV)((data)->offset_float_max) \
811 PerlIO_printf(Perl_debug_log,"\n"); \
816 /* is c a control character for which we have a mnemonic? */
817 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
820 S_cntrl_to_mnemonic(const U8 c)
822 /* Returns the mnemonic string that represents character 'c', if one
823 * exists; NULL otherwise. The only ones that exist for the purposes of
824 * this routine are a few control characters */
827 case '\a': return "\\a";
828 case '\b': return "\\b";
829 case ESC_NATIVE: return "\\e";
830 case '\f': return "\\f";
831 case '\n': return "\\n";
832 case '\r': return "\\r";
833 case '\t': return "\\t";
841 /* Mark that we cannot extend a found fixed substring at this point.
842 Update the longest found anchored substring and the longest found
843 floating substrings if needed. */
846 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
847 SSize_t *minlenp, int is_inf)
849 const STRLEN l = CHR_SVLEN(data->last_found);
850 const STRLEN old_l = CHR_SVLEN(*data->longest);
851 GET_RE_DEBUG_FLAGS_DECL;
853 PERL_ARGS_ASSERT_SCAN_COMMIT;
855 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
856 SvSetMagicSV(*data->longest, data->last_found);
857 if (*data->longest == data->longest_fixed) {
858 data->offset_fixed = l ? data->last_start_min : data->pos_min;
859 if (data->flags & SF_BEFORE_EOL)
861 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
863 data->flags &= ~SF_FIX_BEFORE_EOL;
864 data->minlen_fixed=minlenp;
865 data->lookbehind_fixed=0;
867 else { /* *data->longest == data->longest_float */
868 data->offset_float_min = l ? data->last_start_min : data->pos_min;
869 data->offset_float_max = (l
870 ? data->last_start_max
871 : (data->pos_delta == SSize_t_MAX
873 : data->pos_min + data->pos_delta));
875 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
876 data->offset_float_max = SSize_t_MAX;
877 if (data->flags & SF_BEFORE_EOL)
879 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
881 data->flags &= ~SF_FL_BEFORE_EOL;
882 data->minlen_float=minlenp;
883 data->lookbehind_float=0;
886 SvCUR_set(data->last_found, 0);
888 SV * const sv = data->last_found;
889 if (SvUTF8(sv) && SvMAGICAL(sv)) {
890 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
896 data->flags &= ~SF_BEFORE_EOL;
897 DEBUG_STUDYDATA("commit: ",data,0);
900 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
901 * list that describes which code points it matches */
904 S_ssc_anything(pTHX_ regnode_ssc *ssc)
906 /* Set the SSC 'ssc' to match an empty string or any code point */
908 PERL_ARGS_ASSERT_SSC_ANYTHING;
910 assert(is_ANYOF_SYNTHETIC(ssc));
912 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
913 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
914 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
918 S_ssc_is_anything(const regnode_ssc *ssc)
920 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
921 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
922 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
923 * in any way, so there's no point in using it */
928 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
930 assert(is_ANYOF_SYNTHETIC(ssc));
932 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
936 /* See if the list consists solely of the range 0 - Infinity */
937 invlist_iterinit(ssc->invlist);
938 ret = invlist_iternext(ssc->invlist, &start, &end)
942 invlist_iterfinish(ssc->invlist);
948 /* If e.g., both \w and \W are set, matches everything */
949 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
951 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
952 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
962 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
964 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
965 * string, any code point, or any posix class under locale */
967 PERL_ARGS_ASSERT_SSC_INIT;
969 Zero(ssc, 1, regnode_ssc);
970 set_ANYOF_SYNTHETIC(ssc);
971 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
974 /* If any portion of the regex is to operate under locale rules,
975 * initialization includes it. The reason this isn't done for all regexes
976 * is that the optimizer was written under the assumption that locale was
977 * all-or-nothing. Given the complexity and lack of documentation in the
978 * optimizer, and that there are inadequate test cases for locale, many
979 * parts of it may not work properly, it is safest to avoid locale unless
981 if (RExC_contains_locale) {
982 ANYOF_POSIXL_SETALL(ssc);
985 ANYOF_POSIXL_ZERO(ssc);
990 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
991 const regnode_ssc *ssc)
993 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
994 * to the list of code points matched, and locale posix classes; hence does
995 * not check its flags) */
1000 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1002 assert(is_ANYOF_SYNTHETIC(ssc));
1004 invlist_iterinit(ssc->invlist);
1005 ret = invlist_iternext(ssc->invlist, &start, &end)
1009 invlist_iterfinish(ssc->invlist);
1015 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1023 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1024 const regnode_charclass* const node)
1026 /* Returns a mortal inversion list defining which code points are matched
1027 * by 'node', which is of type ANYOF. Handles complementing the result if
1028 * appropriate. If some code points aren't knowable at this time, the
1029 * returned list must, and will, contain every code point that is a
1032 SV* invlist = sv_2mortal(_new_invlist(0));
1033 SV* only_utf8_locale_invlist = NULL;
1035 const U32 n = ARG(node);
1036 bool new_node_has_latin1 = FALSE;
1038 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1040 /* Look at the data structure created by S_set_ANYOF_arg() */
1041 if (n != ANYOF_ONLY_HAS_BITMAP) {
1042 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1043 AV * const av = MUTABLE_AV(SvRV(rv));
1044 SV **const ary = AvARRAY(av);
1045 assert(RExC_rxi->data->what[n] == 's');
1047 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1048 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1050 else if (ary[0] && ary[0] != &PL_sv_undef) {
1052 /* Here, no compile-time swash, and there are things that won't be
1053 * known until runtime -- we have to assume it could be anything */
1054 return _add_range_to_invlist(invlist, 0, UV_MAX);
1056 else if (ary[3] && ary[3] != &PL_sv_undef) {
1058 /* Here no compile-time swash, and no run-time only data. Use the
1059 * node's inversion list */
1060 invlist = sv_2mortal(invlist_clone(ary[3]));
1063 /* Get the code points valid only under UTF-8 locales */
1064 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1065 && ary[2] && ary[2] != &PL_sv_undef)
1067 only_utf8_locale_invlist = ary[2];
1071 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1072 * code points, and an inversion list for the others, but if there are code
1073 * points that should match only conditionally on the target string being
1074 * UTF-8, those are placed in the inversion list, and not the bitmap.
1075 * Since there are circumstances under which they could match, they are
1076 * included in the SSC. But if the ANYOF node is to be inverted, we have
1077 * to exclude them here, so that when we invert below, the end result
1078 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1079 * have to do this here before we add the unconditionally matched code
1081 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1082 _invlist_intersection_complement_2nd(invlist,
1087 /* Add in the points from the bit map */
1088 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1089 if (ANYOF_BITMAP_TEST(node, i)) {
1090 invlist = add_cp_to_invlist(invlist, i);
1091 new_node_has_latin1 = TRUE;
1095 /* If this can match all upper Latin1 code points, have to add them
1097 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1098 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1101 /* Similarly for these */
1102 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1103 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1106 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1107 _invlist_invert(invlist);
1109 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1111 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1112 * locale. We can skip this if there are no 0-255 at all. */
1113 _invlist_union(invlist, PL_Latin1, &invlist);
1116 /* Similarly add the UTF-8 locale possible matches. These have to be
1117 * deferred until after the non-UTF-8 locale ones are taken care of just
1118 * above, or it leads to wrong results under ANYOF_INVERT */
1119 if (only_utf8_locale_invlist) {
1120 _invlist_union_maybe_complement_2nd(invlist,
1121 only_utf8_locale_invlist,
1122 ANYOF_FLAGS(node) & ANYOF_INVERT,
1129 /* These two functions currently do the exact same thing */
1130 #define ssc_init_zero ssc_init
1132 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1133 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1135 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1136 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1137 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1140 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1141 const regnode_charclass *and_with)
1143 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1144 * another SSC or a regular ANYOF class. Can create false positives. */
1149 PERL_ARGS_ASSERT_SSC_AND;
1151 assert(is_ANYOF_SYNTHETIC(ssc));
1153 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1154 * the code point inversion list and just the relevant flags */
1155 if (is_ANYOF_SYNTHETIC(and_with)) {
1156 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1157 anded_flags = ANYOF_FLAGS(and_with);
1159 /* XXX This is a kludge around what appears to be deficiencies in the
1160 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1161 * there are paths through the optimizer where it doesn't get weeded
1162 * out when it should. And if we don't make some extra provision for
1163 * it like the code just below, it doesn't get added when it should.
1164 * This solution is to add it only when AND'ing, which is here, and
1165 * only when what is being AND'ed is the pristine, original node
1166 * matching anything. Thus it is like adding it to ssc_anything() but
1167 * only when the result is to be AND'ed. Probably the same solution
1168 * could be adopted for the same problem we have with /l matching,
1169 * which is solved differently in S_ssc_init(), and that would lead to
1170 * fewer false positives than that solution has. But if this solution
1171 * creates bugs, the consequences are only that a warning isn't raised
1172 * that should be; while the consequences for having /l bugs is
1173 * incorrect matches */
1174 if (ssc_is_anything((regnode_ssc *)and_with)) {
1175 anded_flags |= ANYOF_WARN_SUPER;
1179 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1180 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1183 ANYOF_FLAGS(ssc) &= anded_flags;
1185 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1186 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1187 * 'and_with' may be inverted. When not inverted, we have the situation of
1189 * (C1 | P1) & (C2 | P2)
1190 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1191 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1192 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1193 * <= ((C1 & C2) | P1 | P2)
1194 * Alternatively, the last few steps could be:
1195 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1196 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1197 * <= (C1 | C2 | (P1 & P2))
1198 * We favor the second approach if either P1 or P2 is non-empty. This is
1199 * because these components are a barrier to doing optimizations, as what
1200 * they match cannot be known until the moment of matching as they are
1201 * dependent on the current locale, 'AND"ing them likely will reduce or
1203 * But we can do better if we know that C1,P1 are in their initial state (a
1204 * frequent occurrence), each matching everything:
1205 * (<everything>) & (C2 | P2) = C2 | P2
1206 * Similarly, if C2,P2 are in their initial state (again a frequent
1207 * occurrence), the result is a no-op
1208 * (C1 | P1) & (<everything>) = C1 | P1
1211 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1212 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1213 * <= (C1 & ~C2) | (P1 & ~P2)
1216 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1217 && ! is_ANYOF_SYNTHETIC(and_with))
1221 ssc_intersection(ssc,
1223 FALSE /* Has already been inverted */
1226 /* If either P1 or P2 is empty, the intersection will be also; can skip
1228 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1229 ANYOF_POSIXL_ZERO(ssc);
1231 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1233 /* Note that the Posix class component P from 'and_with' actually
1235 * P = Pa | Pb | ... | Pn
1236 * where each component is one posix class, such as in [\w\s].
1238 * ~P = ~(Pa | Pb | ... | Pn)
1239 * = ~Pa & ~Pb & ... & ~Pn
1240 * <= ~Pa | ~Pb | ... | ~Pn
1241 * The last is something we can easily calculate, but unfortunately
1242 * is likely to have many false positives. We could do better
1243 * in some (but certainly not all) instances if two classes in
1244 * P have known relationships. For example
1245 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1247 * :lower: & :print: = :lower:
1248 * And similarly for classes that must be disjoint. For example,
1249 * since \s and \w can have no elements in common based on rules in
1250 * the POSIX standard,
1251 * \w & ^\S = nothing
1252 * Unfortunately, some vendor locales do not meet the Posix
1253 * standard, in particular almost everything by Microsoft.
1254 * The loop below just changes e.g., \w into \W and vice versa */
1256 regnode_charclass_posixl temp;
1257 int add = 1; /* To calculate the index of the complement */
1259 ANYOF_POSIXL_ZERO(&temp);
1260 for (i = 0; i < ANYOF_MAX; i++) {
1262 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1263 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1265 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1266 ANYOF_POSIXL_SET(&temp, i + add);
1268 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1270 ANYOF_POSIXL_AND(&temp, ssc);
1272 } /* else ssc already has no posixes */
1273 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1274 in its initial state */
1275 else if (! is_ANYOF_SYNTHETIC(and_with)
1276 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1278 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1279 * copy it over 'ssc' */
1280 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1281 if (is_ANYOF_SYNTHETIC(and_with)) {
1282 StructCopy(and_with, ssc, regnode_ssc);
1285 ssc->invlist = anded_cp_list;
1286 ANYOF_POSIXL_ZERO(ssc);
1287 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1288 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1293 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1295 /* One or the other of P1, P2 is non-empty. */
1296 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1297 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1299 ssc_union(ssc, anded_cp_list, FALSE);
1301 else { /* P1 = P2 = empty */
1302 ssc_intersection(ssc, anded_cp_list, FALSE);
1308 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1309 const regnode_charclass *or_with)
1311 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1312 * another SSC or a regular ANYOF class. Can create false positives if
1313 * 'or_with' is to be inverted. */
1318 PERL_ARGS_ASSERT_SSC_OR;
1320 assert(is_ANYOF_SYNTHETIC(ssc));
1322 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1323 * the code point inversion list and just the relevant flags */
1324 if (is_ANYOF_SYNTHETIC(or_with)) {
1325 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1326 ored_flags = ANYOF_FLAGS(or_with);
1329 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1330 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1333 ANYOF_FLAGS(ssc) |= ored_flags;
1335 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1336 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1337 * 'or_with' may be inverted. When not inverted, we have the simple
1338 * situation of computing:
1339 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1340 * If P1|P2 yields a situation with both a class and its complement are
1341 * set, like having both \w and \W, this matches all code points, and we
1342 * can delete these from the P component of the ssc going forward. XXX We
1343 * might be able to delete all the P components, but I (khw) am not certain
1344 * about this, and it is better to be safe.
1347 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1348 * <= (C1 | P1) | ~C2
1349 * <= (C1 | ~C2) | P1
1350 * (which results in actually simpler code than the non-inverted case)
1353 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1354 && ! is_ANYOF_SYNTHETIC(or_with))
1356 /* We ignore P2, leaving P1 going forward */
1357 } /* else Not inverted */
1358 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1359 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1360 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1362 for (i = 0; i < ANYOF_MAX; i += 2) {
1363 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1365 ssc_match_all_cp(ssc);
1366 ANYOF_POSIXL_CLEAR(ssc, i);
1367 ANYOF_POSIXL_CLEAR(ssc, i+1);
1375 FALSE /* Already has been inverted */
1379 PERL_STATIC_INLINE void
1380 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1382 PERL_ARGS_ASSERT_SSC_UNION;
1384 assert(is_ANYOF_SYNTHETIC(ssc));
1386 _invlist_union_maybe_complement_2nd(ssc->invlist,
1392 PERL_STATIC_INLINE void
1393 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1395 const bool invert2nd)
1397 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1399 assert(is_ANYOF_SYNTHETIC(ssc));
1401 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1407 PERL_STATIC_INLINE void
1408 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1410 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1412 assert(is_ANYOF_SYNTHETIC(ssc));
1414 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1417 PERL_STATIC_INLINE void
1418 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1420 /* AND just the single code point 'cp' into the SSC 'ssc' */
1422 SV* cp_list = _new_invlist(2);
1424 PERL_ARGS_ASSERT_SSC_CP_AND;
1426 assert(is_ANYOF_SYNTHETIC(ssc));
1428 cp_list = add_cp_to_invlist(cp_list, cp);
1429 ssc_intersection(ssc, cp_list,
1430 FALSE /* Not inverted */
1432 SvREFCNT_dec_NN(cp_list);
1435 PERL_STATIC_INLINE void
1436 S_ssc_clear_locale(regnode_ssc *ssc)
1438 /* Set the SSC 'ssc' to not match any locale things */
1439 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1441 assert(is_ANYOF_SYNTHETIC(ssc));
1443 ANYOF_POSIXL_ZERO(ssc);
1444 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1447 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1450 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1452 /* The synthetic start class is used to hopefully quickly winnow down
1453 * places where a pattern could start a match in the target string. If it
1454 * doesn't really narrow things down that much, there isn't much point to
1455 * having the overhead of using it. This function uses some very crude
1456 * heuristics to decide if to use the ssc or not.
1458 * It returns TRUE if 'ssc' rules out more than half what it considers to
1459 * be the "likely" possible matches, but of course it doesn't know what the
1460 * actual things being matched are going to be; these are only guesses
1462 * For /l matches, it assumes that the only likely matches are going to be
1463 * in the 0-255 range, uniformly distributed, so half of that is 127
1464 * For /a and /d matches, it assumes that the likely matches will be just
1465 * the ASCII range, so half of that is 63
1466 * For /u and there isn't anything matching above the Latin1 range, it
1467 * assumes that that is the only range likely to be matched, and uses
1468 * half that as the cut-off: 127. If anything matches above Latin1,
1469 * it assumes that all of Unicode could match (uniformly), except for
1470 * non-Unicode code points and things in the General Category "Other"
1471 * (unassigned, private use, surrogates, controls and formats). This
1472 * is a much large number. */
1474 const U32 max_match = (LOC)
1478 : (invlist_highest(ssc->invlist) < 256)
1480 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1481 U32 count = 0; /* Running total of number of code points matched by
1483 UV start, end; /* Start and end points of current range in inversion
1486 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1488 invlist_iterinit(ssc->invlist);
1489 while (invlist_iternext(ssc->invlist, &start, &end)) {
1491 /* /u is the only thing that we expect to match above 255; so if not /u
1492 * and even if there are matches above 255, ignore them. This catches
1493 * things like \d under /d which does match the digits above 255, but
1494 * since the pattern is /d, it is not likely to be expecting them */
1495 if (! UNI_SEMANTICS) {
1499 end = MIN(end, 255);
1501 count += end - start + 1;
1502 if (count > max_match) {
1503 invlist_iterfinish(ssc->invlist);
1513 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1515 /* The inversion list in the SSC is marked mortal; now we need a more
1516 * permanent copy, which is stored the same way that is done in a regular
1517 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1520 SV* invlist = invlist_clone(ssc->invlist);
1522 PERL_ARGS_ASSERT_SSC_FINALIZE;
1524 assert(is_ANYOF_SYNTHETIC(ssc));
1526 /* The code in this file assumes that all but these flags aren't relevant
1527 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1528 * by the time we reach here */
1529 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1531 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1533 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1534 NULL, NULL, NULL, FALSE);
1536 /* Make sure is clone-safe */
1537 ssc->invlist = NULL;
1539 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1540 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1543 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1546 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1547 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1548 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1549 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1550 ? (TRIE_LIST_CUR( idx ) - 1) \
1556 dump_trie(trie,widecharmap,revcharmap)
1557 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1558 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1560 These routines dump out a trie in a somewhat readable format.
1561 The _interim_ variants are used for debugging the interim
1562 tables that are used to generate the final compressed
1563 representation which is what dump_trie expects.
1565 Part of the reason for their existence is to provide a form
1566 of documentation as to how the different representations function.
1571 Dumps the final compressed table form of the trie to Perl_debug_log.
1572 Used for debugging make_trie().
1576 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1577 AV *revcharmap, U32 depth)
1580 SV *sv=sv_newmortal();
1581 int colwidth= widecharmap ? 6 : 4;
1583 GET_RE_DEBUG_FLAGS_DECL;
1585 PERL_ARGS_ASSERT_DUMP_TRIE;
1587 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1588 (int)depth * 2 + 2,"",
1589 "Match","Base","Ofs" );
1591 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1592 SV ** const tmp = av_fetch( revcharmap, state, 0);
1594 PerlIO_printf( Perl_debug_log, "%*s",
1596 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1597 PL_colors[0], PL_colors[1],
1598 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1599 PERL_PV_ESCAPE_FIRSTCHAR
1604 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1605 (int)depth * 2 + 2,"");
1607 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1608 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1609 PerlIO_printf( Perl_debug_log, "\n");
1611 for( state = 1 ; state < trie->statecount ; state++ ) {
1612 const U32 base = trie->states[ state ].trans.base;
1614 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1615 (int)depth * 2 + 2,"", (UV)state);
1617 if ( trie->states[ state ].wordnum ) {
1618 PerlIO_printf( Perl_debug_log, " W%4X",
1619 trie->states[ state ].wordnum );
1621 PerlIO_printf( Perl_debug_log, "%6s", "" );
1624 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1629 while( ( base + ofs < trie->uniquecharcount ) ||
1630 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1631 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1635 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1637 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1638 if ( ( base + ofs >= trie->uniquecharcount )
1639 && ( base + ofs - trie->uniquecharcount
1641 && trie->trans[ base + ofs
1642 - trie->uniquecharcount ].check == state )
1644 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1646 (UV)trie->trans[ base + ofs
1647 - trie->uniquecharcount ].next );
1649 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1653 PerlIO_printf( Perl_debug_log, "]");
1656 PerlIO_printf( Perl_debug_log, "\n" );
1658 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1660 for (word=1; word <= trie->wordcount; word++) {
1661 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1662 (int)word, (int)(trie->wordinfo[word].prev),
1663 (int)(trie->wordinfo[word].len));
1665 PerlIO_printf(Perl_debug_log, "\n" );
1668 Dumps a fully constructed but uncompressed trie in list form.
1669 List tries normally only are used for construction when the number of
1670 possible chars (trie->uniquecharcount) is very high.
1671 Used for debugging make_trie().
1674 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1675 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1679 SV *sv=sv_newmortal();
1680 int colwidth= widecharmap ? 6 : 4;
1681 GET_RE_DEBUG_FLAGS_DECL;
1683 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1685 /* print out the table precompression. */
1686 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1687 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1688 "------:-----+-----------------\n" );
1690 for( state=1 ; state < next_alloc ; state ++ ) {
1693 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1694 (int)depth * 2 + 2,"", (UV)state );
1695 if ( ! trie->states[ state ].wordnum ) {
1696 PerlIO_printf( Perl_debug_log, "%5s| ","");
1698 PerlIO_printf( Perl_debug_log, "W%4x| ",
1699 trie->states[ state ].wordnum
1702 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1703 SV ** const tmp = av_fetch( revcharmap,
1704 TRIE_LIST_ITEM(state,charid).forid, 0);
1706 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1708 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1710 PL_colors[0], PL_colors[1],
1711 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1712 | PERL_PV_ESCAPE_FIRSTCHAR
1714 TRIE_LIST_ITEM(state,charid).forid,
1715 (UV)TRIE_LIST_ITEM(state,charid).newstate
1718 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1719 (int)((depth * 2) + 14), "");
1722 PerlIO_printf( Perl_debug_log, "\n");
1727 Dumps a fully constructed but uncompressed trie in table form.
1728 This is the normal DFA style state transition table, with a few
1729 twists to facilitate compression later.
1730 Used for debugging make_trie().
1733 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1734 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1739 SV *sv=sv_newmortal();
1740 int colwidth= widecharmap ? 6 : 4;
1741 GET_RE_DEBUG_FLAGS_DECL;
1743 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1746 print out the table precompression so that we can do a visual check
1747 that they are identical.
1750 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1752 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1753 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1755 PerlIO_printf( Perl_debug_log, "%*s",
1757 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1758 PL_colors[0], PL_colors[1],
1759 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1760 PERL_PV_ESCAPE_FIRSTCHAR
1766 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1768 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1769 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1772 PerlIO_printf( Perl_debug_log, "\n" );
1774 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1776 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1777 (int)depth * 2 + 2,"",
1778 (UV)TRIE_NODENUM( state ) );
1780 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1781 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1783 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1785 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1787 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1788 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1789 (UV)trie->trans[ state ].check );
1791 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1792 (UV)trie->trans[ state ].check,
1793 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1801 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1802 startbranch: the first branch in the whole branch sequence
1803 first : start branch of sequence of branch-exact nodes.
1804 May be the same as startbranch
1805 last : Thing following the last branch.
1806 May be the same as tail.
1807 tail : item following the branch sequence
1808 count : words in the sequence
1809 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1810 depth : indent depth
1812 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1814 A trie is an N'ary tree where the branches are determined by digital
1815 decomposition of the key. IE, at the root node you look up the 1st character and
1816 follow that branch repeat until you find the end of the branches. Nodes can be
1817 marked as "accepting" meaning they represent a complete word. Eg:
1821 would convert into the following structure. Numbers represent states, letters
1822 following numbers represent valid transitions on the letter from that state, if
1823 the number is in square brackets it represents an accepting state, otherwise it
1824 will be in parenthesis.
1826 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1830 (1) +-i->(6)-+-s->[7]
1832 +-s->(3)-+-h->(4)-+-e->[5]
1834 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1836 This shows that when matching against the string 'hers' we will begin at state 1
1837 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1838 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1839 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1840 single traverse. We store a mapping from accepting to state to which word was
1841 matched, and then when we have multiple possibilities we try to complete the
1842 rest of the regex in the order in which they occured in the alternation.
1844 The only prior NFA like behaviour that would be changed by the TRIE support is
1845 the silent ignoring of duplicate alternations which are of the form:
1847 / (DUPE|DUPE) X? (?{ ... }) Y /x
1849 Thus EVAL blocks following a trie may be called a different number of times with
1850 and without the optimisation. With the optimisations dupes will be silently
1851 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1852 the following demonstrates:
1854 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1856 which prints out 'word' three times, but
1858 'words'=~/(word|word|word)(?{ print $1 })S/
1860 which doesnt print it out at all. This is due to other optimisations kicking in.
1862 Example of what happens on a structural level:
1864 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1866 1: CURLYM[1] {1,32767}(18)
1877 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1878 and should turn into:
1880 1: CURLYM[1] {1,32767}(18)
1882 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1890 Cases where tail != last would be like /(?foo|bar)baz/:
1900 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1901 and would end up looking like:
1904 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1911 d = uvchr_to_utf8_flags(d, uv, 0);
1913 is the recommended Unicode-aware way of saying
1918 #define TRIE_STORE_REVCHAR(val) \
1921 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1922 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1923 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1924 SvCUR_set(zlopp, kapow - flrbbbbb); \
1927 av_push(revcharmap, zlopp); \
1929 char ooooff = (char)val; \
1930 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1934 /* This gets the next character from the input, folding it if not already
1936 #define TRIE_READ_CHAR STMT_START { \
1939 /* if it is UTF then it is either already folded, or does not need \
1941 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1943 else if (folder == PL_fold_latin1) { \
1944 /* This folder implies Unicode rules, which in the range expressible \
1945 * by not UTF is the lower case, with the two exceptions, one of \
1946 * which should have been taken care of before calling this */ \
1947 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1948 uvc = toLOWER_L1(*uc); \
1949 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1952 /* raw data, will be folded later if needed */ \
1960 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1961 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1962 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1963 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1965 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1966 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1967 TRIE_LIST_CUR( state )++; \
1970 #define TRIE_LIST_NEW(state) STMT_START { \
1971 Newxz( trie->states[ state ].trans.list, \
1972 4, reg_trie_trans_le ); \
1973 TRIE_LIST_CUR( state ) = 1; \
1974 TRIE_LIST_LEN( state ) = 4; \
1977 #define TRIE_HANDLE_WORD(state) STMT_START { \
1978 U16 dupe= trie->states[ state ].wordnum; \
1979 regnode * const noper_next = regnext( noper ); \
1982 /* store the word for dumping */ \
1984 if (OP(noper) != NOTHING) \
1985 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1987 tmp = newSVpvn_utf8( "", 0, UTF ); \
1988 av_push( trie_words, tmp ); \
1992 trie->wordinfo[curword].prev = 0; \
1993 trie->wordinfo[curword].len = wordlen; \
1994 trie->wordinfo[curword].accept = state; \
1996 if ( noper_next < tail ) { \
1998 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2000 trie->jump[curword] = (U16)(noper_next - convert); \
2002 jumper = noper_next; \
2004 nextbranch= regnext(cur); \
2008 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2009 /* chain, so that when the bits of chain are later */\
2010 /* linked together, the dups appear in the chain */\
2011 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2012 trie->wordinfo[dupe].prev = curword; \
2014 /* we haven't inserted this word yet. */ \
2015 trie->states[ state ].wordnum = curword; \
2020 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2021 ( ( base + charid >= ucharcount \
2022 && base + charid < ubound \
2023 && state == trie->trans[ base - ucharcount + charid ].check \
2024 && trie->trans[ base - ucharcount + charid ].next ) \
2025 ? trie->trans[ base - ucharcount + charid ].next \
2026 : ( state==1 ? special : 0 ) \
2030 #define MADE_JUMP_TRIE 2
2031 #define MADE_EXACT_TRIE 4
2034 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2035 regnode *first, regnode *last, regnode *tail,
2036 U32 word_count, U32 flags, U32 depth)
2038 /* first pass, loop through and scan words */
2039 reg_trie_data *trie;
2040 HV *widecharmap = NULL;
2041 AV *revcharmap = newAV();
2047 regnode *jumper = NULL;
2048 regnode *nextbranch = NULL;
2049 regnode *convert = NULL;
2050 U32 *prev_states; /* temp array mapping each state to previous one */
2051 /* we just use folder as a flag in utf8 */
2052 const U8 * folder = NULL;
2055 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2056 AV *trie_words = NULL;
2057 /* along with revcharmap, this only used during construction but both are
2058 * useful during debugging so we store them in the struct when debugging.
2061 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2062 STRLEN trie_charcount=0;
2064 SV *re_trie_maxbuff;
2065 GET_RE_DEBUG_FLAGS_DECL;
2067 PERL_ARGS_ASSERT_MAKE_TRIE;
2069 PERL_UNUSED_ARG(depth);
2076 case EXACTFU: folder = PL_fold_latin1; break;
2077 case EXACTF: folder = PL_fold; break;
2078 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2081 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2083 trie->startstate = 1;
2084 trie->wordcount = word_count;
2085 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2086 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2088 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2089 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2090 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2093 trie_words = newAV();
2096 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2097 assert(re_trie_maxbuff);
2098 if (!SvIOK(re_trie_maxbuff)) {
2099 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2101 DEBUG_TRIE_COMPILE_r({
2102 PerlIO_printf( Perl_debug_log,
2103 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2104 (int)depth * 2 + 2, "",
2105 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2106 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2109 /* Find the node we are going to overwrite */
2110 if ( first == startbranch && OP( last ) != BRANCH ) {
2111 /* whole branch chain */
2114 /* branch sub-chain */
2115 convert = NEXTOPER( first );
2118 /* -- First loop and Setup --
2120 We first traverse the branches and scan each word to determine if it
2121 contains widechars, and how many unique chars there are, this is
2122 important as we have to build a table with at least as many columns as we
2125 We use an array of integers to represent the character codes 0..255
2126 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2127 the native representation of the character value as the key and IV's for
2130 *TODO* If we keep track of how many times each character is used we can
2131 remap the columns so that the table compression later on is more
2132 efficient in terms of memory by ensuring the most common value is in the
2133 middle and the least common are on the outside. IMO this would be better
2134 than a most to least common mapping as theres a decent chance the most
2135 common letter will share a node with the least common, meaning the node
2136 will not be compressible. With a middle is most common approach the worst
2137 case is when we have the least common nodes twice.
2141 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2142 regnode *noper = NEXTOPER( cur );
2143 const U8 *uc = (U8*)STRING( noper );
2144 const U8 *e = uc + STR_LEN( noper );
2146 U32 wordlen = 0; /* required init */
2147 STRLEN minchars = 0;
2148 STRLEN maxchars = 0;
2149 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2152 if (OP(noper) == NOTHING) {
2153 regnode *noper_next= regnext(noper);
2154 if (noper_next != tail && OP(noper_next) == flags) {
2156 uc= (U8*)STRING(noper);
2157 e= uc + STR_LEN(noper);
2158 trie->minlen= STR_LEN(noper);
2165 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2166 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2167 regardless of encoding */
2168 if (OP( noper ) == EXACTFU_SS) {
2169 /* false positives are ok, so just set this */
2170 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2173 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2175 TRIE_CHARCOUNT(trie)++;
2178 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2179 * is in effect. Under /i, this character can match itself, or
2180 * anything that folds to it. If not under /i, it can match just
2181 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2182 * all fold to k, and all are single characters. But some folds
2183 * expand to more than one character, so for example LATIN SMALL
2184 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2185 * the string beginning at 'uc' is 'ffi', it could be matched by
2186 * three characters, or just by the one ligature character. (It
2187 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2188 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2189 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2190 * match.) The trie needs to know the minimum and maximum number
2191 * of characters that could match so that it can use size alone to
2192 * quickly reject many match attempts. The max is simple: it is
2193 * the number of folded characters in this branch (since a fold is
2194 * never shorter than what folds to it. */
2198 /* And the min is equal to the max if not under /i (indicated by
2199 * 'folder' being NULL), or there are no multi-character folds. If
2200 * there is a multi-character fold, the min is incremented just
2201 * once, for the character that folds to the sequence. Each
2202 * character in the sequence needs to be added to the list below of
2203 * characters in the trie, but we count only the first towards the
2204 * min number of characters needed. This is done through the
2205 * variable 'foldlen', which is returned by the macros that look
2206 * for these sequences as the number of bytes the sequence
2207 * occupies. Each time through the loop, we decrement 'foldlen' by
2208 * how many bytes the current char occupies. Only when it reaches
2209 * 0 do we increment 'minchars' or look for another multi-character
2211 if (folder == NULL) {
2214 else if (foldlen > 0) {
2215 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2220 /* See if *uc is the beginning of a multi-character fold. If
2221 * so, we decrement the length remaining to look at, to account
2222 * for the current character this iteration. (We can use 'uc'
2223 * instead of the fold returned by TRIE_READ_CHAR because for
2224 * non-UTF, the latin1_safe macro is smart enough to account
2225 * for all the unfolded characters, and because for UTF, the
2226 * string will already have been folded earlier in the
2227 * compilation process */
2229 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2230 foldlen -= UTF8SKIP(uc);
2233 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2238 /* The current character (and any potential folds) should be added
2239 * to the possible matching characters for this position in this
2243 U8 folded= folder[ (U8) uvc ];
2244 if ( !trie->charmap[ folded ] ) {
2245 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2246 TRIE_STORE_REVCHAR( folded );
2249 if ( !trie->charmap[ uvc ] ) {
2250 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2251 TRIE_STORE_REVCHAR( uvc );
2254 /* store the codepoint in the bitmap, and its folded
2256 TRIE_BITMAP_SET(trie, uvc);
2258 /* store the folded codepoint */
2259 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2262 /* store first byte of utf8 representation of
2263 variant codepoints */
2264 if (! UVCHR_IS_INVARIANT(uvc)) {
2265 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2268 set_bit = 0; /* We've done our bit :-) */
2272 /* XXX We could come up with the list of code points that fold
2273 * to this using PL_utf8_foldclosures, except not for
2274 * multi-char folds, as there may be multiple combinations
2275 * there that could work, which needs to wait until runtime to
2276 * resolve (The comment about LIGATURE FFI above is such an
2281 widecharmap = newHV();
2283 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2286 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2288 if ( !SvTRUE( *svpp ) ) {
2289 sv_setiv( *svpp, ++trie->uniquecharcount );
2290 TRIE_STORE_REVCHAR(uvc);
2293 } /* end loop through characters in this branch of the trie */
2295 /* We take the min and max for this branch and combine to find the min
2296 * and max for all branches processed so far */
2297 if( cur == first ) {
2298 trie->minlen = minchars;
2299 trie->maxlen = maxchars;
2300 } else if (minchars < trie->minlen) {
2301 trie->minlen = minchars;
2302 } else if (maxchars > trie->maxlen) {
2303 trie->maxlen = maxchars;
2305 } /* end first pass */
2306 DEBUG_TRIE_COMPILE_r(
2307 PerlIO_printf( Perl_debug_log,
2308 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2309 (int)depth * 2 + 2,"",
2310 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2311 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2312 (int)trie->minlen, (int)trie->maxlen )
2316 We now know what we are dealing with in terms of unique chars and
2317 string sizes so we can calculate how much memory a naive
2318 representation using a flat table will take. If it's over a reasonable
2319 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2320 conservative but potentially much slower representation using an array
2323 At the end we convert both representations into the same compressed
2324 form that will be used in regexec.c for matching with. The latter
2325 is a form that cannot be used to construct with but has memory
2326 properties similar to the list form and access properties similar
2327 to the table form making it both suitable for fast searches and
2328 small enough that its feasable to store for the duration of a program.
2330 See the comment in the code where the compressed table is produced
2331 inplace from the flat tabe representation for an explanation of how
2332 the compression works.
2337 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2340 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2341 > SvIV(re_trie_maxbuff) )
2344 Second Pass -- Array Of Lists Representation
2346 Each state will be represented by a list of charid:state records
2347 (reg_trie_trans_le) the first such element holds the CUR and LEN
2348 points of the allocated array. (See defines above).
2350 We build the initial structure using the lists, and then convert
2351 it into the compressed table form which allows faster lookups
2352 (but cant be modified once converted).
2355 STRLEN transcount = 1;
2357 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2358 "%*sCompiling trie using list compiler\n",
2359 (int)depth * 2 + 2, ""));
2361 trie->states = (reg_trie_state *)
2362 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2363 sizeof(reg_trie_state) );
2367 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2369 regnode *noper = NEXTOPER( cur );
2370 U8 *uc = (U8*)STRING( noper );
2371 const U8 *e = uc + STR_LEN( noper );
2372 U32 state = 1; /* required init */
2373 U16 charid = 0; /* sanity init */
2374 U32 wordlen = 0; /* required init */
2376 if (OP(noper) == NOTHING) {
2377 regnode *noper_next= regnext(noper);
2378 if (noper_next != tail && OP(noper_next) == flags) {
2380 uc= (U8*)STRING(noper);
2381 e= uc + STR_LEN(noper);
2385 if (OP(noper) != NOTHING) {
2386 for ( ; uc < e ; uc += len ) {
2391 charid = trie->charmap[ uvc ];
2393 SV** const svpp = hv_fetch( widecharmap,
2400 charid=(U16)SvIV( *svpp );
2403 /* charid is now 0 if we dont know the char read, or
2404 * nonzero if we do */
2411 if ( !trie->states[ state ].trans.list ) {
2412 TRIE_LIST_NEW( state );
2415 check <= TRIE_LIST_USED( state );
2418 if ( TRIE_LIST_ITEM( state, check ).forid
2421 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2426 newstate = next_alloc++;
2427 prev_states[newstate] = state;
2428 TRIE_LIST_PUSH( state, charid, newstate );
2433 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2437 TRIE_HANDLE_WORD(state);
2439 } /* end second pass */
2441 /* next alloc is the NEXT state to be allocated */
2442 trie->statecount = next_alloc;
2443 trie->states = (reg_trie_state *)
2444 PerlMemShared_realloc( trie->states,
2446 * sizeof(reg_trie_state) );
2448 /* and now dump it out before we compress it */
2449 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2450 revcharmap, next_alloc,
2454 trie->trans = (reg_trie_trans *)
2455 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2462 for( state=1 ; state < next_alloc ; state ++ ) {
2466 DEBUG_TRIE_COMPILE_MORE_r(
2467 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2471 if (trie->states[state].trans.list) {
2472 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2476 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2477 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2478 if ( forid < minid ) {
2480 } else if ( forid > maxid ) {
2484 if ( transcount < tp + maxid - minid + 1) {
2486 trie->trans = (reg_trie_trans *)
2487 PerlMemShared_realloc( trie->trans,
2489 * sizeof(reg_trie_trans) );
2490 Zero( trie->trans + (transcount / 2),
2494 base = trie->uniquecharcount + tp - minid;
2495 if ( maxid == minid ) {
2497 for ( ; zp < tp ; zp++ ) {
2498 if ( ! trie->trans[ zp ].next ) {
2499 base = trie->uniquecharcount + zp - minid;
2500 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2502 trie->trans[ zp ].check = state;
2508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2510 trie->trans[ tp ].check = state;
2515 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2516 const U32 tid = base
2517 - trie->uniquecharcount
2518 + TRIE_LIST_ITEM( state, idx ).forid;
2519 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2521 trie->trans[ tid ].check = state;
2523 tp += ( maxid - minid + 1 );
2525 Safefree(trie->states[ state ].trans.list);
2528 DEBUG_TRIE_COMPILE_MORE_r(
2529 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2532 trie->states[ state ].trans.base=base;
2534 trie->lasttrans = tp + 1;
2538 Second Pass -- Flat Table Representation.
2540 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2541 each. We know that we will need Charcount+1 trans at most to store
2542 the data (one row per char at worst case) So we preallocate both
2543 structures assuming worst case.
2545 We then construct the trie using only the .next slots of the entry
2548 We use the .check field of the first entry of the node temporarily
2549 to make compression both faster and easier by keeping track of how
2550 many non zero fields are in the node.
2552 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2555 There are two terms at use here: state as a TRIE_NODEIDX() which is
2556 a number representing the first entry of the node, and state as a
2557 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2558 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2559 if there are 2 entrys per node. eg:
2567 The table is internally in the right hand, idx form. However as we
2568 also have to deal with the states array which is indexed by nodenum
2569 we have to use TRIE_NODENUM() to convert.
2572 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2573 "%*sCompiling trie using table compiler\n",
2574 (int)depth * 2 + 2, ""));
2576 trie->trans = (reg_trie_trans *)
2577 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2578 * trie->uniquecharcount + 1,
2579 sizeof(reg_trie_trans) );
2580 trie->states = (reg_trie_state *)
2581 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2582 sizeof(reg_trie_state) );
2583 next_alloc = trie->uniquecharcount + 1;
2586 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2588 regnode *noper = NEXTOPER( cur );
2589 const U8 *uc = (U8*)STRING( noper );
2590 const U8 *e = uc + STR_LEN( noper );
2592 U32 state = 1; /* required init */
2594 U16 charid = 0; /* sanity init */
2595 U32 accept_state = 0; /* sanity init */
2597 U32 wordlen = 0; /* required init */
2599 if (OP(noper) == NOTHING) {
2600 regnode *noper_next= regnext(noper);
2601 if (noper_next != tail && OP(noper_next) == flags) {
2603 uc= (U8*)STRING(noper);
2604 e= uc + STR_LEN(noper);
2608 if ( OP(noper) != NOTHING ) {
2609 for ( ; uc < e ; uc += len ) {
2614 charid = trie->charmap[ uvc ];
2616 SV* const * const svpp = hv_fetch( widecharmap,
2620 charid = svpp ? (U16)SvIV(*svpp) : 0;
2624 if ( !trie->trans[ state + charid ].next ) {
2625 trie->trans[ state + charid ].next = next_alloc;
2626 trie->trans[ state ].check++;
2627 prev_states[TRIE_NODENUM(next_alloc)]
2628 = TRIE_NODENUM(state);
2629 next_alloc += trie->uniquecharcount;
2631 state = trie->trans[ state + charid ].next;
2633 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2635 /* charid is now 0 if we dont know the char read, or
2636 * nonzero if we do */
2639 accept_state = TRIE_NODENUM( state );
2640 TRIE_HANDLE_WORD(accept_state);
2642 } /* end second pass */
2644 /* and now dump it out before we compress it */
2645 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2647 next_alloc, depth+1));
2651 * Inplace compress the table.*
2653 For sparse data sets the table constructed by the trie algorithm will
2654 be mostly 0/FAIL transitions or to put it another way mostly empty.
2655 (Note that leaf nodes will not contain any transitions.)
2657 This algorithm compresses the tables by eliminating most such
2658 transitions, at the cost of a modest bit of extra work during lookup:
2660 - Each states[] entry contains a .base field which indicates the
2661 index in the state[] array wheres its transition data is stored.
2663 - If .base is 0 there are no valid transitions from that node.
2665 - If .base is nonzero then charid is added to it to find an entry in
2668 -If trans[states[state].base+charid].check!=state then the
2669 transition is taken to be a 0/Fail transition. Thus if there are fail
2670 transitions at the front of the node then the .base offset will point
2671 somewhere inside the previous nodes data (or maybe even into a node
2672 even earlier), but the .check field determines if the transition is
2676 The following process inplace converts the table to the compressed
2677 table: We first do not compress the root node 1,and mark all its
2678 .check pointers as 1 and set its .base pointer as 1 as well. This
2679 allows us to do a DFA construction from the compressed table later,
2680 and ensures that any .base pointers we calculate later are greater
2683 - We set 'pos' to indicate the first entry of the second node.
2685 - We then iterate over the columns of the node, finding the first and
2686 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2687 and set the .check pointers accordingly, and advance pos
2688 appropriately and repreat for the next node. Note that when we copy
2689 the next pointers we have to convert them from the original
2690 NODEIDX form to NODENUM form as the former is not valid post
2693 - If a node has no transitions used we mark its base as 0 and do not
2694 advance the pos pointer.
2696 - If a node only has one transition we use a second pointer into the
2697 structure to fill in allocated fail transitions from other states.
2698 This pointer is independent of the main pointer and scans forward
2699 looking for null transitions that are allocated to a state. When it
2700 finds one it writes the single transition into the "hole". If the
2701 pointer doesnt find one the single transition is appended as normal.
2703 - Once compressed we can Renew/realloc the structures to release the
2706 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2707 specifically Fig 3.47 and the associated pseudocode.
2711 const U32 laststate = TRIE_NODENUM( next_alloc );
2714 trie->statecount = laststate;
2716 for ( state = 1 ; state < laststate ; state++ ) {
2718 const U32 stateidx = TRIE_NODEIDX( state );
2719 const U32 o_used = trie->trans[ stateidx ].check;
2720 U32 used = trie->trans[ stateidx ].check;
2721 trie->trans[ stateidx ].check = 0;
2724 used && charid < trie->uniquecharcount;
2727 if ( flag || trie->trans[ stateidx + charid ].next ) {
2728 if ( trie->trans[ stateidx + charid ].next ) {
2730 for ( ; zp < pos ; zp++ ) {
2731 if ( ! trie->trans[ zp ].next ) {
2735 trie->states[ state ].trans.base
2737 + trie->uniquecharcount
2739 trie->trans[ zp ].next
2740 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2742 trie->trans[ zp ].check = state;
2743 if ( ++zp > pos ) pos = zp;
2750 trie->states[ state ].trans.base
2751 = pos + trie->uniquecharcount - charid ;
2753 trie->trans[ pos ].next
2754 = SAFE_TRIE_NODENUM(
2755 trie->trans[ stateidx + charid ].next );
2756 trie->trans[ pos ].check = state;
2761 trie->lasttrans = pos + 1;
2762 trie->states = (reg_trie_state *)
2763 PerlMemShared_realloc( trie->states, laststate
2764 * sizeof(reg_trie_state) );
2765 DEBUG_TRIE_COMPILE_MORE_r(
2766 PerlIO_printf( Perl_debug_log,
2767 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2768 (int)depth * 2 + 2,"",
2769 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2773 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2776 } /* end table compress */
2778 DEBUG_TRIE_COMPILE_MORE_r(
2779 PerlIO_printf(Perl_debug_log,
2780 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2781 (int)depth * 2 + 2, "",
2782 (UV)trie->statecount,
2783 (UV)trie->lasttrans)
2785 /* resize the trans array to remove unused space */
2786 trie->trans = (reg_trie_trans *)
2787 PerlMemShared_realloc( trie->trans, trie->lasttrans
2788 * sizeof(reg_trie_trans) );
2790 { /* Modify the program and insert the new TRIE node */
2791 U8 nodetype =(U8)(flags & 0xFF);
2795 regnode *optimize = NULL;
2796 #ifdef RE_TRACK_PATTERN_OFFSETS
2799 U32 mjd_nodelen = 0;
2800 #endif /* RE_TRACK_PATTERN_OFFSETS */
2801 #endif /* DEBUGGING */
2803 This means we convert either the first branch or the first Exact,
2804 depending on whether the thing following (in 'last') is a branch
2805 or not and whther first is the startbranch (ie is it a sub part of
2806 the alternation or is it the whole thing.)
2807 Assuming its a sub part we convert the EXACT otherwise we convert
2808 the whole branch sequence, including the first.
2810 /* Find the node we are going to overwrite */
2811 if ( first != startbranch || OP( last ) == BRANCH ) {
2812 /* branch sub-chain */
2813 NEXT_OFF( first ) = (U16)(last - first);
2814 #ifdef RE_TRACK_PATTERN_OFFSETS
2816 mjd_offset= Node_Offset((convert));
2817 mjd_nodelen= Node_Length((convert));
2820 /* whole branch chain */
2822 #ifdef RE_TRACK_PATTERN_OFFSETS
2825 const regnode *nop = NEXTOPER( convert );
2826 mjd_offset= Node_Offset((nop));
2827 mjd_nodelen= Node_Length((nop));
2831 PerlIO_printf(Perl_debug_log,
2832 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2833 (int)depth * 2 + 2, "",
2834 (UV)mjd_offset, (UV)mjd_nodelen)
2837 /* But first we check to see if there is a common prefix we can
2838 split out as an EXACT and put in front of the TRIE node. */
2839 trie->startstate= 1;
2840 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2842 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2846 const U32 base = trie->states[ state ].trans.base;
2848 if ( trie->states[state].wordnum )
2851 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2852 if ( ( base + ofs >= trie->uniquecharcount ) &&
2853 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2854 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2856 if ( ++count > 1 ) {
2857 SV **tmp = av_fetch( revcharmap, ofs, 0);
2858 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2859 if ( state == 1 ) break;
2861 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2863 PerlIO_printf(Perl_debug_log,
2864 "%*sNew Start State=%"UVuf" Class: [",
2865 (int)depth * 2 + 2, "",
2868 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2869 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2871 TRIE_BITMAP_SET(trie,*ch);
2873 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2875 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2879 TRIE_BITMAP_SET(trie,*ch);
2881 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2882 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2888 SV **tmp = av_fetch( revcharmap, idx, 0);
2890 char *ch = SvPV( *tmp, len );
2892 SV *sv=sv_newmortal();
2893 PerlIO_printf( Perl_debug_log,
2894 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2895 (int)depth * 2 + 2, "",
2897 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2898 PL_colors[0], PL_colors[1],
2899 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2900 PERL_PV_ESCAPE_FIRSTCHAR
2905 OP( convert ) = nodetype;
2906 str=STRING(convert);
2909 STR_LEN(convert) += len;
2915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2920 trie->prefixlen = (state-1);
2922 regnode *n = convert+NODE_SZ_STR(convert);
2923 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2924 trie->startstate = state;
2925 trie->minlen -= (state - 1);
2926 trie->maxlen -= (state - 1);
2928 /* At least the UNICOS C compiler choked on this
2929 * being argument to DEBUG_r(), so let's just have
2932 #ifdef PERL_EXT_RE_BUILD
2938 regnode *fix = convert;
2939 U32 word = trie->wordcount;
2941 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2942 while( ++fix < n ) {
2943 Set_Node_Offset_Length(fix, 0, 0);
2946 SV ** const tmp = av_fetch( trie_words, word, 0 );
2948 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2949 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2951 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2959 NEXT_OFF(convert) = (U16)(tail - convert);
2960 DEBUG_r(optimize= n);
2966 if ( trie->maxlen ) {
2967 NEXT_OFF( convert ) = (U16)(tail - convert);
2968 ARG_SET( convert, data_slot );
2969 /* Store the offset to the first unabsorbed branch in
2970 jump[0], which is otherwise unused by the jump logic.
2971 We use this when dumping a trie and during optimisation. */
2973 trie->jump[0] = (U16)(nextbranch - convert);
2975 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2976 * and there is a bitmap
2977 * and the first "jump target" node we found leaves enough room
2978 * then convert the TRIE node into a TRIEC node, with the bitmap
2979 * embedded inline in the opcode - this is hypothetically faster.
2981 if ( !trie->states[trie->startstate].wordnum
2983 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2985 OP( convert ) = TRIEC;
2986 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2987 PerlMemShared_free(trie->bitmap);
2990 OP( convert ) = TRIE;
2992 /* store the type in the flags */
2993 convert->flags = nodetype;
2997 + regarglen[ OP( convert ) ];
2999 /* XXX We really should free up the resource in trie now,
3000 as we won't use them - (which resources?) dmq */
3002 /* needed for dumping*/
3003 DEBUG_r(if (optimize) {
3004 regnode *opt = convert;
3006 while ( ++opt < optimize) {
3007 Set_Node_Offset_Length(opt,0,0);
3010 Try to clean up some of the debris left after the
3013 while( optimize < jumper ) {
3014 mjd_nodelen += Node_Length((optimize));
3015 OP( optimize ) = OPTIMIZED;
3016 Set_Node_Offset_Length(optimize,0,0);
3019 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3021 } /* end node insert */
3023 /* Finish populating the prev field of the wordinfo array. Walk back
3024 * from each accept state until we find another accept state, and if
3025 * so, point the first word's .prev field at the second word. If the
3026 * second already has a .prev field set, stop now. This will be the
3027 * case either if we've already processed that word's accept state,
3028 * or that state had multiple words, and the overspill words were
3029 * already linked up earlier.
3036 for (word=1; word <= trie->wordcount; word++) {
3038 if (trie->wordinfo[word].prev)
3040 state = trie->wordinfo[word].accept;
3042 state = prev_states[state];
3045 prev = trie->states[state].wordnum;
3049 trie->wordinfo[word].prev = prev;
3051 Safefree(prev_states);
3055 /* and now dump out the compressed format */
3056 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3058 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3060 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3061 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3063 SvREFCNT_dec_NN(revcharmap);
3067 : trie->startstate>1
3073 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3075 /* The Trie is constructed and compressed now so we can build a fail array if
3078 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3080 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3084 We find the fail state for each state in the trie, this state is the longest
3085 proper suffix of the current state's 'word' that is also a proper prefix of
3086 another word in our trie. State 1 represents the word '' and is thus the
3087 default fail state. This allows the DFA not to have to restart after its
3088 tried and failed a word at a given point, it simply continues as though it
3089 had been matching the other word in the first place.
3091 'abcdgu'=~/abcdefg|cdgu/
3092 When we get to 'd' we are still matching the first word, we would encounter
3093 'g' which would fail, which would bring us to the state representing 'd' in
3094 the second word where we would try 'g' and succeed, proceeding to match
3097 /* add a fail transition */
3098 const U32 trie_offset = ARG(source);
3099 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3101 const U32 ucharcount = trie->uniquecharcount;
3102 const U32 numstates = trie->statecount;
3103 const U32 ubound = trie->lasttrans + ucharcount;
3107 U32 base = trie->states[ 1 ].trans.base;
3110 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3112 GET_RE_DEBUG_FLAGS_DECL;
3114 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3115 PERL_UNUSED_CONTEXT;
3117 PERL_UNUSED_ARG(depth);
3120 if ( OP(source) == TRIE ) {
3121 struct regnode_1 *op = (struct regnode_1 *)
3122 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3123 StructCopy(source,op,struct regnode_1);
3124 stclass = (regnode *)op;
3126 struct regnode_charclass *op = (struct regnode_charclass *)
3127 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3128 StructCopy(source,op,struct regnode_charclass);
3129 stclass = (regnode *)op;
3131 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3133 ARG_SET( stclass, data_slot );
3134 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3135 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3136 aho->trie=trie_offset;
3137 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3138 Copy( trie->states, aho->states, numstates, reg_trie_state );
3139 Newxz( q, numstates, U32);
3140 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3143 /* initialize fail[0..1] to be 1 so that we always have
3144 a valid final fail state */
3145 fail[ 0 ] = fail[ 1 ] = 1;
3147 for ( charid = 0; charid < ucharcount ; charid++ ) {
3148 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3150 q[ q_write ] = newstate;
3151 /* set to point at the root */
3152 fail[ q[ q_write++ ] ]=1;
3155 while ( q_read < q_write) {
3156 const U32 cur = q[ q_read++ % numstates ];
3157 base = trie->states[ cur ].trans.base;
3159 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3160 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3162 U32 fail_state = cur;
3165 fail_state = fail[ fail_state ];
3166 fail_base = aho->states[ fail_state ].trans.base;
3167 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3169 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3170 fail[ ch_state ] = fail_state;
3171 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3173 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3175 q[ q_write++ % numstates] = ch_state;
3179 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3180 when we fail in state 1, this allows us to use the
3181 charclass scan to find a valid start char. This is based on the principle
3182 that theres a good chance the string being searched contains lots of stuff
3183 that cant be a start char.
3185 fail[ 0 ] = fail[ 1 ] = 0;
3186 DEBUG_TRIE_COMPILE_r({
3187 PerlIO_printf(Perl_debug_log,
3188 "%*sStclass Failtable (%"UVuf" states): 0",
3189 (int)(depth * 2), "", (UV)numstates
3191 for( q_read=1; q_read<numstates; q_read++ ) {
3192 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3194 PerlIO_printf(Perl_debug_log, "\n");
3197 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3202 #define DEBUG_PEEP(str,scan,depth) \
3203 DEBUG_OPTIMISE_r({if (scan){ \
3204 SV * const mysv=sv_newmortal(); \
3205 regnode *Next = regnext(scan); \
3206 regprop(RExC_rx, mysv, scan, NULL); \
3207 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3208 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3209 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3213 /* The below joins as many adjacent EXACTish nodes as possible into a single
3214 * one. The regop may be changed if the node(s) contain certain sequences that
3215 * require special handling. The joining is only done if:
3216 * 1) there is room in the current conglomerated node to entirely contain the
3218 * 2) they are the exact same node type
3220 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3221 * these get optimized out
3223 * If a node is to match under /i (folded), the number of characters it matches
3224 * can be different than its character length if it contains a multi-character
3225 * fold. *min_subtract is set to the total delta number of characters of the
3228 * And *unfolded_multi_char is set to indicate whether or not the node contains
3229 * an unfolded multi-char fold. This happens when whether the fold is valid or
3230 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3231 * SMALL LETTER SHARP S, as only if the target string being matched against
3232 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3233 * folding rules depend on the locale in force at runtime. (Multi-char folds
3234 * whose components are all above the Latin1 range are not run-time locale
3235 * dependent, and have already been folded by the time this function is
3238 * This is as good a place as any to discuss the design of handling these
3239 * multi-character fold sequences. It's been wrong in Perl for a very long
3240 * time. There are three code points in Unicode whose multi-character folds
3241 * were long ago discovered to mess things up. The previous designs for
3242 * dealing with these involved assigning a special node for them. This
3243 * approach doesn't always work, as evidenced by this example:
3244 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3245 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3246 * would match just the \xDF, it won't be able to handle the case where a
3247 * successful match would have to cross the node's boundary. The new approach
3248 * that hopefully generally solves the problem generates an EXACTFU_SS node
3249 * that is "sss" in this case.
3251 * It turns out that there are problems with all multi-character folds, and not
3252 * just these three. Now the code is general, for all such cases. The
3253 * approach taken is:
3254 * 1) This routine examines each EXACTFish node that could contain multi-
3255 * character folded sequences. Since a single character can fold into
3256 * such a sequence, the minimum match length for this node is less than
3257 * the number of characters in the node. This routine returns in
3258 * *min_subtract how many characters to subtract from the the actual
3259 * length of the string to get a real minimum match length; it is 0 if
3260 * there are no multi-char foldeds. This delta is used by the caller to
3261 * adjust the min length of the match, and the delta between min and max,
3262 * so that the optimizer doesn't reject these possibilities based on size
3264 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3265 * is used for an EXACTFU node that contains at least one "ss" sequence in
3266 * it. For non-UTF-8 patterns and strings, this is the only case where
3267 * there is a possible fold length change. That means that a regular
3268 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3269 * with length changes, and so can be processed faster. regexec.c takes
3270 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3271 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3272 * known until runtime). This saves effort in regex matching. However,
3273 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3274 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3275 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3276 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3277 * possibilities for the non-UTF8 patterns are quite simple, except for
3278 * the sharp s. All the ones that don't involve a UTF-8 target string are
3279 * members of a fold-pair, and arrays are set up for all of them so that
3280 * the other member of the pair can be found quickly. Code elsewhere in
3281 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3282 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3283 * described in the next item.
3284 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3285 * validity of the fold won't be known until runtime, and so must remain
3286 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3287 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3288 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3289 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3290 * The reason this is a problem is that the optimizer part of regexec.c
3291 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3292 * that a character in the pattern corresponds to at most a single
3293 * character in the target string. (And I do mean character, and not byte
3294 * here, unlike other parts of the documentation that have never been
3295 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3296 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3297 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3298 * nodes, violate the assumption, and they are the only instances where it
3299 * is violated. I'm reluctant to try to change the assumption, as the
3300 * code involved is impenetrable to me (khw), so instead the code here
3301 * punts. This routine examines EXACTFL nodes, and (when the pattern
3302 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3303 * boolean indicating whether or not the node contains such a fold. When
3304 * it is true, the caller sets a flag that later causes the optimizer in
3305 * this file to not set values for the floating and fixed string lengths,
3306 * and thus avoids the optimizer code in regexec.c that makes the invalid
3307 * assumption. Thus, there is no optimization based on string lengths for
3308 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3309 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3310 * assumption is wrong only in these cases is that all other non-UTF-8
3311 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3312 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3313 * EXACTF nodes because we don't know at compile time if it actually
3314 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3315 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3316 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3317 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3318 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3319 * string would require the pattern to be forced into UTF-8, the overhead
3320 * of which we want to avoid. Similarly the unfolded multi-char folds in
3321 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3324 * Similarly, the code that generates tries doesn't currently handle
3325 * not-already-folded multi-char folds, and it looks like a pain to change
3326 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3327 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3328 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3329 * using /iaa matching will be doing so almost entirely with ASCII
3330 * strings, so this should rarely be encountered in practice */
3332 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3333 if (PL_regkind[OP(scan)] == EXACT) \
3334 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3337 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3338 UV *min_subtract, bool *unfolded_multi_char,
3339 U32 flags,regnode *val, U32 depth)
3341 /* Merge several consecutive EXACTish nodes into one. */
3342 regnode *n = regnext(scan);
3344 regnode *next = scan + NODE_SZ_STR(scan);
3348 regnode *stop = scan;
3349 GET_RE_DEBUG_FLAGS_DECL;
3351 PERL_UNUSED_ARG(depth);
3354 PERL_ARGS_ASSERT_JOIN_EXACT;
3355 #ifndef EXPERIMENTAL_INPLACESCAN
3356 PERL_UNUSED_ARG(flags);
3357 PERL_UNUSED_ARG(val);
3359 DEBUG_PEEP("join",scan,depth);
3361 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3362 * EXACT ones that are mergeable to the current one. */
3364 && (PL_regkind[OP(n)] == NOTHING
3365 || (stringok && OP(n) == OP(scan)))
3367 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3370 if (OP(n) == TAIL || n > next)
3372 if (PL_regkind[OP(n)] == NOTHING) {
3373 DEBUG_PEEP("skip:",n,depth);
3374 NEXT_OFF(scan) += NEXT_OFF(n);
3375 next = n + NODE_STEP_REGNODE;
3382 else if (stringok) {
3383 const unsigned int oldl = STR_LEN(scan);
3384 regnode * const nnext = regnext(n);
3386 /* XXX I (khw) kind of doubt that this works on platforms (should
3387 * Perl ever run on one) where U8_MAX is above 255 because of lots
3388 * of other assumptions */
3389 /* Don't join if the sum can't fit into a single node */
3390 if (oldl + STR_LEN(n) > U8_MAX)
3393 DEBUG_PEEP("merg",n,depth);
3396 NEXT_OFF(scan) += NEXT_OFF(n);
3397 STR_LEN(scan) += STR_LEN(n);
3398 next = n + NODE_SZ_STR(n);
3399 /* Now we can overwrite *n : */
3400 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3408 #ifdef EXPERIMENTAL_INPLACESCAN
3409 if (flags && !NEXT_OFF(n)) {
3410 DEBUG_PEEP("atch", val, depth);
3411 if (reg_off_by_arg[OP(n)]) {
3412 ARG_SET(n, val - n);
3415 NEXT_OFF(n) = val - n;
3423 *unfolded_multi_char = FALSE;
3425 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3426 * can now analyze for sequences of problematic code points. (Prior to
3427 * this final joining, sequences could have been split over boundaries, and
3428 * hence missed). The sequences only happen in folding, hence for any
3429 * non-EXACT EXACTish node */
3430 if (OP(scan) != EXACT) {
3431 U8* s0 = (U8*) STRING(scan);
3433 U8* s_end = s0 + STR_LEN(scan);
3435 int total_count_delta = 0; /* Total delta number of characters that
3436 multi-char folds expand to */
3438 /* One pass is made over the node's string looking for all the
3439 * possibilities. To avoid some tests in the loop, there are two main
3440 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3445 if (OP(scan) == EXACTFL) {
3448 /* An EXACTFL node would already have been changed to another
3449 * node type unless there is at least one character in it that
3450 * is problematic; likely a character whose fold definition
3451 * won't be known until runtime, and so has yet to be folded.
3452 * For all but the UTF-8 locale, folds are 1-1 in length, but
3453 * to handle the UTF-8 case, we need to create a temporary
3454 * folded copy using UTF-8 locale rules in order to analyze it.
3455 * This is because our macros that look to see if a sequence is
3456 * a multi-char fold assume everything is folded (otherwise the
3457 * tests in those macros would be too complicated and slow).
3458 * Note that here, the non-problematic folds will have already
3459 * been done, so we can just copy such characters. We actually
3460 * don't completely fold the EXACTFL string. We skip the
3461 * unfolded multi-char folds, as that would just create work
3462 * below to figure out the size they already are */
3464 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3467 STRLEN s_len = UTF8SKIP(s);
3468 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3469 Copy(s, d, s_len, U8);
3472 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3473 *unfolded_multi_char = TRUE;
3474 Copy(s, d, s_len, U8);
3477 else if (isASCII(*s)) {
3478 *(d++) = toFOLD(*s);
3482 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3488 /* Point the remainder of the routine to look at our temporary
3492 } /* End of creating folded copy of EXACTFL string */
3494 /* Examine the string for a multi-character fold sequence. UTF-8
3495 * patterns have all characters pre-folded by the time this code is
3497 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3498 length sequence we are looking for is 2 */
3500 int count = 0; /* How many characters in a multi-char fold */
3501 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3502 if (! len) { /* Not a multi-char fold: get next char */
3507 /* Nodes with 'ss' require special handling, except for
3508 * EXACTFA-ish for which there is no multi-char fold to this */
3509 if (len == 2 && *s == 's' && *(s+1) == 's'
3510 && OP(scan) != EXACTFA
3511 && OP(scan) != EXACTFA_NO_TRIE)
3514 if (OP(scan) != EXACTFL) {
3515 OP(scan) = EXACTFU_SS;
3519 else { /* Here is a generic multi-char fold. */
3520 U8* multi_end = s + len;
3522 /* Count how many characters are in it. In the case of
3523 * /aa, no folds which contain ASCII code points are
3524 * allowed, so check for those, and skip if found. */
3525 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3526 count = utf8_length(s, multi_end);
3530 while (s < multi_end) {
3533 goto next_iteration;
3543 /* The delta is how long the sequence is minus 1 (1 is how long
3544 * the character that folds to the sequence is) */
3545 total_count_delta += count - 1;
3549 /* We created a temporary folded copy of the string in EXACTFL
3550 * nodes. Therefore we need to be sure it doesn't go below zero,
3551 * as the real string could be shorter */
3552 if (OP(scan) == EXACTFL) {
3553 int total_chars = utf8_length((U8*) STRING(scan),
3554 (U8*) STRING(scan) + STR_LEN(scan));
3555 if (total_count_delta > total_chars) {
3556 total_count_delta = total_chars;
3560 *min_subtract += total_count_delta;
3563 else if (OP(scan) == EXACTFA) {
3565 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3566 * fold to the ASCII range (and there are no existing ones in the
3567 * upper latin1 range). But, as outlined in the comments preceding
3568 * this function, we need to flag any occurrences of the sharp s.
3569 * This character forbids trie formation (because of added
3572 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3573 OP(scan) = EXACTFA_NO_TRIE;
3574 *unfolded_multi_char = TRUE;
3583 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3584 * folds that are all Latin1. As explained in the comments
3585 * preceding this function, we look also for the sharp s in EXACTF
3586 * and EXACTFL nodes; it can be in the final position. Otherwise
3587 * we can stop looking 1 byte earlier because have to find at least
3588 * two characters for a multi-fold */
3589 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3594 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3595 if (! len) { /* Not a multi-char fold. */
3596 if (*s == LATIN_SMALL_LETTER_SHARP_S
3597 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3599 *unfolded_multi_char = TRUE;
3606 && isALPHA_FOLD_EQ(*s, 's')
3607 && isALPHA_FOLD_EQ(*(s+1), 's'))
3610 /* EXACTF nodes need to know that the minimum length
3611 * changed so that a sharp s in the string can match this
3612 * ss in the pattern, but they remain EXACTF nodes, as they
3613 * won't match this unless the target string is is UTF-8,
3614 * which we don't know until runtime. EXACTFL nodes can't
3615 * transform into EXACTFU nodes */
3616 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3617 OP(scan) = EXACTFU_SS;
3621 *min_subtract += len - 1;
3628 /* Allow dumping but overwriting the collection of skipped
3629 * ops and/or strings with fake optimized ops */
3630 n = scan + NODE_SZ_STR(scan);
3638 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3642 /* REx optimizer. Converts nodes into quicker variants "in place".
3643 Finds fixed substrings. */
3645 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3646 to the position after last scanned or to NULL. */
3648 #define INIT_AND_WITHP \
3649 assert(!and_withp); \
3650 Newx(and_withp,1, regnode_ssc); \
3651 SAVEFREEPV(and_withp)
3653 /* this is a chain of data about sub patterns we are processing that
3654 need to be handled separately/specially in study_chunk. Its so
3655 we can simulate recursion without losing state. */
3657 typedef struct scan_frame {
3658 regnode *last; /* last node to process in this frame */
3659 regnode *next; /* next node to process when last is reached */
3660 struct scan_frame *prev; /*previous frame*/
3661 U32 prev_recursed_depth;
3662 I32 stop; /* what stopparen do we use */
3667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3668 SSize_t *minlenp, SSize_t *deltap,
3673 regnode_ssc *and_withp,
3674 U32 flags, U32 depth)
3675 /* scanp: Start here (read-write). */
3676 /* deltap: Write maxlen-minlen here. */
3677 /* last: Stop before this one. */
3678 /* data: string data about the pattern */
3679 /* stopparen: treat close N as END */
3680 /* recursed: which subroutines have we recursed into */
3681 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3683 /* There must be at least this number of characters to match */
3686 regnode *scan = *scanp, *next;
3688 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3689 int is_inf_internal = 0; /* The studied chunk is infinite */
3690 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3691 scan_data_t data_fake;
3692 SV *re_trie_maxbuff = NULL;
3693 regnode *first_non_open = scan;
3694 SSize_t stopmin = SSize_t_MAX;
3695 scan_frame *frame = NULL;
3696 GET_RE_DEBUG_FLAGS_DECL;
3698 PERL_ARGS_ASSERT_STUDY_CHUNK;
3701 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3704 while (first_non_open && OP(first_non_open) == OPEN)
3705 first_non_open=regnext(first_non_open);
3711 RExC_study_chunk_recursed_count++;
3713 while ( scan && OP(scan) != END && scan < last ){
3714 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3715 node length to get a real minimum (because
3716 the folded version may be shorter) */
3717 bool unfolded_multi_char = FALSE;
3718 /* Peephole optimizer: */
3719 DEBUG_OPTIMISE_MORE_r(
3721 PerlIO_printf(Perl_debug_log,
3722 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu ",
3723 ((int) depth*2), "", (long)stopparen,
3724 (unsigned long)RExC_study_chunk_recursed_count,
3725 (unsigned long)depth, (unsigned long)recursed_depth);
3726 if (recursed_depth) {
3729 for ( j = 0 ; j < recursed_depth ; j++ ) {
3730 PerlIO_printf(Perl_debug_log,"[");
3731 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3732 PerlIO_printf(Perl_debug_log,"%d",
3733 PAREN_TEST(RExC_study_chunk_recursed +
3734 (j * RExC_study_chunk_recursed_bytes), i)
3737 PerlIO_printf(Perl_debug_log,"]");
3740 PerlIO_printf(Perl_debug_log,"\n");
3743 DEBUG_STUDYDATA("Peep:", data, depth);
3744 DEBUG_PEEP("Peep", scan, depth);
3747 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3748 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3749 * by a different invocation of reg() -- Yves
3751 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3753 /* Follow the next-chain of the current node and optimize
3754 away all the NOTHINGs from it. */
3755 if (OP(scan) != CURLYX) {
3756 const int max = (reg_off_by_arg[OP(scan)]
3758 /* I32 may be smaller than U16 on CRAYs! */
3759 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3760 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3764 /* Skip NOTHING and LONGJMP. */
3765 while ((n = regnext(n))
3766 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3767 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3768 && off + noff < max)
3770 if (reg_off_by_arg[OP(scan)])
3773 NEXT_OFF(scan) = off;
3778 /* The principal pseudo-switch. Cannot be a switch, since we
3779 look into several different things. */
3780 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3781 || OP(scan) == IFTHEN) {
3782 next = regnext(scan);
3784 /* demq: the op(next)==code check is to see if we have
3785 * "branch-branch" AFAICT */
3787 if (OP(next) == code || code == IFTHEN) {
3788 /* NOTE - There is similar code to this block below for
3789 * handling TRIE nodes on a re-study. If you change stuff here
3790 * check there too. */
3791 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3793 regnode * const startbranch=scan;
3795 if (flags & SCF_DO_SUBSTR) {
3796 /* Cannot merge strings after this. */
3797 scan_commit(pRExC_state, data, minlenp, is_inf);
3800 if (flags & SCF_DO_STCLASS)
3801 ssc_init_zero(pRExC_state, &accum);
3803 while (OP(scan) == code) {
3804 SSize_t deltanext, minnext, fake;
3806 regnode_ssc this_class;
3809 data_fake.flags = 0;
3811 data_fake.whilem_c = data->whilem_c;
3812 data_fake.last_closep = data->last_closep;
3815 data_fake.last_closep = &fake;
3817 data_fake.pos_delta = delta;
3818 next = regnext(scan);
3819 scan = NEXTOPER(scan);
3821 scan = NEXTOPER(scan);
3822 if (flags & SCF_DO_STCLASS) {
3823 ssc_init(pRExC_state, &this_class);
3824 data_fake.start_class = &this_class;
3825 f = SCF_DO_STCLASS_AND;
3827 if (flags & SCF_WHILEM_VISITED_POS)
3828 f |= SCF_WHILEM_VISITED_POS;
3830 /* we suppose the run is continuous, last=next...*/
3831 minnext = study_chunk(pRExC_state, &scan, minlenp,
3832 &deltanext, next, &data_fake, stopparen,
3833 recursed_depth, NULL, f,depth+1);
3836 if (deltanext == SSize_t_MAX) {
3837 is_inf = is_inf_internal = 1;
3839 } else if (max1 < minnext + deltanext)
3840 max1 = minnext + deltanext;
3842 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3844 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3845 if ( stopmin > minnext)
3846 stopmin = min + min1;
3847 flags &= ~SCF_DO_SUBSTR;
3849 data->flags |= SCF_SEEN_ACCEPT;
3852 if (data_fake.flags & SF_HAS_EVAL)
3853 data->flags |= SF_HAS_EVAL;
3854 data->whilem_c = data_fake.whilem_c;
3856 if (flags & SCF_DO_STCLASS)
3857 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3859 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3861 if (flags & SCF_DO_SUBSTR) {
3862 data->pos_min += min1;
3863 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3864 data->pos_delta = SSize_t_MAX;
3866 data->pos_delta += max1 - min1;
3867 if (max1 != min1 || is_inf)
3868 data->longest = &(data->longest_float);
3871 if (delta == SSize_t_MAX
3872 || SSize_t_MAX - delta - (max1 - min1) < 0)
3873 delta = SSize_t_MAX;
3875 delta += max1 - min1;
3876 if (flags & SCF_DO_STCLASS_OR) {
3877 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3879 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3880 flags &= ~SCF_DO_STCLASS;
3883 else if (flags & SCF_DO_STCLASS_AND) {
3885 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3886 flags &= ~SCF_DO_STCLASS;
3889 /* Switch to OR mode: cache the old value of
3890 * data->start_class */
3892 StructCopy(data->start_class, and_withp, regnode_ssc);
3893 flags &= ~SCF_DO_STCLASS_AND;
3894 StructCopy(&accum, data->start_class, regnode_ssc);
3895 flags |= SCF_DO_STCLASS_OR;
3899 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3900 OP( startbranch ) == BRANCH )
3904 Assuming this was/is a branch we are dealing with: 'scan'
3905 now points at the item that follows the branch sequence,
3906 whatever it is. We now start at the beginning of the
3907 sequence and look for subsequences of
3913 which would be constructed from a pattern like
3916 If we can find such a subsequence we need to turn the first
3917 element into a trie and then add the subsequent branch exact
3918 strings to the trie.
3922 1. patterns where the whole set of branches can be
3925 2. patterns where only a subset can be converted.
3927 In case 1 we can replace the whole set with a single regop
3928 for the trie. In case 2 we need to keep the start and end
3931 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3932 becomes BRANCH TRIE; BRANCH X;
3934 There is an additional case, that being where there is a
3935 common prefix, which gets split out into an EXACT like node
3936 preceding the TRIE node.
3938 If x(1..n)==tail then we can do a simple trie, if not we make
3939 a "jump" trie, such that when we match the appropriate word
3940 we "jump" to the appropriate tail node. Essentially we turn
3941 a nested if into a case structure of sorts.
3946 if (!re_trie_maxbuff) {
3947 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3948 if (!SvIOK(re_trie_maxbuff))
3949 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3951 if ( SvIV(re_trie_maxbuff)>=0 ) {
3953 regnode *first = (regnode *)NULL;
3954 regnode *last = (regnode *)NULL;
3955 regnode *tail = scan;
3960 SV * const mysv = sv_newmortal(); /* for dumping */
3962 /* var tail is used because there may be a TAIL
3963 regop in the way. Ie, the exacts will point to the
3964 thing following the TAIL, but the last branch will
3965 point at the TAIL. So we advance tail. If we
3966 have nested (?:) we may have to move through several
3970 while ( OP( tail ) == TAIL ) {
3971 /* this is the TAIL generated by (?:) */
3972 tail = regnext( tail );
3976 DEBUG_TRIE_COMPILE_r({
3977 regprop(RExC_rx, mysv, tail, NULL);
3978 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3979 (int)depth * 2 + 2, "",
3980 "Looking for TRIE'able sequences. Tail node is: ",
3981 SvPV_nolen_const( mysv )
3987 Step through the branches
3988 cur represents each branch,
3989 noper is the first thing to be matched as part
3991 noper_next is the regnext() of that node.
3993 We normally handle a case like this
3994 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3995 support building with NOJUMPTRIE, which restricts
3996 the trie logic to structures like /FOO|BAR/.
3998 If noper is a trieable nodetype then the branch is
3999 a possible optimization target. If we are building
4000 under NOJUMPTRIE then we require that noper_next is
4001 the same as scan (our current position in the regex
4004 Once we have two or more consecutive such branches
4005 we can create a trie of the EXACT's contents and
4006 stitch it in place into the program.
4008 If the sequence represents all of the branches in
4009 the alternation we replace the entire thing with a
4012 Otherwise when it is a subsequence we need to
4013 stitch it in place and replace only the relevant
4014 branches. This means the first branch has to remain
4015 as it is used by the alternation logic, and its
4016 next pointer, and needs to be repointed at the item
4017 on the branch chain following the last branch we
4018 have optimized away.
4020 This could be either a BRANCH, in which case the
4021 subsequence is internal, or it could be the item
4022 following the branch sequence in which case the
4023 subsequence is at the end (which does not
4024 necessarily mean the first node is the start of the
4027 TRIE_TYPE(X) is a define which maps the optype to a
4031 ----------------+-----------
4035 EXACTFU_SS | EXACTFU
4040 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
4041 ( EXACT == (X) ) ? EXACT : \
4042 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
4043 ( EXACTFA == (X) ) ? EXACTFA : \
4046 /* dont use tail as the end marker for this traverse */
4047 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4048 regnode * const noper = NEXTOPER( cur );
4049 U8 noper_type = OP( noper );
4050 U8 noper_trietype = TRIE_TYPE( noper_type );
4051 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4052 regnode * const noper_next = regnext( noper );
4053 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4054 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4057 DEBUG_TRIE_COMPILE_r({
4058 regprop(RExC_rx, mysv, cur, NULL);
4059 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4060 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
4062 regprop(RExC_rx, mysv, noper, NULL);
4063 PerlIO_printf( Perl_debug_log, " -> %s",
4064 SvPV_nolen_const(mysv));
4067 regprop(RExC_rx, mysv, noper_next, NULL);
4068 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4069 SvPV_nolen_const(mysv));
4071 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4072 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4073 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4077 /* Is noper a trieable nodetype that can be merged
4078 * with the current trie (if there is one)? */
4082 ( noper_trietype == NOTHING)
4083 || ( trietype == NOTHING )
4084 || ( trietype == noper_trietype )
4087 && noper_next == tail
4091 /* Handle mergable triable node Either we are
4092 * the first node in a new trieable sequence,
4093 * in which case we do some bookkeeping,
4094 * otherwise we update the end pointer. */
4097 if ( noper_trietype == NOTHING ) {
4098 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4099 regnode * const noper_next = regnext( noper );
4100 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4101 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4104 if ( noper_next_trietype ) {
4105 trietype = noper_next_trietype;
4106 } else if (noper_next_type) {
4107 /* a NOTHING regop is 1 regop wide.
4108 * We need at least two for a trie
4109 * so we can't merge this in */
4113 trietype = noper_trietype;
4116 if ( trietype == NOTHING )
4117 trietype = noper_trietype;
4122 } /* end handle mergable triable node */
4124 /* handle unmergable node -
4125 * noper may either be a triable node which can
4126 * not be tried together with the current trie,
4127 * or a non triable node */
4129 /* If last is set and trietype is not
4130 * NOTHING then we have found at least two
4131 * triable branch sequences in a row of a
4132 * similar trietype so we can turn them
4133 * into a trie. If/when we allow NOTHING to
4134 * start a trie sequence this condition
4135 * will be required, and it isn't expensive
4136 * so we leave it in for now. */
4137 if ( trietype && trietype != NOTHING )
4138 make_trie( pRExC_state,
4139 startbranch, first, cur, tail,
4140 count, trietype, depth+1 );
4141 last = NULL; /* note: we clear/update
4142 first, trietype etc below,
4143 so we dont do it here */
4147 && noper_next == tail
4150 /* noper is triable, so we can start a new
4154 trietype = noper_trietype;
4156 /* if we already saw a first but the
4157 * current node is not triable then we have
4158 * to reset the first information. */
4163 } /* end handle unmergable node */
4164 } /* loop over branches */
4165 DEBUG_TRIE_COMPILE_r({
4166 regprop(RExC_rx, mysv, cur, NULL);
4167 PerlIO_printf( Perl_debug_log,
4168 "%*s- %s (%d) <SCAN FINISHED>\n",
4170 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4173 if ( last && trietype ) {
4174 if ( trietype != NOTHING ) {
4175 /* the last branch of the sequence was part of
4176 * a trie, so we have to construct it here
4177 * outside of the loop */
4178 made= make_trie( pRExC_state, startbranch,
4179 first, scan, tail, count,
4180 trietype, depth+1 );
4181 #ifdef TRIE_STUDY_OPT
4182 if ( ((made == MADE_EXACT_TRIE &&
4183 startbranch == first)
4184 || ( first_non_open == first )) &&
4186 flags |= SCF_TRIE_RESTUDY;
4187 if ( startbranch == first
4190 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4195 /* at this point we know whatever we have is a
4196 * NOTHING sequence/branch AND if 'startbranch'
4197 * is 'first' then we can turn the whole thing
4200 if ( startbranch == first ) {
4202 /* the entire thing is a NOTHING sequence,
4203 * something like this: (?:|) So we can
4204 * turn it into a plain NOTHING op. */
4205 DEBUG_TRIE_COMPILE_r({
4206 regprop(RExC_rx, mysv, cur, NULL);
4207 PerlIO_printf( Perl_debug_log,
4208 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4209 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4212 OP(startbranch)= NOTHING;
4213 NEXT_OFF(startbranch)= tail - startbranch;
4214 for ( opt= startbranch + 1; opt < tail ; opt++ )
4218 } /* end if ( last) */
4219 } /* TRIE_MAXBUF is non zero */
4224 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4225 scan = NEXTOPER(NEXTOPER(scan));
4226 } else /* single branch is optimized. */
4227 scan = NEXTOPER(scan);
4229 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4230 scan_frame *newframe = NULL;
4234 U32 my_recursed_depth= recursed_depth;
4236 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4237 /* set the pointer */
4238 if (OP(scan) == GOSUB) {
4240 RExC_recurse[ARG2L(scan)] = scan;
4241 start = RExC_open_parens[paren-1];
4242 end = RExC_close_parens[paren-1];
4245 start = RExC_rxi->program + 1;
4248 /* this code is intended to handle expanding regex "subs" so
4249 * we can apply various optimizations. For instance with
4250 * /(?(DEFINE)(?<foo>foo)(?<bar>bar))(?&foo)(?&bar)/ we
4251 * want to recognize that the mandatory substr is going to be
4253 * However if we are not in SCF_DO_SUBSTR mode then there is
4254 * no point in doing this, and it can cause a serious slowdown.
4256 * Note also that this was a workaround for the core problem
4257 * which was that during compilation logic the excessive
4258 * recursion resulted in slowly consuming all the memory on
4259 * the box. Exactly what causes this is unclear. It does not
4260 * appear to be directly related to allocating the "visited"
4261 * bitmaps that is RExC_study_chunk_recursed.
4263 * In reality study_chunk() does far far too much, and probably
4264 * this an other issues would go away if we split it into
4265 * multiple components.
4269 if (flags & SCF_DO_SUBSTR) {
4273 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4275 /* it is quite possible that there are more efficient ways
4276 * to do this. We maintain a bitmap per level of recursion
4277 * of which patterns we have entered so we can detect if a
4278 * pattern creates a possible infinite loop. When we
4279 * recurse down a level we copy the previous levels bitmap
4280 * down. When we are at recursion level 0 we zero the top
4281 * level bitmap. It would be nice to implement a different
4282 * more efficient way of doing this. In particular the top
4283 * level bitmap may be unnecessary.
4285 if (!recursed_depth) {
4286 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4288 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4289 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4290 RExC_study_chunk_recursed_bytes, U8);
4292 /* we havent recursed into this paren yet, so recurse into it */
4293 DEBUG_STUDYDATA("set:", data,depth);
4294 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4295 my_recursed_depth= recursed_depth + 1;
4296 Newx(newframe,1,scan_frame);
4298 DEBUG_STUDYDATA("inf:", data,depth);
4299 /* some form of infinite recursion, assume infinite length
4301 if (flags & SCF_DO_SUBSTR) {
4302 scan_commit(pRExC_state, data, minlenp, is_inf);
4303 data->longest = &(data->longest_float);
4305 is_inf = is_inf_internal = 1;
4306 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4307 ssc_anything(data->start_class);
4308 flags &= ~SCF_DO_STCLASS;
4312 Newx(newframe,1,scan_frame);
4315 end = regnext(scan);
4320 SAVEFREEPV(newframe);
4321 newframe->next = regnext(scan);
4322 newframe->last = last;
4323 newframe->stop = stopparen;
4324 newframe->prev = frame;
4325 newframe->prev_recursed_depth = recursed_depth;
4327 DEBUG_STUDYDATA("frame-new:",data,depth);
4328 DEBUG_PEEP("fnew", scan, depth);
4335 recursed_depth= my_recursed_depth;
4340 else if (OP(scan) == EXACT) {
4341 SSize_t l = STR_LEN(scan);
4344 const U8 * const s = (U8*)STRING(scan);
4345 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4346 l = utf8_length(s, s + l);
4348 uc = *((U8*)STRING(scan));
4351 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4352 /* The code below prefers earlier match for fixed
4353 offset, later match for variable offset. */
4354 if (data->last_end == -1) { /* Update the start info. */
4355 data->last_start_min = data->pos_min;
4356 data->last_start_max = is_inf
4357 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4359 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4361 SvUTF8_on(data->last_found);
4363 SV * const sv = data->last_found;
4364 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4365 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4366 if (mg && mg->mg_len >= 0)
4367 mg->mg_len += utf8_length((U8*)STRING(scan),
4368 (U8*)STRING(scan)+STR_LEN(scan));
4370 data->last_end = data->pos_min + l;
4371 data->pos_min += l; /* As in the first entry. */
4372 data->flags &= ~SF_BEFORE_EOL;
4375 /* ANDing the code point leaves at most it, and not in locale, and
4376 * can't match null string */
4377 if (flags & SCF_DO_STCLASS_AND) {
4378 ssc_cp_and(data->start_class, uc);
4379 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4380 ssc_clear_locale(data->start_class);
4382 else if (flags & SCF_DO_STCLASS_OR) {
4383 ssc_add_cp(data->start_class, uc);
4384 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4386 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4387 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4389 flags &= ~SCF_DO_STCLASS;
4391 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4393 SSize_t l = STR_LEN(scan);
4394 UV uc = *((U8*)STRING(scan));
4395 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4396 separate code points */
4397 const U8 * s = (U8*)STRING(scan);
4399 /* Search for fixed substrings supports EXACT only. */
4400 if (flags & SCF_DO_SUBSTR) {
4402 scan_commit(pRExC_state, data, minlenp, is_inf);
4405 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4406 l = utf8_length(s, s + l);
4408 if (unfolded_multi_char) {
4409 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4411 min += l - min_subtract;
4413 delta += min_subtract;
4414 if (flags & SCF_DO_SUBSTR) {
4415 data->pos_min += l - min_subtract;
4416 if (data->pos_min < 0) {
4419 data->pos_delta += min_subtract;
4421 data->longest = &(data->longest_float);
4425 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4426 ssc_clear_locale(data->start_class);
4431 /* We punt and assume can match anything if the node begins
4432 * with a multi-character fold. Things are complicated. For
4433 * example, /ffi/i could match any of:
4434 * "\N{LATIN SMALL LIGATURE FFI}"
4435 * "\N{LATIN SMALL LIGATURE FF}I"
4436 * "F\N{LATIN SMALL LIGATURE FI}"
4437 * plus several other things; and making sure we have all the
4438 * possibilities is hard. */
4439 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4441 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4445 /* Any Latin1 range character can potentially match any
4446 * other depending on the locale */
4447 if (OP(scan) == EXACTFL) {
4448 _invlist_union(EXACTF_invlist, PL_Latin1,
4452 /* But otherwise, it matches at least itself. We can
4453 * quickly tell if it has a distinct fold, and if so,
4454 * it matches that as well */
4455 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4456 if (IS_IN_SOME_FOLD_L1(uc)) {
4457 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4458 PL_fold_latin1[uc]);
4462 /* Some characters match above-Latin1 ones under /i. This
4463 * is true of EXACTFL ones when the locale is UTF-8 */
4464 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4465 && (! isASCII(uc) || (OP(scan) != EXACTFA
4466 && OP(scan) != EXACTFA_NO_TRIE)))
4468 add_above_Latin1_folds(pRExC_state,
4474 else { /* Pattern is UTF-8 */
4475 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4476 STRLEN foldlen = UTF8SKIP(s);
4477 const U8* e = s + STR_LEN(scan);
4480 /* The only code points that aren't folded in a UTF EXACTFish
4481 * node are are the problematic ones in EXACTFL nodes */
4482 if (OP(scan) == EXACTFL
4483 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4485 /* We need to check for the possibility that this EXACTFL
4486 * node begins with a multi-char fold. Therefore we fold
4487 * the first few characters of it so that we can make that
4492 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4494 *(d++) = (U8) toFOLD(*s);
4499 to_utf8_fold(s, d, &len);
4505 /* And set up so the code below that looks in this folded
4506 * buffer instead of the node's string */
4508 foldlen = UTF8SKIP(folded);
4512 /* When we reach here 's' points to the fold of the first
4513 * character(s) of the node; and 'e' points to far enough along
4514 * the folded string to be just past any possible multi-char
4515 * fold. 'foldlen' is the length in bytes of the first
4518 * Unlike the non-UTF-8 case, the macro for determining if a
4519 * string is a multi-char fold requires all the characters to
4520 * already be folded. This is because of all the complications
4521 * if not. Note that they are folded anyway, except in EXACTFL
4522 * nodes. Like the non-UTF case above, we punt if the node
4523 * begins with a multi-char fold */
4525 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4527 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4529 else { /* Single char fold */
4531 /* It matches all the things that fold to it, which are
4532 * found in PL_utf8_foldclosures (including itself) */
4533 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4534 if (! PL_utf8_foldclosures) {
4535 _load_PL_utf8_foldclosures();
4537 if ((listp = hv_fetch(PL_utf8_foldclosures,
4538 (char *) s, foldlen, FALSE)))
4540 AV* list = (AV*) *listp;
4542 for (k = 0; k <= av_tindex(list); k++) {
4543 SV** c_p = av_fetch(list, k, FALSE);
4549 /* /aa doesn't allow folds between ASCII and non- */
4550 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4551 && isASCII(c) != isASCII(uc))
4556 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4561 if (flags & SCF_DO_STCLASS_AND) {
4562 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4563 ANYOF_POSIXL_ZERO(data->start_class);
4564 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4566 else if (flags & SCF_DO_STCLASS_OR) {
4567 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4568 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4570 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4571 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4573 flags &= ~SCF_DO_STCLASS;
4574 SvREFCNT_dec(EXACTF_invlist);
4576 else if (REGNODE_VARIES(OP(scan))) {
4577 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4578 I32 fl = 0, f = flags;
4579 regnode * const oscan = scan;
4580 regnode_ssc this_class;
4581 regnode_ssc *oclass = NULL;
4582 I32 next_is_eval = 0;
4584 switch (PL_regkind[OP(scan)]) {
4585 case WHILEM: /* End of (?:...)* . */
4586 scan = NEXTOPER(scan);
4589 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4590 next = NEXTOPER(scan);
4591 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4593 maxcount = REG_INFTY;
4594 next = regnext(scan);
4595 scan = NEXTOPER(scan);
4599 if (flags & SCF_DO_SUBSTR)
4604 if (flags & SCF_DO_STCLASS) {
4606 maxcount = REG_INFTY;
4607 next = regnext(scan);
4608 scan = NEXTOPER(scan);
4611 if (flags & SCF_DO_SUBSTR) {
4612 scan_commit(pRExC_state, data, minlenp, is_inf);
4613 /* Cannot extend fixed substrings */
4614 data->longest = &(data->longest_float);
4616 is_inf = is_inf_internal = 1;
4617 scan = regnext(scan);
4618 goto optimize_curly_tail;
4620 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4621 && (scan->flags == stopparen))
4626 mincount = ARG1(scan);
4627 maxcount = ARG2(scan);
4629 next = regnext(scan);
4630 if (OP(scan) == CURLYX) {
4631 I32 lp = (data ? *(data->last_closep) : 0);
4632 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4634 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4635 next_is_eval = (OP(scan) == EVAL);
4637 if (flags & SCF_DO_SUBSTR) {
4639 scan_commit(pRExC_state, data, minlenp, is_inf);
4640 /* Cannot extend fixed substrings */
4641 pos_before = data->pos_min;
4645 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4647 data->flags |= SF_IS_INF;
4649 if (flags & SCF_DO_STCLASS) {
4650 ssc_init(pRExC_state, &this_class);
4651 oclass = data->start_class;
4652 data->start_class = &this_class;
4653 f |= SCF_DO_STCLASS_AND;
4654 f &= ~SCF_DO_STCLASS_OR;
4656 /* Exclude from super-linear cache processing any {n,m}
4657 regops for which the combination of input pos and regex
4658 pos is not enough information to determine if a match
4661 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4662 regex pos at the \s*, the prospects for a match depend not
4663 only on the input position but also on how many (bar\s*)
4664 repeats into the {4,8} we are. */
4665 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4666 f &= ~SCF_WHILEM_VISITED_POS;
4668 /* This will finish on WHILEM, setting scan, or on NULL: */
4669 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4670 last, data, stopparen, recursed_depth, NULL,
4672 ? (f & ~SCF_DO_SUBSTR)
4676 if (flags & SCF_DO_STCLASS)
4677 data->start_class = oclass;
4678 if (mincount == 0 || minnext == 0) {
4679 if (flags & SCF_DO_STCLASS_OR) {
4680 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4682 else if (flags & SCF_DO_STCLASS_AND) {
4683 /* Switch to OR mode: cache the old value of
4684 * data->start_class */
4686 StructCopy(data->start_class, and_withp, regnode_ssc);
4687 flags &= ~SCF_DO_STCLASS_AND;
4688 StructCopy(&this_class, data->start_class, regnode_ssc);
4689 flags |= SCF_DO_STCLASS_OR;
4690 ANYOF_FLAGS(data->start_class)
4691 |= SSC_MATCHES_EMPTY_STRING;
4693 } else { /* Non-zero len */
4694 if (flags & SCF_DO_STCLASS_OR) {
4695 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4696 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4698 else if (flags & SCF_DO_STCLASS_AND)
4699 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4700 flags &= ~SCF_DO_STCLASS;
4702 if (!scan) /* It was not CURLYX, but CURLY. */
4704 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4705 /* ? quantifier ok, except for (?{ ... }) */
4706 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4707 && (minnext == 0) && (deltanext == 0)
4708 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4709 && maxcount <= REG_INFTY/3) /* Complement check for big
4712 /* Fatal warnings may leak the regexp without this: */
4713 SAVEFREESV(RExC_rx_sv);
4714 ckWARNreg(RExC_parse,
4715 "Quantifier unexpected on zero-length expression");
4716 (void)ReREFCNT_inc(RExC_rx_sv);
4719 min += minnext * mincount;
4720 is_inf_internal |= deltanext == SSize_t_MAX
4721 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4722 is_inf |= is_inf_internal;
4724 delta = SSize_t_MAX;
4726 delta += (minnext + deltanext) * maxcount
4727 - minnext * mincount;
4729 /* Try powerful optimization CURLYX => CURLYN. */
4730 if ( OP(oscan) == CURLYX && data
4731 && data->flags & SF_IN_PAR
4732 && !(data->flags & SF_HAS_EVAL)
4733 && !deltanext && minnext == 1 ) {
4734 /* Try to optimize to CURLYN. */
4735 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4736 regnode * const nxt1 = nxt;
4743 if (!REGNODE_SIMPLE(OP(nxt))
4744 && !(PL_regkind[OP(nxt)] == EXACT
4745 && STR_LEN(nxt) == 1))
4751 if (OP(nxt) != CLOSE)
4753 if (RExC_open_parens) {
4754 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4755 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4757 /* Now we know that nxt2 is the only contents: */
4758 oscan->flags = (U8)ARG(nxt);
4760 OP(nxt1) = NOTHING; /* was OPEN. */
4763 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4764 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4765 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4766 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4767 OP(nxt + 1) = OPTIMIZED; /* was count. */
4768 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4773 /* Try optimization CURLYX => CURLYM. */
4774 if ( OP(oscan) == CURLYX && data
4775 && !(data->flags & SF_HAS_PAR)
4776 && !(data->flags & SF_HAS_EVAL)
4777 && !deltanext /* atom is fixed width */
4778 && minnext != 0 /* CURLYM can't handle zero width */
4780 /* Nor characters whose fold at run-time may be
4781 * multi-character */
4782 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4784 /* XXXX How to optimize if data == 0? */
4785 /* Optimize to a simpler form. */
4786 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4790 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4791 && (OP(nxt2) != WHILEM))
4793 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4794 /* Need to optimize away parenths. */
4795 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4796 /* Set the parenth number. */
4797 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4799 oscan->flags = (U8)ARG(nxt);
4800 if (RExC_open_parens) {
4801 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4802 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4804 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4805 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4808 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4809 OP(nxt + 1) = OPTIMIZED; /* was count. */
4810 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4811 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4814 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4815 regnode *nnxt = regnext(nxt1);
4817 if (reg_off_by_arg[OP(nxt1)])
4818 ARG_SET(nxt1, nxt2 - nxt1);
4819 else if (nxt2 - nxt1 < U16_MAX)
4820 NEXT_OFF(nxt1) = nxt2 - nxt1;
4822 OP(nxt) = NOTHING; /* Cannot beautify */
4827 /* Optimize again: */
4828 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4829 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4834 else if ((OP(oscan) == CURLYX)
4835 && (flags & SCF_WHILEM_VISITED_POS)
4836 /* See the comment on a similar expression above.
4837 However, this time it's not a subexpression
4838 we care about, but the expression itself. */
4839 && (maxcount == REG_INFTY)
4840 && data && ++data->whilem_c < 16) {
4841 /* This stays as CURLYX, we can put the count/of pair. */
4842 /* Find WHILEM (as in regexec.c) */
4843 regnode *nxt = oscan + NEXT_OFF(oscan);
4845 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4847 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4848 | (RExC_whilem_seen << 4)); /* On WHILEM */
4850 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4852 if (flags & SCF_DO_SUBSTR) {
4853 SV *last_str = NULL;
4854 STRLEN last_chrs = 0;
4855 int counted = mincount != 0;
4857 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4859 SSize_t b = pos_before >= data->last_start_min
4860 ? pos_before : data->last_start_min;
4862 const char * const s = SvPV_const(data->last_found, l);
4863 SSize_t old = b - data->last_start_min;
4866 old = utf8_hop((U8*)s, old) - (U8*)s;
4868 /* Get the added string: */
4869 last_str = newSVpvn_utf8(s + old, l, UTF);
4870 last_chrs = UTF ? utf8_length((U8*)(s + old),
4871 (U8*)(s + old + l)) : l;
4872 if (deltanext == 0 && pos_before == b) {
4873 /* What was added is a constant string */
4876 SvGROW(last_str, (mincount * l) + 1);
4877 repeatcpy(SvPVX(last_str) + l,
4878 SvPVX_const(last_str), l,
4880 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4881 /* Add additional parts. */
4882 SvCUR_set(data->last_found,
4883 SvCUR(data->last_found) - l);
4884 sv_catsv(data->last_found, last_str);
4886 SV * sv = data->last_found;
4888 SvUTF8(sv) && SvMAGICAL(sv) ?
4889 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4890 if (mg && mg->mg_len >= 0)
4891 mg->mg_len += last_chrs * (mincount-1);
4893 last_chrs *= mincount;
4894 data->last_end += l * (mincount - 1);
4897 /* start offset must point into the last copy */
4898 data->last_start_min += minnext * (mincount - 1);
4899 data->last_start_max += is_inf ? SSize_t_MAX
4900 : (maxcount - 1) * (minnext + data->pos_delta);
4903 /* It is counted once already... */
4904 data->pos_min += minnext * (mincount - counted);
4906 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4907 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4908 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4909 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4911 if (deltanext != SSize_t_MAX)
4912 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4913 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4914 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4916 if (deltanext == SSize_t_MAX
4917 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4918 data->pos_delta = SSize_t_MAX;
4920 data->pos_delta += - counted * deltanext +
4921 (minnext + deltanext) * maxcount - minnext * mincount;
4922 if (mincount != maxcount) {
4923 /* Cannot extend fixed substrings found inside
4925 scan_commit(pRExC_state, data, minlenp, is_inf);
4926 if (mincount && last_str) {
4927 SV * const sv = data->last_found;
4928 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4929 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4933 sv_setsv(sv, last_str);
4934 data->last_end = data->pos_min;
4935 data->last_start_min = data->pos_min - last_chrs;
4936 data->last_start_max = is_inf
4938 : data->pos_min + data->pos_delta - last_chrs;
4940 data->longest = &(data->longest_float);
4942 SvREFCNT_dec(last_str);
4944 if (data && (fl & SF_HAS_EVAL))
4945 data->flags |= SF_HAS_EVAL;
4946 optimize_curly_tail:
4947 if (OP(oscan) != CURLYX) {
4948 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4950 NEXT_OFF(oscan) += NEXT_OFF(next);
4956 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4961 if (flags & SCF_DO_SUBSTR) {
4962 /* Cannot expect anything... */
4963 scan_commit(pRExC_state, data, minlenp, is_inf);
4964 data->longest = &(data->longest_float);
4966 is_inf = is_inf_internal = 1;
4967 if (flags & SCF_DO_STCLASS_OR) {
4968 if (OP(scan) == CLUMP) {
4969 /* Actually is any start char, but very few code points
4970 * aren't start characters */
4971 ssc_match_all_cp(data->start_class);
4974 ssc_anything(data->start_class);
4977 flags &= ~SCF_DO_STCLASS;
4981 else if (OP(scan) == LNBREAK) {
4982 if (flags & SCF_DO_STCLASS) {
4983 if (flags & SCF_DO_STCLASS_AND) {
4984 ssc_intersection(data->start_class,
4985 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4986 ssc_clear_locale(data->start_class);
4987 ANYOF_FLAGS(data->start_class)
4988 &= ~SSC_MATCHES_EMPTY_STRING;
4990 else if (flags & SCF_DO_STCLASS_OR) {
4991 ssc_union(data->start_class,
4992 PL_XPosix_ptrs[_CC_VERTSPACE],
4994 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4996 /* See commit msg for
4997 * 749e076fceedeb708a624933726e7989f2302f6a */
4998 ANYOF_FLAGS(data->start_class)
4999 &= ~SSC_MATCHES_EMPTY_STRING;
5001 flags &= ~SCF_DO_STCLASS;
5004 delta++; /* Because of the 2 char string cr-lf */
5005 if (flags & SCF_DO_SUBSTR) {
5006 /* Cannot expect anything... */
5007 scan_commit(pRExC_state, data, minlenp, is_inf);
5009 data->pos_delta += 1;
5010 data->longest = &(data->longest_float);
5013 else if (REGNODE_SIMPLE(OP(scan))) {
5015 if (flags & SCF_DO_SUBSTR) {
5016 scan_commit(pRExC_state, data, minlenp, is_inf);
5020 if (flags & SCF_DO_STCLASS) {
5022 SV* my_invlist = sv_2mortal(_new_invlist(0));
5025 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5026 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5028 /* Some of the logic below assumes that switching
5029 locale on will only add false positives. */
5034 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5039 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5040 ssc_match_all_cp(data->start_class);
5045 SV* REG_ANY_invlist = _new_invlist(2);
5046 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5048 if (flags & SCF_DO_STCLASS_OR) {
5049 ssc_union(data->start_class,
5051 TRUE /* TRUE => invert, hence all but \n
5055 else if (flags & SCF_DO_STCLASS_AND) {
5056 ssc_intersection(data->start_class,
5058 TRUE /* TRUE => invert */
5060 ssc_clear_locale(data->start_class);
5062 SvREFCNT_dec_NN(REG_ANY_invlist);
5067 if (flags & SCF_DO_STCLASS_AND)
5068 ssc_and(pRExC_state, data->start_class,
5069 (regnode_charclass *) scan);
5071 ssc_or(pRExC_state, data->start_class,
5072 (regnode_charclass *) scan);
5080 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5081 if (flags & SCF_DO_STCLASS_AND) {
5082 bool was_there = cBOOL(
5083 ANYOF_POSIXL_TEST(data->start_class,
5085 ANYOF_POSIXL_ZERO(data->start_class);
5086 if (was_there) { /* Do an AND */
5087 ANYOF_POSIXL_SET(data->start_class, namedclass);
5089 /* No individual code points can now match */
5090 data->start_class->invlist
5091 = sv_2mortal(_new_invlist(0));
5094 int complement = namedclass + ((invert) ? -1 : 1);
5096 assert(flags & SCF_DO_STCLASS_OR);
5098 /* If the complement of this class was already there,
5099 * the result is that they match all code points,
5100 * (\d + \D == everything). Remove the classes from
5101 * future consideration. Locale is not relevant in
5103 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5104 ssc_match_all_cp(data->start_class);
5105 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5106 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5108 else { /* The usual case; just add this class to the
5110 ANYOF_POSIXL_SET(data->start_class, namedclass);
5115 case NPOSIXA: /* For these, we always know the exact set of
5120 if (FLAGS(scan) == _CC_ASCII) {
5121 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5124 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5125 PL_XPosix_ptrs[_CC_ASCII],
5136 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5138 /* NPOSIXD matches all upper Latin1 code points unless the
5139 * target string being matched is UTF-8, which is
5140 * unknowable until match time. Since we are going to
5141 * invert, we want to get rid of all of them so that the
5142 * inversion will match all */
5143 if (OP(scan) == NPOSIXD) {
5144 _invlist_subtract(my_invlist, PL_UpperLatin1,
5150 if (flags & SCF_DO_STCLASS_AND) {
5151 ssc_intersection(data->start_class, my_invlist, invert);
5152 ssc_clear_locale(data->start_class);
5155 assert(flags & SCF_DO_STCLASS_OR);
5156 ssc_union(data->start_class, my_invlist, invert);
5159 if (flags & SCF_DO_STCLASS_OR)
5160 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5161 flags &= ~SCF_DO_STCLASS;
5164 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5165 data->flags |= (OP(scan) == MEOL
5168 scan_commit(pRExC_state, data, minlenp, is_inf);
5171 else if ( PL_regkind[OP(scan)] == BRANCHJ
5172 /* Lookbehind, or need to calculate parens/evals/stclass: */
5173 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5174 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5176 if ( OP(scan) == UNLESSM &&
5178 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5179 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5182 regnode *upto= regnext(scan);
5184 SV * const mysv_val=sv_newmortal();
5185 DEBUG_STUDYDATA("OPFAIL",data,depth);
5187 /*DEBUG_PARSE_MSG("opfail");*/
5188 regprop(RExC_rx, mysv_val, upto, NULL);
5189 PerlIO_printf(Perl_debug_log,
5190 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5191 SvPV_nolen_const(mysv_val),
5192 (IV)REG_NODE_NUM(upto),
5197 NEXT_OFF(scan) = upto - scan;
5198 for (opt= scan + 1; opt < upto ; opt++)
5199 OP(opt) = OPTIMIZED;
5203 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5204 || OP(scan) == UNLESSM )
5206 /* Negative Lookahead/lookbehind
5207 In this case we can't do fixed string optimisation.
5210 SSize_t deltanext, minnext, fake = 0;
5215 data_fake.flags = 0;
5217 data_fake.whilem_c = data->whilem_c;
5218 data_fake.last_closep = data->last_closep;
5221 data_fake.last_closep = &fake;
5222 data_fake.pos_delta = delta;
5223 if ( flags & SCF_DO_STCLASS && !scan->flags
5224 && OP(scan) == IFMATCH ) { /* Lookahead */
5225 ssc_init(pRExC_state, &intrnl);
5226 data_fake.start_class = &intrnl;
5227 f |= SCF_DO_STCLASS_AND;
5229 if (flags & SCF_WHILEM_VISITED_POS)
5230 f |= SCF_WHILEM_VISITED_POS;
5231 next = regnext(scan);
5232 nscan = NEXTOPER(NEXTOPER(scan));
5233 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5234 last, &data_fake, stopparen,
5235 recursed_depth, NULL, f, depth+1);
5238 FAIL("Variable length lookbehind not implemented");
5240 else if (minnext > (I32)U8_MAX) {
5241 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5244 scan->flags = (U8)minnext;
5247 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5249 if (data_fake.flags & SF_HAS_EVAL)
5250 data->flags |= SF_HAS_EVAL;
5251 data->whilem_c = data_fake.whilem_c;
5253 if (f & SCF_DO_STCLASS_AND) {
5254 if (flags & SCF_DO_STCLASS_OR) {
5255 /* OR before, AND after: ideally we would recurse with
5256 * data_fake to get the AND applied by study of the
5257 * remainder of the pattern, and then derecurse;
5258 * *** HACK *** for now just treat as "no information".
5259 * See [perl #56690].
5261 ssc_init(pRExC_state, data->start_class);
5263 /* AND before and after: combine and continue. These
5264 * assertions are zero-length, so can match an EMPTY
5266 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5267 ANYOF_FLAGS(data->start_class)
5268 |= SSC_MATCHES_EMPTY_STRING;
5272 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5274 /* Positive Lookahead/lookbehind
5275 In this case we can do fixed string optimisation,
5276 but we must be careful about it. Note in the case of
5277 lookbehind the positions will be offset by the minimum
5278 length of the pattern, something we won't know about
5279 until after the recurse.
5281 SSize_t deltanext, fake = 0;
5285 /* We use SAVEFREEPV so that when the full compile
5286 is finished perl will clean up the allocated
5287 minlens when it's all done. This way we don't
5288 have to worry about freeing them when we know
5289 they wont be used, which would be a pain.
5292 Newx( minnextp, 1, SSize_t );
5293 SAVEFREEPV(minnextp);
5296 StructCopy(data, &data_fake, scan_data_t);
5297 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5300 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5301 data_fake.last_found=newSVsv(data->last_found);
5305 data_fake.last_closep = &fake;
5306 data_fake.flags = 0;
5307 data_fake.pos_delta = delta;
5309 data_fake.flags |= SF_IS_INF;
5310 if ( flags & SCF_DO_STCLASS && !scan->flags
5311 && OP(scan) == IFMATCH ) { /* Lookahead */
5312 ssc_init(pRExC_state, &intrnl);
5313 data_fake.start_class = &intrnl;
5314 f |= SCF_DO_STCLASS_AND;
5316 if (flags & SCF_WHILEM_VISITED_POS)
5317 f |= SCF_WHILEM_VISITED_POS;
5318 next = regnext(scan);
5319 nscan = NEXTOPER(NEXTOPER(scan));
5321 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5322 &deltanext, last, &data_fake,
5323 stopparen, recursed_depth, NULL,
5327 FAIL("Variable length lookbehind not implemented");
5329 else if (*minnextp > (I32)U8_MAX) {
5330 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5333 scan->flags = (U8)*minnextp;
5338 if (f & SCF_DO_STCLASS_AND) {
5339 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5340 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5343 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5345 if (data_fake.flags & SF_HAS_EVAL)
5346 data->flags |= SF_HAS_EVAL;
5347 data->whilem_c = data_fake.whilem_c;
5348 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5349 if (RExC_rx->minlen<*minnextp)
5350 RExC_rx->minlen=*minnextp;
5351 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5352 SvREFCNT_dec_NN(data_fake.last_found);
5354 if ( data_fake.minlen_fixed != minlenp )
5356 data->offset_fixed= data_fake.offset_fixed;
5357 data->minlen_fixed= data_fake.minlen_fixed;
5358 data->lookbehind_fixed+= scan->flags;
5360 if ( data_fake.minlen_float != minlenp )
5362 data->minlen_float= data_fake.minlen_float;
5363 data->offset_float_min=data_fake.offset_float_min;
5364 data->offset_float_max=data_fake.offset_float_max;
5365 data->lookbehind_float+= scan->flags;
5372 else if (OP(scan) == OPEN) {
5373 if (stopparen != (I32)ARG(scan))
5376 else if (OP(scan) == CLOSE) {
5377 if (stopparen == (I32)ARG(scan)) {
5380 if ((I32)ARG(scan) == is_par) {
5381 next = regnext(scan);
5383 if ( next && (OP(next) != WHILEM) && next < last)
5384 is_par = 0; /* Disable optimization */
5387 *(data->last_closep) = ARG(scan);
5389 else if (OP(scan) == EVAL) {
5391 data->flags |= SF_HAS_EVAL;
5393 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5394 if (flags & SCF_DO_SUBSTR) {
5395 scan_commit(pRExC_state, data, minlenp, is_inf);
5396 flags &= ~SCF_DO_SUBSTR;
5398 if (data && OP(scan)==ACCEPT) {
5399 data->flags |= SCF_SEEN_ACCEPT;
5404 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5406 if (flags & SCF_DO_SUBSTR) {
5407 scan_commit(pRExC_state, data, minlenp, is_inf);
5408 data->longest = &(data->longest_float);
5410 is_inf = is_inf_internal = 1;
5411 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5412 ssc_anything(data->start_class);
5413 flags &= ~SCF_DO_STCLASS;
5415 else if (OP(scan) == GPOS) {
5416 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5417 !(delta || is_inf || (data && data->pos_delta)))
5419 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5420 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5421 if (RExC_rx->gofs < (STRLEN)min)
5422 RExC_rx->gofs = min;
5424 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5428 #ifdef TRIE_STUDY_OPT
5429 #ifdef FULL_TRIE_STUDY
5430 else if (PL_regkind[OP(scan)] == TRIE) {
5431 /* NOTE - There is similar code to this block above for handling
5432 BRANCH nodes on the initial study. If you change stuff here
5434 regnode *trie_node= scan;
5435 regnode *tail= regnext(scan);
5436 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5437 SSize_t max1 = 0, min1 = SSize_t_MAX;
5440 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5441 /* Cannot merge strings after this. */
5442 scan_commit(pRExC_state, data, minlenp, is_inf);
5444 if (flags & SCF_DO_STCLASS)
5445 ssc_init_zero(pRExC_state, &accum);
5451 const regnode *nextbranch= NULL;
5454 for ( word=1 ; word <= trie->wordcount ; word++)
5456 SSize_t deltanext=0, minnext=0, f = 0, fake;
5457 regnode_ssc this_class;
5459 data_fake.flags = 0;
5461 data_fake.whilem_c = data->whilem_c;
5462 data_fake.last_closep = data->last_closep;
5465 data_fake.last_closep = &fake;
5466 data_fake.pos_delta = delta;
5467 if (flags & SCF_DO_STCLASS) {
5468 ssc_init(pRExC_state, &this_class);
5469 data_fake.start_class = &this_class;
5470 f = SCF_DO_STCLASS_AND;
5472 if (flags & SCF_WHILEM_VISITED_POS)
5473 f |= SCF_WHILEM_VISITED_POS;
5475 if (trie->jump[word]) {
5477 nextbranch = trie_node + trie->jump[0];
5478 scan= trie_node + trie->jump[word];
5479 /* We go from the jump point to the branch that follows
5480 it. Note this means we need the vestigal unused
5481 branches even though they arent otherwise used. */
5482 minnext = study_chunk(pRExC_state, &scan, minlenp,
5483 &deltanext, (regnode *)nextbranch, &data_fake,
5484 stopparen, recursed_depth, NULL, f,depth+1);
5486 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5487 nextbranch= regnext((regnode*)nextbranch);
5489 if (min1 > (SSize_t)(minnext + trie->minlen))
5490 min1 = minnext + trie->minlen;
5491 if (deltanext == SSize_t_MAX) {
5492 is_inf = is_inf_internal = 1;
5494 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5495 max1 = minnext + deltanext + trie->maxlen;
5497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5500 if ( stopmin > min + min1)
5501 stopmin = min + min1;
5502 flags &= ~SCF_DO_SUBSTR;
5504 data->flags |= SCF_SEEN_ACCEPT;
5507 if (data_fake.flags & SF_HAS_EVAL)
5508 data->flags |= SF_HAS_EVAL;
5509 data->whilem_c = data_fake.whilem_c;
5511 if (flags & SCF_DO_STCLASS)
5512 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5515 if (flags & SCF_DO_SUBSTR) {
5516 data->pos_min += min1;
5517 data->pos_delta += max1 - min1;
5518 if (max1 != min1 || is_inf)
5519 data->longest = &(data->longest_float);
5522 delta += max1 - min1;
5523 if (flags & SCF_DO_STCLASS_OR) {
5524 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5526 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5527 flags &= ~SCF_DO_STCLASS;
5530 else if (flags & SCF_DO_STCLASS_AND) {
5532 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5533 flags &= ~SCF_DO_STCLASS;
5536 /* Switch to OR mode: cache the old value of
5537 * data->start_class */
5539 StructCopy(data->start_class, and_withp, regnode_ssc);
5540 flags &= ~SCF_DO_STCLASS_AND;
5541 StructCopy(&accum, data->start_class, regnode_ssc);
5542 flags |= SCF_DO_STCLASS_OR;
5549 else if (PL_regkind[OP(scan)] == TRIE) {
5550 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5553 min += trie->minlen;
5554 delta += (trie->maxlen - trie->minlen);
5555 flags &= ~SCF_DO_STCLASS; /* xxx */
5556 if (flags & SCF_DO_SUBSTR) {
5557 /* Cannot expect anything... */
5558 scan_commit(pRExC_state, data, minlenp, is_inf);
5559 data->pos_min += trie->minlen;
5560 data->pos_delta += (trie->maxlen - trie->minlen);
5561 if (trie->maxlen != trie->minlen)
5562 data->longest = &(data->longest_float);
5564 if (trie->jump) /* no more substrings -- for now /grr*/
5565 flags &= ~SCF_DO_SUBSTR;
5567 #endif /* old or new */
5568 #endif /* TRIE_STUDY_OPT */
5570 /* Else: zero-length, ignore. */
5571 scan = regnext(scan);
5573 /* If we are exiting a recursion we can unset its recursed bit
5574 * and allow ourselves to enter it again - no danger of an
5575 * infinite loop there.
5576 if (stopparen > -1 && recursed) {
5577 DEBUG_STUDYDATA("unset:", data,depth);
5578 PAREN_UNSET( recursed, stopparen);
5582 DEBUG_STUDYDATA("frame-end:",data,depth);
5583 DEBUG_PEEP("fend", scan, depth);
5584 /* restore previous context */
5587 stopparen = frame->stop;
5588 recursed_depth = frame->prev_recursed_depth;
5591 frame = frame->prev;
5592 goto fake_study_recurse;
5597 DEBUG_STUDYDATA("pre-fin:",data,depth);
5600 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5602 if (flags & SCF_DO_SUBSTR && is_inf)
5603 data->pos_delta = SSize_t_MAX - data->pos_min;
5604 if (is_par > (I32)U8_MAX)
5606 if (is_par && pars==1 && data) {
5607 data->flags |= SF_IN_PAR;
5608 data->flags &= ~SF_HAS_PAR;
5610 else if (pars && data) {
5611 data->flags |= SF_HAS_PAR;
5612 data->flags &= ~SF_IN_PAR;
5614 if (flags & SCF_DO_STCLASS_OR)
5615 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5616 if (flags & SCF_TRIE_RESTUDY)
5617 data->flags |= SCF_TRIE_RESTUDY;
5619 DEBUG_STUDYDATA("post-fin:",data,depth);
5622 SSize_t final_minlen= min < stopmin ? min : stopmin;
5624 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5625 RExC_maxlen = final_minlen + delta;
5627 return final_minlen;
5633 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5635 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5637 PERL_ARGS_ASSERT_ADD_DATA;
5639 Renewc(RExC_rxi->data,
5640 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5641 char, struct reg_data);
5643 Renew(RExC_rxi->data->what, count + n, U8);
5645 Newx(RExC_rxi->data->what, n, U8);
5646 RExC_rxi->data->count = count + n;
5647 Copy(s, RExC_rxi->data->what + count, n, U8);
5651 /*XXX: todo make this not included in a non debugging perl, but appears to be
5652 * used anyway there, in 'use re' */
5653 #ifndef PERL_IN_XSUB_RE
5655 Perl_reginitcolors(pTHX)
5657 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5659 char *t = savepv(s);
5663 t = strchr(t, '\t');
5669 PL_colors[i] = t = (char *)"";
5674 PL_colors[i++] = (char *)"";
5681 #ifdef TRIE_STUDY_OPT
5682 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5685 (data.flags & SCF_TRIE_RESTUDY) \
5693 #define CHECK_RESTUDY_GOTO_butfirst
5697 * pregcomp - compile a regular expression into internal code
5699 * Decides which engine's compiler to call based on the hint currently in
5703 #ifndef PERL_IN_XSUB_RE
5705 /* return the currently in-scope regex engine (or the default if none) */
5707 regexp_engine const *
5708 Perl_current_re_engine(pTHX)
5710 if (IN_PERL_COMPILETIME) {
5711 HV * const table = GvHV(PL_hintgv);
5714 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5715 return &PL_core_reg_engine;
5716 ptr = hv_fetchs(table, "regcomp", FALSE);
5717 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5718 return &PL_core_reg_engine;
5719 return INT2PTR(regexp_engine*,SvIV(*ptr));
5723 if (!PL_curcop->cop_hints_hash)
5724 return &PL_core_reg_engine;
5725 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5726 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5727 return &PL_core_reg_engine;
5728 return INT2PTR(regexp_engine*,SvIV(ptr));
5734 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5736 regexp_engine const *eng = current_re_engine();
5737 GET_RE_DEBUG_FLAGS_DECL;
5739 PERL_ARGS_ASSERT_PREGCOMP;
5741 /* Dispatch a request to compile a regexp to correct regexp engine. */
5743 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5746 return CALLREGCOMP_ENG(eng, pattern, flags);
5750 /* public(ish) entry point for the perl core's own regex compiling code.
5751 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5752 * pattern rather than a list of OPs, and uses the internal engine rather
5753 * than the current one */
5756 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5758 SV *pat = pattern; /* defeat constness! */
5759 PERL_ARGS_ASSERT_RE_COMPILE;
5760 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5761 #ifdef PERL_IN_XSUB_RE
5764 &PL_core_reg_engine,
5766 NULL, NULL, rx_flags, 0);
5770 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5771 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5772 * point to the realloced string and length.
5774 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5778 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5779 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5781 U8 *const src = (U8*)*pat_p;
5786 GET_RE_DEBUG_FLAGS_DECL;
5788 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5789 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5791 Newx(dst, *plen_p * 2 + 1, U8);
5794 while (s < *plen_p) {
5795 append_utf8_from_native_byte(src[s], &d);
5796 if (n < num_code_blocks) {
5797 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5798 pRExC_state->code_blocks[n].start = d - dst - 1;
5799 assert(*(d - 1) == '(');
5802 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5803 pRExC_state->code_blocks[n].end = d - dst - 1;
5804 assert(*(d - 1) == ')');
5813 *pat_p = (char*) dst;
5815 RExC_orig_utf8 = RExC_utf8 = 1;
5820 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5821 * while recording any code block indices, and handling overloading,
5822 * nested qr// objects etc. If pat is null, it will allocate a new
5823 * string, or just return the first arg, if there's only one.
5825 * Returns the malloced/updated pat.
5826 * patternp and pat_count is the array of SVs to be concatted;
5827 * oplist is the optional list of ops that generated the SVs;
5828 * recompile_p is a pointer to a boolean that will be set if
5829 * the regex will need to be recompiled.
5830 * delim, if non-null is an SV that will be inserted between each element
5834 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5835 SV *pat, SV ** const patternp, int pat_count,
5836 OP *oplist, bool *recompile_p, SV *delim)
5840 bool use_delim = FALSE;
5841 bool alloced = FALSE;
5843 /* if we know we have at least two args, create an empty string,
5844 * then concatenate args to that. For no args, return an empty string */
5845 if (!pat && pat_count != 1) {
5851 for (svp = patternp; svp < patternp + pat_count; svp++) {
5854 STRLEN orig_patlen = 0;
5856 SV *msv = use_delim ? delim : *svp;
5857 if (!msv) msv = &PL_sv_undef;
5859 /* if we've got a delimiter, we go round the loop twice for each
5860 * svp slot (except the last), using the delimiter the second
5869 if (SvTYPE(msv) == SVt_PVAV) {
5870 /* we've encountered an interpolated array within
5871 * the pattern, e.g. /...@a..../. Expand the list of elements,
5872 * then recursively append elements.
5873 * The code in this block is based on S_pushav() */
5875 AV *const av = (AV*)msv;
5876 const SSize_t maxarg = AvFILL(av) + 1;
5880 assert(oplist->op_type == OP_PADAV
5881 || oplist->op_type == OP_RV2AV);
5882 oplist = OP_SIBLING(oplist);
5885 if (SvRMAGICAL(av)) {
5888 Newx(array, maxarg, SV*);
5890 for (i=0; i < maxarg; i++) {
5891 SV ** const svp = av_fetch(av, i, FALSE);
5892 array[i] = svp ? *svp : &PL_sv_undef;
5896 array = AvARRAY(av);
5898 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5899 array, maxarg, NULL, recompile_p,
5901 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5907 /* we make the assumption here that each op in the list of
5908 * op_siblings maps to one SV pushed onto the stack,
5909 * except for code blocks, with have both an OP_NULL and
5911 * This allows us to match up the list of SVs against the
5912 * list of OPs to find the next code block.
5914 * Note that PUSHMARK PADSV PADSV ..
5916 * PADRANGE PADSV PADSV ..
5917 * so the alignment still works. */
5920 if (oplist->op_type == OP_NULL
5921 && (oplist->op_flags & OPf_SPECIAL))
5923 assert(n < pRExC_state->num_code_blocks);
5924 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5925 pRExC_state->code_blocks[n].block = oplist;
5926 pRExC_state->code_blocks[n].src_regex = NULL;
5929 oplist = OP_SIBLING(oplist); /* skip CONST */
5932 oplist = OP_SIBLING(oplist);;
5935 /* apply magic and QR overloading to arg */
5938 if (SvROK(msv) && SvAMAGIC(msv)) {
5939 SV *sv = AMG_CALLunary(msv, regexp_amg);
5943 if (SvTYPE(sv) != SVt_REGEXP)
5944 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5949 /* try concatenation overload ... */
5950 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5951 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5954 /* overloading involved: all bets are off over literal
5955 * code. Pretend we haven't seen it */
5956 pRExC_state->num_code_blocks -= n;
5960 /* ... or failing that, try "" overload */
5961 while (SvAMAGIC(msv)
5962 && (sv = AMG_CALLunary(msv, string_amg))
5966 && SvRV(msv) == SvRV(sv))
5971 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5975 /* this is a partially unrolled
5976 * sv_catsv_nomg(pat, msv);
5977 * that allows us to adjust code block indices if
5980 char *dst = SvPV_force_nomg(pat, dlen);
5982 if (SvUTF8(msv) && !SvUTF8(pat)) {
5983 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5984 sv_setpvn(pat, dst, dlen);
5987 sv_catsv_nomg(pat, msv);
5994 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5997 /* extract any code blocks within any embedded qr//'s */
5998 if (rx && SvTYPE(rx) == SVt_REGEXP
5999 && RX_ENGINE((REGEXP*)rx)->op_comp)
6002 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6003 if (ri->num_code_blocks) {
6005 /* the presence of an embedded qr// with code means
6006 * we should always recompile: the text of the
6007 * qr// may not have changed, but it may be a
6008 * different closure than last time */
6010 Renew(pRExC_state->code_blocks,
6011 pRExC_state->num_code_blocks + ri->num_code_blocks,
6012 struct reg_code_block);
6013 pRExC_state->num_code_blocks += ri->num_code_blocks;
6015 for (i=0; i < ri->num_code_blocks; i++) {
6016 struct reg_code_block *src, *dst;
6017 STRLEN offset = orig_patlen
6018 + ReANY((REGEXP *)rx)->pre_prefix;
6019 assert(n < pRExC_state->num_code_blocks);
6020 src = &ri->code_blocks[i];
6021 dst = &pRExC_state->code_blocks[n];
6022 dst->start = src->start + offset;
6023 dst->end = src->end + offset;
6024 dst->block = src->block;
6025 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6034 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6043 /* see if there are any run-time code blocks in the pattern.
6044 * False positives are allowed */
6047 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6048 char *pat, STRLEN plen)
6053 PERL_UNUSED_CONTEXT;
6055 for (s = 0; s < plen; s++) {
6056 if (n < pRExC_state->num_code_blocks
6057 && s == pRExC_state->code_blocks[n].start)
6059 s = pRExC_state->code_blocks[n].end;
6063 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6065 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6067 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6074 /* Handle run-time code blocks. We will already have compiled any direct
6075 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6076 * copy of it, but with any literal code blocks blanked out and
6077 * appropriate chars escaped; then feed it into
6079 * eval "qr'modified_pattern'"
6083 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6087 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6089 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6090 * and merge them with any code blocks of the original regexp.
6092 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6093 * instead, just save the qr and return FALSE; this tells our caller that
6094 * the original pattern needs upgrading to utf8.
6098 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6099 char *pat, STRLEN plen)
6103 GET_RE_DEBUG_FLAGS_DECL;
6105 if (pRExC_state->runtime_code_qr) {
6106 /* this is the second time we've been called; this should
6107 * only happen if the main pattern got upgraded to utf8
6108 * during compilation; re-use the qr we compiled first time
6109 * round (which should be utf8 too)
6111 qr = pRExC_state->runtime_code_qr;
6112 pRExC_state->runtime_code_qr = NULL;
6113 assert(RExC_utf8 && SvUTF8(qr));
6119 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6123 /* determine how many extra chars we need for ' and \ escaping */
6124 for (s = 0; s < plen; s++) {
6125 if (pat[s] == '\'' || pat[s] == '\\')
6129 Newx(newpat, newlen, char);
6131 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6133 for (s = 0; s < plen; s++) {
6134 if (n < pRExC_state->num_code_blocks
6135 && s == pRExC_state->code_blocks[n].start)
6137 /* blank out literal code block */
6138 assert(pat[s] == '(');
6139 while (s <= pRExC_state->code_blocks[n].end) {
6147 if (pat[s] == '\'' || pat[s] == '\\')
6152 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6156 PerlIO_printf(Perl_debug_log,
6157 "%sre-parsing pattern for runtime code:%s %s\n",
6158 PL_colors[4],PL_colors[5],newpat);
6161 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6166 PUSHSTACKi(PERLSI_REQUIRE);
6167 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6168 * parsing qr''; normally only q'' does this. It also alters
6170 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6171 SvREFCNT_dec_NN(sv);
6176 SV * const errsv = ERRSV;
6177 if (SvTRUE_NN(errsv))
6179 Safefree(pRExC_state->code_blocks);
6180 /* use croak_sv ? */
6181 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6184 assert(SvROK(qr_ref));
6186 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6187 /* the leaving below frees the tmp qr_ref.
6188 * Give qr a life of its own */
6196 if (!RExC_utf8 && SvUTF8(qr)) {
6197 /* first time through; the pattern got upgraded; save the
6198 * qr for the next time through */
6199 assert(!pRExC_state->runtime_code_qr);
6200 pRExC_state->runtime_code_qr = qr;
6205 /* extract any code blocks within the returned qr// */
6208 /* merge the main (r1) and run-time (r2) code blocks into one */
6210 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6211 struct reg_code_block *new_block, *dst;
6212 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6215 if (!r2->num_code_blocks) /* we guessed wrong */
6217 SvREFCNT_dec_NN(qr);
6222 r1->num_code_blocks + r2->num_code_blocks,
6223 struct reg_code_block);
6226 while ( i1 < r1->num_code_blocks
6227 || i2 < r2->num_code_blocks)
6229 struct reg_code_block *src;
6232 if (i1 == r1->num_code_blocks) {
6233 src = &r2->code_blocks[i2++];
6236 else if (i2 == r2->num_code_blocks)
6237 src = &r1->code_blocks[i1++];
6238 else if ( r1->code_blocks[i1].start
6239 < r2->code_blocks[i2].start)
6241 src = &r1->code_blocks[i1++];
6242 assert(src->end < r2->code_blocks[i2].start);
6245 assert( r1->code_blocks[i1].start
6246 > r2->code_blocks[i2].start);
6247 src = &r2->code_blocks[i2++];
6249 assert(src->end < r1->code_blocks[i1].start);
6252 assert(pat[src->start] == '(');
6253 assert(pat[src->end] == ')');
6254 dst->start = src->start;
6255 dst->end = src->end;
6256 dst->block = src->block;
6257 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6261 r1->num_code_blocks += r2->num_code_blocks;
6262 Safefree(r1->code_blocks);
6263 r1->code_blocks = new_block;
6266 SvREFCNT_dec_NN(qr);
6272 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6273 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6274 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6275 STRLEN longest_length, bool eol, bool meol)
6277 /* This is the common code for setting up the floating and fixed length
6278 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6279 * as to whether succeeded or not */
6284 if (! (longest_length
6285 || (eol /* Can't have SEOL and MULTI */
6286 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6288 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6289 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6294 /* copy the information about the longest from the reg_scan_data
6295 over to the program. */
6296 if (SvUTF8(sv_longest)) {
6297 *rx_utf8 = sv_longest;
6300 *rx_substr = sv_longest;
6303 /* end_shift is how many chars that must be matched that
6304 follow this item. We calculate it ahead of time as once the
6305 lookbehind offset is added in we lose the ability to correctly
6307 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6308 *rx_end_shift = ml - offset
6309 - longest_length + (SvTAIL(sv_longest) != 0)
6312 t = (eol/* Can't have SEOL and MULTI */
6313 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6314 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6320 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6321 * regular expression into internal code.
6322 * The pattern may be passed either as:
6323 * a list of SVs (patternp plus pat_count)
6324 * a list of OPs (expr)
6325 * If both are passed, the SV list is used, but the OP list indicates
6326 * which SVs are actually pre-compiled code blocks
6328 * The SVs in the list have magic and qr overloading applied to them (and
6329 * the list may be modified in-place with replacement SVs in the latter
6332 * If the pattern hasn't changed from old_re, then old_re will be
6335 * eng is the current engine. If that engine has an op_comp method, then
6336 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6337 * do the initial concatenation of arguments and pass on to the external
6340 * If is_bare_re is not null, set it to a boolean indicating whether the
6341 * arg list reduced (after overloading) to a single bare regex which has
6342 * been returned (i.e. /$qr/).
6344 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6346 * pm_flags contains the PMf_* flags, typically based on those from the
6347 * pm_flags field of the related PMOP. Currently we're only interested in
6348 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6350 * We can't allocate space until we know how big the compiled form will be,
6351 * but we can't compile it (and thus know how big it is) until we've got a
6352 * place to put the code. So we cheat: we compile it twice, once with code
6353 * generation turned off and size counting turned on, and once "for real".
6354 * This also means that we don't allocate space until we are sure that the
6355 * thing really will compile successfully, and we never have to move the
6356 * code and thus invalidate pointers into it. (Note that it has to be in
6357 * one piece because free() must be able to free it all.) [NB: not true in perl]
6359 * Beware that the optimization-preparation code in here knows about some
6360 * of the structure of the compiled regexp. [I'll say.]
6364 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6365 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6366 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6370 regexp_internal *ri;
6378 SV *code_blocksv = NULL;
6379 SV** new_patternp = patternp;
6381 /* these are all flags - maybe they should be turned
6382 * into a single int with different bit masks */
6383 I32 sawlookahead = 0;
6388 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6390 bool runtime_code = 0;
6392 RExC_state_t RExC_state;
6393 RExC_state_t * const pRExC_state = &RExC_state;
6394 #ifdef TRIE_STUDY_OPT
6396 RExC_state_t copyRExC_state;
6398 GET_RE_DEBUG_FLAGS_DECL;
6400 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6402 DEBUG_r(if (!PL_colorset) reginitcolors());
6404 #ifndef PERL_IN_XSUB_RE
6405 /* Initialize these here instead of as-needed, as is quick and avoids
6406 * having to test them each time otherwise */
6407 if (! PL_AboveLatin1) {
6408 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6409 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6410 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6411 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6412 PL_HasMultiCharFold =
6413 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6415 /* This is calculated here, because the Perl program that generates the
6416 * static global ones doesn't currently have access to
6417 * NUM_ANYOF_CODE_POINTS */
6418 PL_InBitmap = _new_invlist(2);
6419 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6420 NUM_ANYOF_CODE_POINTS - 1);
6424 pRExC_state->code_blocks = NULL;
6425 pRExC_state->num_code_blocks = 0;
6428 *is_bare_re = FALSE;
6430 if (expr && (expr->op_type == OP_LIST ||
6431 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6432 /* allocate code_blocks if needed */
6436 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6437 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6438 ncode++; /* count of DO blocks */
6440 pRExC_state->num_code_blocks = ncode;
6441 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6446 /* compile-time pattern with just OP_CONSTs and DO blocks */
6451 /* find how many CONSTs there are */
6454 if (expr->op_type == OP_CONST)
6457 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6458 if (o->op_type == OP_CONST)
6462 /* fake up an SV array */
6464 assert(!new_patternp);
6465 Newx(new_patternp, n, SV*);
6466 SAVEFREEPV(new_patternp);
6470 if (expr->op_type == OP_CONST)
6471 new_patternp[n] = cSVOPx_sv(expr);
6473 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6474 if (o->op_type == OP_CONST)
6475 new_patternp[n++] = cSVOPo_sv;
6480 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6481 "Assembling pattern from %d elements%s\n", pat_count,
6482 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6484 /* set expr to the first arg op */
6486 if (pRExC_state->num_code_blocks
6487 && expr->op_type != OP_CONST)
6489 expr = cLISTOPx(expr)->op_first;
6490 assert( expr->op_type == OP_PUSHMARK
6491 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6492 || expr->op_type == OP_PADRANGE);
6493 expr = OP_SIBLING(expr);
6496 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6497 expr, &recompile, NULL);
6499 /* handle bare (possibly after overloading) regex: foo =~ $re */
6504 if (SvTYPE(re) == SVt_REGEXP) {
6508 Safefree(pRExC_state->code_blocks);
6509 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6510 "Precompiled pattern%s\n",
6511 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6517 exp = SvPV_nomg(pat, plen);
6519 if (!eng->op_comp) {
6520 if ((SvUTF8(pat) && IN_BYTES)
6521 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6523 /* make a temporary copy; either to convert to bytes,
6524 * or to avoid repeating get-magic / overloaded stringify */
6525 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6526 (IN_BYTES ? 0 : SvUTF8(pat)));
6528 Safefree(pRExC_state->code_blocks);
6529 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6532 /* ignore the utf8ness if the pattern is 0 length */
6533 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6534 RExC_uni_semantics = 0;
6535 RExC_contains_locale = 0;
6536 RExC_contains_i = 0;
6537 pRExC_state->runtime_code_qr = NULL;
6540 SV *dsv= sv_newmortal();
6541 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6542 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6543 PL_colors[4],PL_colors[5],s);
6547 /* we jump here if we upgrade the pattern to utf8 and have to
6550 if ((pm_flags & PMf_USE_RE_EVAL)
6551 /* this second condition covers the non-regex literal case,
6552 * i.e. $foo =~ '(?{})'. */
6553 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6555 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6557 /* return old regex if pattern hasn't changed */
6558 /* XXX: note in the below we have to check the flags as well as the
6561 * Things get a touch tricky as we have to compare the utf8 flag
6562 * independently from the compile flags. */
6566 && !!RX_UTF8(old_re) == !!RExC_utf8
6567 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6568 && RX_PRECOMP(old_re)
6569 && RX_PRELEN(old_re) == plen
6570 && memEQ(RX_PRECOMP(old_re), exp, plen)
6571 && !runtime_code /* with runtime code, always recompile */ )
6573 Safefree(pRExC_state->code_blocks);
6577 rx_flags = orig_rx_flags;
6579 if (rx_flags & PMf_FOLD) {
6580 RExC_contains_i = 1;
6582 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6584 /* Set to use unicode semantics if the pattern is in utf8 and has the
6585 * 'depends' charset specified, as it means unicode when utf8 */
6586 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6590 RExC_flags = rx_flags;
6591 RExC_pm_flags = pm_flags;
6594 if (TAINTING_get && TAINT_get)
6595 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6597 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6598 /* whoops, we have a non-utf8 pattern, whilst run-time code
6599 * got compiled as utf8. Try again with a utf8 pattern */
6600 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6601 pRExC_state->num_code_blocks);
6602 goto redo_first_pass;
6605 assert(!pRExC_state->runtime_code_qr);
6611 RExC_in_lookbehind = 0;
6612 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6614 RExC_override_recoding = 0;
6615 RExC_in_multi_char_class = 0;
6617 /* First pass: determine size, legality. */
6620 RExC_end = exp + plen;
6625 RExC_emit = (regnode *) &RExC_emit_dummy;
6626 RExC_whilem_seen = 0;
6627 RExC_open_parens = NULL;
6628 RExC_close_parens = NULL;
6630 RExC_paren_names = NULL;
6632 RExC_paren_name_list = NULL;
6634 RExC_recurse = NULL;
6635 RExC_study_chunk_recursed = NULL;
6636 RExC_study_chunk_recursed_bytes= 0;
6637 RExC_recurse_count = 0;
6638 pRExC_state->code_index = 0;
6640 #if 0 /* REGC() is (currently) a NOP at the first pass.
6641 * Clever compilers notice this and complain. --jhi */
6642 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6645 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6647 RExC_lastparse=NULL;
6649 /* reg may croak on us, not giving us a chance to free
6650 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6651 need it to survive as long as the regexp (qr/(?{})/).
6652 We must check that code_blocksv is not already set, because we may
6653 have jumped back to restart the sizing pass. */
6654 if (pRExC_state->code_blocks && !code_blocksv) {
6655 code_blocksv = newSV_type(SVt_PV);
6656 SAVEFREESV(code_blocksv);
6657 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6658 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6660 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6661 /* It's possible to write a regexp in ascii that represents Unicode
6662 codepoints outside of the byte range, such as via \x{100}. If we
6663 detect such a sequence we have to convert the entire pattern to utf8
6664 and then recompile, as our sizing calculation will have been based
6665 on 1 byte == 1 character, but we will need to use utf8 to encode
6666 at least some part of the pattern, and therefore must convert the whole
6669 if (flags & RESTART_UTF8) {
6670 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6671 pRExC_state->num_code_blocks);
6672 goto redo_first_pass;
6674 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6677 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6680 PerlIO_printf(Perl_debug_log,
6681 "Required size %"IVdf" nodes\n"
6682 "Starting second pass (creation)\n",
6685 RExC_lastparse=NULL;
6688 /* The first pass could have found things that force Unicode semantics */
6689 if ((RExC_utf8 || RExC_uni_semantics)
6690 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6692 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6695 /* Small enough for pointer-storage convention?
6696 If extralen==0, this means that we will not need long jumps. */
6697 if (RExC_size >= 0x10000L && RExC_extralen)
6698 RExC_size += RExC_extralen;
6701 if (RExC_whilem_seen > 15)
6702 RExC_whilem_seen = 15;
6704 /* Allocate space and zero-initialize. Note, the two step process
6705 of zeroing when in debug mode, thus anything assigned has to
6706 happen after that */
6707 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6709 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6710 char, regexp_internal);
6711 if ( r == NULL || ri == NULL )
6712 FAIL("Regexp out of space");
6714 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6715 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6718 /* bulk initialize base fields with 0. */
6719 Zero(ri, sizeof(regexp_internal), char);
6722 /* non-zero initialization begins here */
6725 r->extflags = rx_flags;
6726 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6728 if (pm_flags & PMf_IS_QR) {
6729 ri->code_blocks = pRExC_state->code_blocks;
6730 ri->num_code_blocks = pRExC_state->num_code_blocks;
6735 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6736 if (pRExC_state->code_blocks[n].src_regex)
6737 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6738 SAVEFREEPV(pRExC_state->code_blocks);
6742 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6743 bool has_charset = (get_regex_charset(r->extflags)
6744 != REGEX_DEPENDS_CHARSET);
6746 /* The caret is output if there are any defaults: if not all the STD
6747 * flags are set, or if no character set specifier is needed */
6749 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6751 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6752 == REG_RUN_ON_COMMENT_SEEN);
6753 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6754 >> RXf_PMf_STD_PMMOD_SHIFT);
6755 const char *fptr = STD_PAT_MODS; /*"msix"*/
6757 /* Allocate for the worst case, which is all the std flags are turned
6758 * on. If more precision is desired, we could do a population count of
6759 * the flags set. This could be done with a small lookup table, or by
6760 * shifting, masking and adding, or even, when available, assembly
6761 * language for a machine-language population count.
6762 * We never output a minus, as all those are defaults, so are
6763 * covered by the caret */
6764 const STRLEN wraplen = plen + has_p + has_runon
6765 + has_default /* If needs a caret */
6767 /* If needs a character set specifier */
6768 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6769 + (sizeof(STD_PAT_MODS) - 1)
6770 + (sizeof("(?:)") - 1);
6772 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6773 r->xpv_len_u.xpvlenu_pv = p;
6775 SvFLAGS(rx) |= SVf_UTF8;
6778 /* If a default, cover it using the caret */
6780 *p++= DEFAULT_PAT_MOD;
6784 const char* const name = get_regex_charset_name(r->extflags, &len);
6785 Copy(name, p, len, char);
6789 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6792 while((ch = *fptr++)) {
6800 Copy(RExC_precomp, p, plen, char);
6801 assert ((RX_WRAPPED(rx) - p) < 16);
6802 r->pre_prefix = p - RX_WRAPPED(rx);
6808 SvCUR_set(rx, p - RX_WRAPPED(rx));
6812 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6814 /* setup various meta data about recursion, this all requires
6815 * RExC_npar to be correctly set, and a bit later on we clear it */
6816 if (RExC_seen & REG_RECURSE_SEEN) {
6817 Newxz(RExC_open_parens, RExC_npar,regnode *);
6818 SAVEFREEPV(RExC_open_parens);
6819 Newxz(RExC_close_parens,RExC_npar,regnode *);
6820 SAVEFREEPV(RExC_close_parens);
6822 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6823 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6824 * So its 1 if there are no parens. */
6825 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6826 ((RExC_npar & 0x07) != 0);
6827 Newx(RExC_study_chunk_recursed,
6828 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6829 SAVEFREEPV(RExC_study_chunk_recursed);
6832 /* Useful during FAIL. */
6833 #ifdef RE_TRACK_PATTERN_OFFSETS
6834 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6835 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6836 "%s %"UVuf" bytes for offset annotations.\n",
6837 ri->u.offsets ? "Got" : "Couldn't get",
6838 (UV)((2*RExC_size+1) * sizeof(U32))));
6840 SetProgLen(ri,RExC_size);
6845 /* Second pass: emit code. */
6846 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6847 RExC_pm_flags = pm_flags;
6849 RExC_end = exp + plen;
6852 RExC_emit_start = ri->program;
6853 RExC_emit = ri->program;
6854 RExC_emit_bound = ri->program + RExC_size + 1;
6855 pRExC_state->code_index = 0;
6857 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6858 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6860 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6862 /* XXXX To minimize changes to RE engine we always allocate
6863 3-units-long substrs field. */
6864 Newx(r->substrs, 1, struct reg_substr_data);
6865 if (RExC_recurse_count) {
6866 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6867 SAVEFREEPV(RExC_recurse);
6871 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6873 RExC_study_chunk_recursed_count= 0;
6875 Zero(r->substrs, 1, struct reg_substr_data);
6876 if (RExC_study_chunk_recursed)
6877 Zero(RExC_study_chunk_recursed,
6878 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6880 #ifdef TRIE_STUDY_OPT
6882 StructCopy(&zero_scan_data, &data, scan_data_t);
6883 copyRExC_state = RExC_state;
6886 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6888 RExC_state = copyRExC_state;
6889 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6890 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6892 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6893 StructCopy(&zero_scan_data, &data, scan_data_t);
6896 StructCopy(&zero_scan_data, &data, scan_data_t);
6899 /* Dig out information for optimizations. */
6900 r->extflags = RExC_flags; /* was pm_op */
6901 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6904 SvUTF8_on(rx); /* Unicode in it? */
6905 ri->regstclass = NULL;
6906 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6907 r->intflags |= PREGf_NAUGHTY;
6908 scan = ri->program + 1; /* First BRANCH. */
6910 /* testing for BRANCH here tells us whether there is "must appear"
6911 data in the pattern. If there is then we can use it for optimisations */
6912 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6915 STRLEN longest_float_length, longest_fixed_length;
6916 regnode_ssc ch_class; /* pointed to by data */
6918 SSize_t last_close = 0; /* pointed to by data */
6919 regnode *first= scan;
6920 regnode *first_next= regnext(first);
6922 * Skip introductions and multiplicators >= 1
6923 * so that we can extract the 'meat' of the pattern that must
6924 * match in the large if() sequence following.
6925 * NOTE that EXACT is NOT covered here, as it is normally
6926 * picked up by the optimiser separately.
6928 * This is unfortunate as the optimiser isnt handling lookahead
6929 * properly currently.
6932 while ((OP(first) == OPEN && (sawopen = 1)) ||
6933 /* An OR of *one* alternative - should not happen now. */
6934 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6935 /* for now we can't handle lookbehind IFMATCH*/
6936 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6937 (OP(first) == PLUS) ||
6938 (OP(first) == MINMOD) ||
6939 /* An {n,m} with n>0 */
6940 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6941 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6944 * the only op that could be a regnode is PLUS, all the rest
6945 * will be regnode_1 or regnode_2.
6947 * (yves doesn't think this is true)
6949 if (OP(first) == PLUS)
6952 if (OP(first) == MINMOD)
6954 first += regarglen[OP(first)];
6956 first = NEXTOPER(first);
6957 first_next= regnext(first);
6960 /* Starting-point info. */
6962 DEBUG_PEEP("first:",first,0);
6963 /* Ignore EXACT as we deal with it later. */
6964 if (PL_regkind[OP(first)] == EXACT) {
6965 if (OP(first) == EXACT)
6966 NOOP; /* Empty, get anchored substr later. */
6968 ri->regstclass = first;
6971 else if (PL_regkind[OP(first)] == TRIE &&
6972 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6974 /* this can happen only on restudy */
6975 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6978 else if (REGNODE_SIMPLE(OP(first)))
6979 ri->regstclass = first;
6980 else if (PL_regkind[OP(first)] == BOUND ||
6981 PL_regkind[OP(first)] == NBOUND)
6982 ri->regstclass = first;
6983 else if (PL_regkind[OP(first)] == BOL) {
6984 r->intflags |= (OP(first) == MBOL
6987 first = NEXTOPER(first);
6990 else if (OP(first) == GPOS) {
6991 r->intflags |= PREGf_ANCH_GPOS;
6992 first = NEXTOPER(first);
6995 else if ((!sawopen || !RExC_sawback) &&
6997 (OP(first) == STAR &&
6998 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6999 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7001 /* turn .* into ^.* with an implied $*=1 */
7003 (OP(NEXTOPER(first)) == REG_ANY)
7006 r->intflags |= (type | PREGf_IMPLICIT);
7007 first = NEXTOPER(first);
7010 if (sawplus && !sawminmod && !sawlookahead
7011 && (!sawopen || !RExC_sawback)
7012 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7013 /* x+ must match at the 1st pos of run of x's */
7014 r->intflags |= PREGf_SKIP;
7016 /* Scan is after the zeroth branch, first is atomic matcher. */
7017 #ifdef TRIE_STUDY_OPT
7020 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7021 (IV)(first - scan + 1))
7025 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7026 (IV)(first - scan + 1))
7032 * If there's something expensive in the r.e., find the
7033 * longest literal string that must appear and make it the
7034 * regmust. Resolve ties in favor of later strings, since
7035 * the regstart check works with the beginning of the r.e.
7036 * and avoiding duplication strengthens checking. Not a
7037 * strong reason, but sufficient in the absence of others.
7038 * [Now we resolve ties in favor of the earlier string if
7039 * it happens that c_offset_min has been invalidated, since the
7040 * earlier string may buy us something the later one won't.]
7043 data.longest_fixed = newSVpvs("");
7044 data.longest_float = newSVpvs("");
7045 data.last_found = newSVpvs("");
7046 data.longest = &(data.longest_fixed);
7047 ENTER_with_name("study_chunk");
7048 SAVEFREESV(data.longest_fixed);
7049 SAVEFREESV(data.longest_float);
7050 SAVEFREESV(data.last_found);
7052 if (!ri->regstclass) {
7053 ssc_init(pRExC_state, &ch_class);
7054 data.start_class = &ch_class;
7055 stclass_flag = SCF_DO_STCLASS_AND;
7056 } else /* XXXX Check for BOUND? */
7058 data.last_closep = &last_close;
7061 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7062 scan + RExC_size, /* Up to end */
7064 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7065 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7069 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7072 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7073 && data.last_start_min == 0 && data.last_end > 0
7074 && !RExC_seen_zerolen
7075 && !(RExC_seen & REG_VERBARG_SEEN)
7076 && !(RExC_seen & REG_GPOS_SEEN)
7078 r->extflags |= RXf_CHECK_ALL;
7080 scan_commit(pRExC_state, &data,&minlen,0);
7082 longest_float_length = CHR_SVLEN(data.longest_float);
7084 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7085 && data.offset_fixed == data.offset_float_min
7086 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7087 && S_setup_longest (aTHX_ pRExC_state,
7091 &(r->float_end_shift),
7092 data.lookbehind_float,
7093 data.offset_float_min,
7095 longest_float_length,
7096 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7097 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7099 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7100 r->float_max_offset = data.offset_float_max;
7101 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7102 r->float_max_offset -= data.lookbehind_float;
7103 SvREFCNT_inc_simple_void_NN(data.longest_float);
7106 r->float_substr = r->float_utf8 = NULL;
7107 longest_float_length = 0;
7110 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7112 if (S_setup_longest (aTHX_ pRExC_state,
7114 &(r->anchored_utf8),
7115 &(r->anchored_substr),
7116 &(r->anchored_end_shift),
7117 data.lookbehind_fixed,
7120 longest_fixed_length,
7121 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7122 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7124 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7125 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7128 r->anchored_substr = r->anchored_utf8 = NULL;
7129 longest_fixed_length = 0;
7131 LEAVE_with_name("study_chunk");
7134 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7135 ri->regstclass = NULL;
7137 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7139 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7140 && is_ssc_worth_it(pRExC_state, data.start_class))
7142 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7144 ssc_finalize(pRExC_state, data.start_class);
7146 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7147 StructCopy(data.start_class,
7148 (regnode_ssc*)RExC_rxi->data->data[n],
7150 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7151 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7152 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7153 regprop(r, sv, (regnode*)data.start_class, NULL);
7154 PerlIO_printf(Perl_debug_log,
7155 "synthetic stclass \"%s\".\n",
7156 SvPVX_const(sv));});
7157 data.start_class = NULL;
7160 /* A temporary algorithm prefers floated substr to fixed one to dig
7162 if (longest_fixed_length > longest_float_length) {
7163 r->substrs->check_ix = 0;
7164 r->check_end_shift = r->anchored_end_shift;
7165 r->check_substr = r->anchored_substr;
7166 r->check_utf8 = r->anchored_utf8;
7167 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7168 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7169 r->intflags |= PREGf_NOSCAN;
7172 r->substrs->check_ix = 1;
7173 r->check_end_shift = r->float_end_shift;
7174 r->check_substr = r->float_substr;
7175 r->check_utf8 = r->float_utf8;
7176 r->check_offset_min = r->float_min_offset;
7177 r->check_offset_max = r->float_max_offset;
7179 if ((r->check_substr || r->check_utf8) ) {
7180 r->extflags |= RXf_USE_INTUIT;
7181 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7182 r->extflags |= RXf_INTUIT_TAIL;
7184 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7186 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7187 if ( (STRLEN)minlen < longest_float_length )
7188 minlen= longest_float_length;
7189 if ( (STRLEN)minlen < longest_fixed_length )
7190 minlen= longest_fixed_length;
7194 /* Several toplevels. Best we can is to set minlen. */
7196 regnode_ssc ch_class;
7197 SSize_t last_close = 0;
7199 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7201 scan = ri->program + 1;
7202 ssc_init(pRExC_state, &ch_class);
7203 data.start_class = &ch_class;
7204 data.last_closep = &last_close;
7207 minlen = study_chunk(pRExC_state,
7208 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7209 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7210 ? SCF_TRIE_DOING_RESTUDY
7214 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7216 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7217 = r->float_substr = r->float_utf8 = NULL;
7219 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7220 && is_ssc_worth_it(pRExC_state, data.start_class))
7222 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7224 ssc_finalize(pRExC_state, data.start_class);
7226 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7227 StructCopy(data.start_class,
7228 (regnode_ssc*)RExC_rxi->data->data[n],
7230 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7231 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7232 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7233 regprop(r, sv, (regnode*)data.start_class, NULL);
7234 PerlIO_printf(Perl_debug_log,
7235 "synthetic stclass \"%s\".\n",
7236 SvPVX_const(sv));});
7237 data.start_class = NULL;
7241 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7242 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7243 r->maxlen = REG_INFTY;
7246 r->maxlen = RExC_maxlen;
7249 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7250 the "real" pattern. */
7252 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7253 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7255 r->minlenret = minlen;
7256 if (r->minlen < minlen)
7259 if (RExC_seen & REG_GPOS_SEEN)
7260 r->intflags |= PREGf_GPOS_SEEN;
7261 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7262 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7264 if (pRExC_state->num_code_blocks)
7265 r->extflags |= RXf_EVAL_SEEN;
7266 if (RExC_seen & REG_CANY_SEEN)
7267 r->intflags |= PREGf_CANY_SEEN;
7268 if (RExC_seen & REG_VERBARG_SEEN)
7270 r->intflags |= PREGf_VERBARG_SEEN;
7271 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7273 if (RExC_seen & REG_CUTGROUP_SEEN)
7274 r->intflags |= PREGf_CUTGROUP_SEEN;
7275 if (pm_flags & PMf_USE_RE_EVAL)
7276 r->intflags |= PREGf_USE_RE_EVAL;
7277 if (RExC_paren_names)
7278 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7280 RXp_PAREN_NAMES(r) = NULL;
7282 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7283 * so it can be used in pp.c */
7284 if (r->intflags & PREGf_ANCH)
7285 r->extflags |= RXf_IS_ANCHORED;
7289 /* this is used to identify "special" patterns that might result
7290 * in Perl NOT calling the regex engine and instead doing the match "itself",
7291 * particularly special cases in split//. By having the regex compiler
7292 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7293 * we avoid weird issues with equivalent patterns resulting in different behavior,
7294 * AND we allow non Perl engines to get the same optimizations by the setting the
7295 * flags appropriately - Yves */
7296 regnode *first = ri->program + 1;
7298 regnode *next = NEXTOPER(first);
7301 if (PL_regkind[fop] == NOTHING && nop == END)
7302 r->extflags |= RXf_NULL;
7303 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7304 /* when fop is SBOL first->flags will be true only when it was
7305 * produced by parsing /\A/, and not when parsing /^/. This is
7306 * very important for the split code as there we want to
7307 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7308 * See rt #122761 for more details. -- Yves */
7309 r->extflags |= RXf_START_ONLY;
7310 else if (fop == PLUS
7311 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7312 && OP(regnext(first)) == END)
7313 r->extflags |= RXf_WHITE;
7314 else if ( r->extflags & RXf_SPLIT
7316 && STR_LEN(first) == 1
7317 && *(STRING(first)) == ' '
7318 && OP(regnext(first)) == END )
7319 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7323 if (RExC_contains_locale) {
7324 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7328 if (RExC_paren_names) {
7329 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7330 ri->data->data[ri->name_list_idx]
7331 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7334 ri->name_list_idx = 0;
7336 if (RExC_recurse_count) {
7337 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7338 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7339 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7342 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7343 /* assume we don't need to swap parens around before we match */
7345 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7346 (unsigned long)RExC_study_chunk_recursed_count);
7350 PerlIO_printf(Perl_debug_log,"Final program:\n");
7353 #ifdef RE_TRACK_PATTERN_OFFSETS
7354 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7355 const STRLEN len = ri->u.offsets[0];
7357 GET_RE_DEBUG_FLAGS_DECL;
7358 PerlIO_printf(Perl_debug_log,
7359 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7360 for (i = 1; i <= len; i++) {
7361 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7362 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7363 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7365 PerlIO_printf(Perl_debug_log, "\n");
7370 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7371 * by setting the regexp SV to readonly-only instead. If the
7372 * pattern's been recompiled, the USEDness should remain. */
7373 if (old_re && SvREADONLY(old_re))
7381 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7384 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7386 PERL_UNUSED_ARG(value);
7388 if (flags & RXapif_FETCH) {
7389 return reg_named_buff_fetch(rx, key, flags);
7390 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7391 Perl_croak_no_modify();
7393 } else if (flags & RXapif_EXISTS) {
7394 return reg_named_buff_exists(rx, key, flags)
7397 } else if (flags & RXapif_REGNAMES) {
7398 return reg_named_buff_all(rx, flags);
7399 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7400 return reg_named_buff_scalar(rx, flags);
7402 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7408 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7411 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7412 PERL_UNUSED_ARG(lastkey);
7414 if (flags & RXapif_FIRSTKEY)
7415 return reg_named_buff_firstkey(rx, flags);
7416 else if (flags & RXapif_NEXTKEY)
7417 return reg_named_buff_nextkey(rx, flags);
7419 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7426 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7429 AV *retarray = NULL;
7431 struct regexp *const rx = ReANY(r);
7433 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7435 if (flags & RXapif_ALL)
7438 if (rx && RXp_PAREN_NAMES(rx)) {
7439 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7442 SV* sv_dat=HeVAL(he_str);
7443 I32 *nums=(I32*)SvPVX(sv_dat);
7444 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7445 if ((I32)(rx->nparens) >= nums[i]
7446 && rx->offs[nums[i]].start != -1
7447 && rx->offs[nums[i]].end != -1)
7450 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7455 ret = newSVsv(&PL_sv_undef);
7458 av_push(retarray, ret);
7461 return newRV_noinc(MUTABLE_SV(retarray));
7468 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7471 struct regexp *const rx = ReANY(r);
7473 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7475 if (rx && RXp_PAREN_NAMES(rx)) {
7476 if (flags & RXapif_ALL) {
7477 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7479 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7481 SvREFCNT_dec_NN(sv);
7493 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7495 struct regexp *const rx = ReANY(r);
7497 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7499 if ( rx && RXp_PAREN_NAMES(rx) ) {
7500 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7502 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7509 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7511 struct regexp *const rx = ReANY(r);
7512 GET_RE_DEBUG_FLAGS_DECL;
7514 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7516 if (rx && RXp_PAREN_NAMES(rx)) {
7517 HV *hv = RXp_PAREN_NAMES(rx);
7519 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7522 SV* sv_dat = HeVAL(temphe);
7523 I32 *nums = (I32*)SvPVX(sv_dat);
7524 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7525 if ((I32)(rx->lastparen) >= nums[i] &&
7526 rx->offs[nums[i]].start != -1 &&
7527 rx->offs[nums[i]].end != -1)
7533 if (parno || flags & RXapif_ALL) {
7534 return newSVhek(HeKEY_hek(temphe));
7542 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7547 struct regexp *const rx = ReANY(r);
7549 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7551 if (rx && RXp_PAREN_NAMES(rx)) {
7552 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7553 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7554 } else if (flags & RXapif_ONE) {
7555 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7556 av = MUTABLE_AV(SvRV(ret));
7557 length = av_tindex(av);
7558 SvREFCNT_dec_NN(ret);
7559 return newSViv(length + 1);
7561 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7566 return &PL_sv_undef;
7570 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7572 struct regexp *const rx = ReANY(r);
7575 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7577 if (rx && RXp_PAREN_NAMES(rx)) {
7578 HV *hv= RXp_PAREN_NAMES(rx);
7580 (void)hv_iterinit(hv);
7581 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7584 SV* sv_dat = HeVAL(temphe);
7585 I32 *nums = (I32*)SvPVX(sv_dat);
7586 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7587 if ((I32)(rx->lastparen) >= nums[i] &&
7588 rx->offs[nums[i]].start != -1 &&
7589 rx->offs[nums[i]].end != -1)
7595 if (parno || flags & RXapif_ALL) {
7596 av_push(av, newSVhek(HeKEY_hek(temphe)));
7601 return newRV_noinc(MUTABLE_SV(av));
7605 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7608 struct regexp *const rx = ReANY(r);
7614 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7616 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7617 || n == RX_BUFF_IDX_CARET_FULLMATCH
7618 || n == RX_BUFF_IDX_CARET_POSTMATCH
7621 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7623 /* on something like
7626 * the KEEPCOPY is set on the PMOP rather than the regex */
7627 if (PL_curpm && r == PM_GETRE(PL_curpm))
7628 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7637 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7638 /* no need to distinguish between them any more */
7639 n = RX_BUFF_IDX_FULLMATCH;
7641 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7642 && rx->offs[0].start != -1)
7644 /* $`, ${^PREMATCH} */
7645 i = rx->offs[0].start;
7649 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7650 && rx->offs[0].end != -1)
7652 /* $', ${^POSTMATCH} */
7653 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7654 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7657 if ( 0 <= n && n <= (I32)rx->nparens &&
7658 (s1 = rx->offs[n].start) != -1 &&
7659 (t1 = rx->offs[n].end) != -1)
7661 /* $&, ${^MATCH}, $1 ... */
7663 s = rx->subbeg + s1 - rx->suboffset;
7668 assert(s >= rx->subbeg);
7669 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7671 #ifdef NO_TAINT_SUPPORT
7672 sv_setpvn(sv, s, i);
7674 const int oldtainted = TAINT_get;
7676 sv_setpvn(sv, s, i);
7677 TAINT_set(oldtainted);
7679 if ( (rx->intflags & PREGf_CANY_SEEN)
7680 ? (RXp_MATCH_UTF8(rx)
7681 && (!i || is_utf8_string((U8*)s, i)))
7682 : (RXp_MATCH_UTF8(rx)) )
7689 if (RXp_MATCH_TAINTED(rx)) {
7690 if (SvTYPE(sv) >= SVt_PVMG) {
7691 MAGIC* const mg = SvMAGIC(sv);
7694 SvMAGIC_set(sv, mg->mg_moremagic);
7696 if ((mgt = SvMAGIC(sv))) {
7697 mg->mg_moremagic = mgt;
7698 SvMAGIC_set(sv, mg);
7709 sv_setsv(sv,&PL_sv_undef);
7715 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7716 SV const * const value)
7718 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7720 PERL_UNUSED_ARG(rx);
7721 PERL_UNUSED_ARG(paren);
7722 PERL_UNUSED_ARG(value);
7725 Perl_croak_no_modify();
7729 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7732 struct regexp *const rx = ReANY(r);
7736 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7738 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7739 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7740 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7743 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7745 /* on something like
7748 * the KEEPCOPY is set on the PMOP rather than the regex */
7749 if (PL_curpm && r == PM_GETRE(PL_curpm))
7750 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7756 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7758 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7759 case RX_BUFF_IDX_PREMATCH: /* $` */
7760 if (rx->offs[0].start != -1) {
7761 i = rx->offs[0].start;
7770 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7771 case RX_BUFF_IDX_POSTMATCH: /* $' */
7772 if (rx->offs[0].end != -1) {
7773 i = rx->sublen - rx->offs[0].end;
7775 s1 = rx->offs[0].end;
7782 default: /* $& / ${^MATCH}, $1, $2, ... */
7783 if (paren <= (I32)rx->nparens &&
7784 (s1 = rx->offs[paren].start) != -1 &&
7785 (t1 = rx->offs[paren].end) != -1)
7791 if (ckWARN(WARN_UNINITIALIZED))
7792 report_uninit((const SV *)sv);
7797 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7798 const char * const s = rx->subbeg - rx->suboffset + s1;
7803 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7810 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7812 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7813 PERL_UNUSED_ARG(rx);
7817 return newSVpvs("Regexp");
7820 /* Scans the name of a named buffer from the pattern.
7821 * If flags is REG_RSN_RETURN_NULL returns null.
7822 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7823 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7824 * to the parsed name as looked up in the RExC_paren_names hash.
7825 * If there is an error throws a vFAIL().. type exception.
7828 #define REG_RSN_RETURN_NULL 0
7829 #define REG_RSN_RETURN_NAME 1
7830 #define REG_RSN_RETURN_DATA 2
7833 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7835 char *name_start = RExC_parse;
7837 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7839 assert (RExC_parse <= RExC_end);
7840 if (RExC_parse == RExC_end) NOOP;
7841 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7842 /* skip IDFIRST by using do...while */
7845 RExC_parse += UTF8SKIP(RExC_parse);
7846 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7850 } while (isWORDCHAR(*RExC_parse));
7852 RExC_parse++; /* so the <- from the vFAIL is after the offending
7854 vFAIL("Group name must start with a non-digit word character");
7858 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7859 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7860 if ( flags == REG_RSN_RETURN_NAME)
7862 else if (flags==REG_RSN_RETURN_DATA) {
7865 if ( ! sv_name ) /* should not happen*/
7866 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7867 if (RExC_paren_names)
7868 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7870 sv_dat = HeVAL(he_str);
7872 vFAIL("Reference to nonexistent named group");
7876 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7877 (unsigned long) flags);
7879 assert(0); /* NOT REACHED */
7884 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7885 int rem=(int)(RExC_end - RExC_parse); \
7894 if (RExC_lastparse!=RExC_parse) \
7895 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7898 iscut ? "..." : "<" \
7901 PerlIO_printf(Perl_debug_log,"%16s",""); \
7904 num = RExC_size + 1; \
7906 num=REG_NODE_NUM(RExC_emit); \
7907 if (RExC_lastnum!=num) \
7908 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7910 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7911 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7912 (int)((depth*2)), "", \
7916 RExC_lastparse=RExC_parse; \
7921 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7922 DEBUG_PARSE_MSG((funcname)); \
7923 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7925 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7926 DEBUG_PARSE_MSG((funcname)); \
7927 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7930 /* This section of code defines the inversion list object and its methods. The
7931 * interfaces are highly subject to change, so as much as possible is static to
7932 * this file. An inversion list is here implemented as a malloc'd C UV array
7933 * as an SVt_INVLIST scalar.
7935 * An inversion list for Unicode is an array of code points, sorted by ordinal
7936 * number. The zeroth element is the first code point in the list. The 1th
7937 * element is the first element beyond that not in the list. In other words,
7938 * the first range is
7939 * invlist[0]..(invlist[1]-1)
7940 * The other ranges follow. Thus every element whose index is divisible by two
7941 * marks the beginning of a range that is in the list, and every element not
7942 * divisible by two marks the beginning of a range not in the list. A single
7943 * element inversion list that contains the single code point N generally
7944 * consists of two elements
7947 * (The exception is when N is the highest representable value on the
7948 * machine, in which case the list containing just it would be a single
7949 * element, itself. By extension, if the last range in the list extends to
7950 * infinity, then the first element of that range will be in the inversion list
7951 * at a position that is divisible by two, and is the final element in the
7953 * Taking the complement (inverting) an inversion list is quite simple, if the
7954 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7955 * This implementation reserves an element at the beginning of each inversion
7956 * list to always contain 0; there is an additional flag in the header which
7957 * indicates if the list begins at the 0, or is offset to begin at the next
7960 * More about inversion lists can be found in "Unicode Demystified"
7961 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7962 * More will be coming when functionality is added later.
7964 * The inversion list data structure is currently implemented as an SV pointing
7965 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7966 * array of UV whose memory management is automatically handled by the existing
7967 * facilities for SV's.
7969 * Some of the methods should always be private to the implementation, and some
7970 * should eventually be made public */
7972 /* The header definitions are in F<inline_invlist.c> */
7974 PERL_STATIC_INLINE UV*
7975 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7977 /* Returns a pointer to the first element in the inversion list's array.
7978 * This is called upon initialization of an inversion list. Where the
7979 * array begins depends on whether the list has the code point U+0000 in it
7980 * or not. The other parameter tells it whether the code that follows this
7981 * call is about to put a 0 in the inversion list or not. The first
7982 * element is either the element reserved for 0, if TRUE, or the element
7983 * after it, if FALSE */
7985 bool* offset = get_invlist_offset_addr(invlist);
7986 UV* zero_addr = (UV *) SvPVX(invlist);
7988 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7991 assert(! _invlist_len(invlist));
7995 /* 1^1 = 0; 1^0 = 1 */
7996 *offset = 1 ^ will_have_0;
7997 return zero_addr + *offset;
8000 PERL_STATIC_INLINE UV*
8001 S_invlist_array(SV* const invlist)
8003 /* Returns the pointer to the inversion list's array. Every time the
8004 * length changes, this needs to be called in case malloc or realloc moved
8007 PERL_ARGS_ASSERT_INVLIST_ARRAY;
8009 /* Must not be empty. If these fail, you probably didn't check for <len>
8010 * being non-zero before trying to get the array */
8011 assert(_invlist_len(invlist));
8013 /* The very first element always contains zero, The array begins either
8014 * there, or if the inversion list is offset, at the element after it.
8015 * The offset header field determines which; it contains 0 or 1 to indicate
8016 * how much additionally to add */
8017 assert(0 == *(SvPVX(invlist)));
8018 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8021 PERL_STATIC_INLINE void
8022 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8024 /* Sets the current number of elements stored in the inversion list.
8025 * Updates SvCUR correspondingly */
8026 PERL_UNUSED_CONTEXT;
8027 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8029 assert(SvTYPE(invlist) == SVt_INVLIST);
8034 : TO_INTERNAL_SIZE(len + offset));
8035 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8038 PERL_STATIC_INLINE IV*
8039 S_get_invlist_previous_index_addr(SV* invlist)
8041 /* Return the address of the IV that is reserved to hold the cached index
8043 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8045 assert(SvTYPE(invlist) == SVt_INVLIST);
8047 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8050 PERL_STATIC_INLINE IV
8051 S_invlist_previous_index(SV* const invlist)
8053 /* Returns cached index of previous search */
8055 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8057 return *get_invlist_previous_index_addr(invlist);
8060 PERL_STATIC_INLINE void
8061 S_invlist_set_previous_index(SV* const invlist, const IV index)
8063 /* Caches <index> for later retrieval */
8065 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8067 assert(index == 0 || index < (int) _invlist_len(invlist));
8069 *get_invlist_previous_index_addr(invlist) = index;
8072 PERL_STATIC_INLINE UV
8073 S_invlist_max(SV* const invlist)
8075 /* Returns the maximum number of elements storable in the inversion list's
8076 * array, without having to realloc() */
8078 PERL_ARGS_ASSERT_INVLIST_MAX;
8080 assert(SvTYPE(invlist) == SVt_INVLIST);
8082 /* Assumes worst case, in which the 0 element is not counted in the
8083 * inversion list, so subtracts 1 for that */
8084 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8085 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8086 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8089 #ifndef PERL_IN_XSUB_RE
8091 Perl__new_invlist(pTHX_ IV initial_size)
8094 /* Return a pointer to a newly constructed inversion list, with enough
8095 * space to store 'initial_size' elements. If that number is negative, a
8096 * system default is used instead */
8100 if (initial_size < 0) {
8104 /* Allocate the initial space */
8105 new_list = newSV_type(SVt_INVLIST);
8107 /* First 1 is in case the zero element isn't in the list; second 1 is for
8109 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8110 invlist_set_len(new_list, 0, 0);
8112 /* Force iterinit() to be used to get iteration to work */
8113 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8115 *get_invlist_previous_index_addr(new_list) = 0;
8121 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8123 /* Return a pointer to a newly constructed inversion list, initialized to
8124 * point to <list>, which has to be in the exact correct inversion list
8125 * form, including internal fields. Thus this is a dangerous routine that
8126 * should not be used in the wrong hands. The passed in 'list' contains
8127 * several header fields at the beginning that are not part of the
8128 * inversion list body proper */
8130 const STRLEN length = (STRLEN) list[0];
8131 const UV version_id = list[1];
8132 const bool offset = cBOOL(list[2]);
8133 #define HEADER_LENGTH 3
8134 /* If any of the above changes in any way, you must change HEADER_LENGTH
8135 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8136 * perl -E 'say int(rand 2**31-1)'
8138 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8139 data structure type, so that one being
8140 passed in can be validated to be an
8141 inversion list of the correct vintage.
8144 SV* invlist = newSV_type(SVt_INVLIST);
8146 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8148 if (version_id != INVLIST_VERSION_ID) {
8149 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8152 /* The generated array passed in includes header elements that aren't part
8153 * of the list proper, so start it just after them */
8154 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8156 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8157 shouldn't touch it */
8159 *(get_invlist_offset_addr(invlist)) = offset;
8161 /* The 'length' passed to us is the physical number of elements in the
8162 * inversion list. But if there is an offset the logical number is one
8164 invlist_set_len(invlist, length - offset, offset);
8166 invlist_set_previous_index(invlist, 0);
8168 /* Initialize the iteration pointer. */
8169 invlist_iterfinish(invlist);
8171 SvREADONLY_on(invlist);
8175 #endif /* ifndef PERL_IN_XSUB_RE */
8178 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8180 /* Grow the maximum size of an inversion list */
8182 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8184 assert(SvTYPE(invlist) == SVt_INVLIST);
8186 /* Add one to account for the zero element at the beginning which may not
8187 * be counted by the calling parameters */
8188 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8191 PERL_STATIC_INLINE void
8192 S_invlist_trim(SV* const invlist)
8194 PERL_ARGS_ASSERT_INVLIST_TRIM;
8196 assert(SvTYPE(invlist) == SVt_INVLIST);
8198 /* Change the length of the inversion list to how many entries it currently
8200 SvPV_shrink_to_cur((SV *) invlist);
8204 S__append_range_to_invlist(pTHX_ SV* const invlist,
8205 const UV start, const UV end)
8207 /* Subject to change or removal. Append the range from 'start' to 'end' at
8208 * the end of the inversion list. The range must be above any existing
8212 UV max = invlist_max(invlist);
8213 UV len = _invlist_len(invlist);
8216 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8218 if (len == 0) { /* Empty lists must be initialized */
8219 offset = start != 0;
8220 array = _invlist_array_init(invlist, ! offset);
8223 /* Here, the existing list is non-empty. The current max entry in the
8224 * list is generally the first value not in the set, except when the
8225 * set extends to the end of permissible values, in which case it is
8226 * the first entry in that final set, and so this call is an attempt to
8227 * append out-of-order */
8229 UV final_element = len - 1;
8230 array = invlist_array(invlist);
8231 if (array[final_element] > start
8232 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8234 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8235 array[final_element], start,
8236 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8239 /* Here, it is a legal append. If the new range begins with the first
8240 * value not in the set, it is extending the set, so the new first
8241 * value not in the set is one greater than the newly extended range.
8243 offset = *get_invlist_offset_addr(invlist);
8244 if (array[final_element] == start) {
8245 if (end != UV_MAX) {
8246 array[final_element] = end + 1;
8249 /* But if the end is the maximum representable on the machine,
8250 * just let the range that this would extend to have no end */
8251 invlist_set_len(invlist, len - 1, offset);
8257 /* Here the new range doesn't extend any existing set. Add it */
8259 len += 2; /* Includes an element each for the start and end of range */
8261 /* If wll overflow the existing space, extend, which may cause the array to
8264 invlist_extend(invlist, len);
8266 /* Have to set len here to avoid assert failure in invlist_array() */
8267 invlist_set_len(invlist, len, offset);
8269 array = invlist_array(invlist);
8272 invlist_set_len(invlist, len, offset);
8275 /* The next item on the list starts the range, the one after that is
8276 * one past the new range. */
8277 array[len - 2] = start;
8278 if (end != UV_MAX) {
8279 array[len - 1] = end + 1;
8282 /* But if the end is the maximum representable on the machine, just let
8283 * the range have no end */
8284 invlist_set_len(invlist, len - 1, offset);
8288 #ifndef PERL_IN_XSUB_RE
8291 Perl__invlist_search(SV* const invlist, const UV cp)
8293 /* Searches the inversion list for the entry that contains the input code
8294 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8295 * return value is the index into the list's array of the range that
8300 IV high = _invlist_len(invlist);
8301 const IV highest_element = high - 1;
8304 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8306 /* If list is empty, return failure. */
8311 /* (We can't get the array unless we know the list is non-empty) */
8312 array = invlist_array(invlist);
8314 mid = invlist_previous_index(invlist);
8315 assert(mid >=0 && mid <= highest_element);
8317 /* <mid> contains the cache of the result of the previous call to this
8318 * function (0 the first time). See if this call is for the same result,
8319 * or if it is for mid-1. This is under the theory that calls to this
8320 * function will often be for related code points that are near each other.
8321 * And benchmarks show that caching gives better results. We also test
8322 * here if the code point is within the bounds of the list. These tests
8323 * replace others that would have had to be made anyway to make sure that
8324 * the array bounds were not exceeded, and these give us extra information
8325 * at the same time */
8326 if (cp >= array[mid]) {
8327 if (cp >= array[highest_element]) {
8328 return highest_element;
8331 /* Here, array[mid] <= cp < array[highest_element]. This means that
8332 * the final element is not the answer, so can exclude it; it also
8333 * means that <mid> is not the final element, so can refer to 'mid + 1'
8335 if (cp < array[mid + 1]) {
8341 else { /* cp < aray[mid] */
8342 if (cp < array[0]) { /* Fail if outside the array */
8346 if (cp >= array[mid - 1]) {
8351 /* Binary search. What we are looking for is <i> such that
8352 * array[i] <= cp < array[i+1]
8353 * The loop below converges on the i+1. Note that there may not be an
8354 * (i+1)th element in the array, and things work nonetheless */
8355 while (low < high) {
8356 mid = (low + high) / 2;
8357 assert(mid <= highest_element);
8358 if (array[mid] <= cp) { /* cp >= array[mid] */
8361 /* We could do this extra test to exit the loop early.
8362 if (cp < array[low]) {
8367 else { /* cp < array[mid] */
8374 invlist_set_previous_index(invlist, high);
8379 Perl__invlist_populate_swatch(SV* const invlist,
8380 const UV start, const UV end, U8* swatch)
8382 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8383 * but is used when the swash has an inversion list. This makes this much
8384 * faster, as it uses a binary search instead of a linear one. This is
8385 * intimately tied to that function, and perhaps should be in utf8.c,
8386 * except it is intimately tied to inversion lists as well. It assumes
8387 * that <swatch> is all 0's on input */
8390 const IV len = _invlist_len(invlist);
8394 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8396 if (len == 0) { /* Empty inversion list */
8400 array = invlist_array(invlist);
8402 /* Find which element it is */
8403 i = _invlist_search(invlist, start);
8405 /* We populate from <start> to <end> */
8406 while (current < end) {
8409 /* The inversion list gives the results for every possible code point
8410 * after the first one in the list. Only those ranges whose index is
8411 * even are ones that the inversion list matches. For the odd ones,
8412 * and if the initial code point is not in the list, we have to skip
8413 * forward to the next element */
8414 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8416 if (i >= len) { /* Finished if beyond the end of the array */
8420 if (current >= end) { /* Finished if beyond the end of what we
8422 if (LIKELY(end < UV_MAX)) {
8426 /* We get here when the upper bound is the maximum
8427 * representable on the machine, and we are looking for just
8428 * that code point. Have to special case it */
8430 goto join_end_of_list;
8433 assert(current >= start);
8435 /* The current range ends one below the next one, except don't go past
8438 upper = (i < len && array[i] < end) ? array[i] : end;
8440 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8441 * for each code point in it */
8442 for (; current < upper; current++) {
8443 const STRLEN offset = (STRLEN)(current - start);
8444 swatch[offset >> 3] |= 1 << (offset & 7);
8449 /* Quit if at the end of the list */
8452 /* But first, have to deal with the highest possible code point on
8453 * the platform. The previous code assumes that <end> is one
8454 * beyond where we want to populate, but that is impossible at the
8455 * platform's infinity, so have to handle it specially */
8456 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8458 const STRLEN offset = (STRLEN)(end - start);
8459 swatch[offset >> 3] |= 1 << (offset & 7);
8464 /* Advance to the next range, which will be for code points not in the
8473 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8474 const bool complement_b, SV** output)
8476 /* Take the union of two inversion lists and point <output> to it. *output
8477 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8478 * the reference count to that list will be decremented if not already a
8479 * temporary (mortal); otherwise *output will be made correspondingly
8480 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8481 * second list is returned. If <complement_b> is TRUE, the union is taken
8482 * of the complement (inversion) of <b> instead of b itself.
8484 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8485 * Richard Gillam, published by Addison-Wesley, and explained at some
8486 * length there. The preface says to incorporate its examples into your
8487 * code at your own risk.
8489 * The algorithm is like a merge sort.
8491 * XXX A potential performance improvement is to keep track as we go along
8492 * if only one of the inputs contributes to the result, meaning the other
8493 * is a subset of that one. In that case, we can skip the final copy and
8494 * return the larger of the input lists, but then outside code might need
8495 * to keep track of whether to free the input list or not */
8497 const UV* array_a; /* a's array */
8499 UV len_a; /* length of a's array */
8502 SV* u; /* the resulting union */
8506 UV i_a = 0; /* current index into a's array */
8510 /* running count, as explained in the algorithm source book; items are
8511 * stopped accumulating and are output when the count changes to/from 0.
8512 * The count is incremented when we start a range that's in the set, and
8513 * decremented when we start a range that's not in the set. So its range
8514 * is 0 to 2. Only when the count is zero is something not in the set.
8518 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8521 /* If either one is empty, the union is the other one */
8522 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8523 bool make_temp = FALSE; /* Should we mortalize the result? */
8527 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8533 *output = invlist_clone(b);
8535 _invlist_invert(*output);
8537 } /* else *output already = b; */
8540 sv_2mortal(*output);
8544 else if ((len_b = _invlist_len(b)) == 0) {
8545 bool make_temp = FALSE;
8547 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8552 /* The complement of an empty list is a list that has everything in it,
8553 * so the union with <a> includes everything too */
8556 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8560 *output = _new_invlist(1);
8561 _append_range_to_invlist(*output, 0, UV_MAX);
8563 else if (*output != a) {
8564 *output = invlist_clone(a);
8566 /* else *output already = a; */
8569 sv_2mortal(*output);
8574 /* Here both lists exist and are non-empty */
8575 array_a = invlist_array(a);
8576 array_b = invlist_array(b);
8578 /* If are to take the union of 'a' with the complement of b, set it
8579 * up so are looking at b's complement. */
8582 /* To complement, we invert: if the first element is 0, remove it. To
8583 * do this, we just pretend the array starts one later */
8584 if (array_b[0] == 0) {
8590 /* But if the first element is not zero, we pretend the list starts
8591 * at the 0 that is always stored immediately before the array. */
8597 /* Size the union for the worst case: that the sets are completely
8599 u = _new_invlist(len_a + len_b);
8601 /* Will contain U+0000 if either component does */
8602 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8603 || (len_b > 0 && array_b[0] == 0));
8605 /* Go through each list item by item, stopping when exhausted one of
8607 while (i_a < len_a && i_b < len_b) {
8608 UV cp; /* The element to potentially add to the union's array */
8609 bool cp_in_set; /* is it in the the input list's set or not */
8611 /* We need to take one or the other of the two inputs for the union.
8612 * Since we are merging two sorted lists, we take the smaller of the
8613 * next items. In case of a tie, we take the one that is in its set
8614 * first. If we took one not in the set first, it would decrement the
8615 * count, possibly to 0 which would cause it to be output as ending the
8616 * range, and the next time through we would take the same number, and
8617 * output it again as beginning the next range. By doing it the
8618 * opposite way, there is no possibility that the count will be
8619 * momentarily decremented to 0, and thus the two adjoining ranges will
8620 * be seamlessly merged. (In a tie and both are in the set or both not
8621 * in the set, it doesn't matter which we take first.) */
8622 if (array_a[i_a] < array_b[i_b]
8623 || (array_a[i_a] == array_b[i_b]
8624 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8626 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8630 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8631 cp = array_b[i_b++];
8634 /* Here, have chosen which of the two inputs to look at. Only output
8635 * if the running count changes to/from 0, which marks the
8636 * beginning/end of a range in that's in the set */
8639 array_u[i_u++] = cp;
8646 array_u[i_u++] = cp;
8651 /* Here, we are finished going through at least one of the lists, which
8652 * means there is something remaining in at most one. We check if the list
8653 * that hasn't been exhausted is positioned such that we are in the middle
8654 * of a range in its set or not. (i_a and i_b point to the element beyond
8655 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8656 * is potentially more to output.
8657 * There are four cases:
8658 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8659 * in the union is entirely from the non-exhausted set.
8660 * 2) Both were in their sets, count is 2. Nothing further should
8661 * be output, as everything that remains will be in the exhausted
8662 * list's set, hence in the union; decrementing to 1 but not 0 insures
8664 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8665 * Nothing further should be output because the union includes
8666 * everything from the exhausted set. Not decrementing ensures that.
8667 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8668 * decrementing to 0 insures that we look at the remainder of the
8669 * non-exhausted set */
8670 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8671 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8676 /* The final length is what we've output so far, plus what else is about to
8677 * be output. (If 'count' is non-zero, then the input list we exhausted
8678 * has everything remaining up to the machine's limit in its set, and hence
8679 * in the union, so there will be no further output. */
8682 /* At most one of the subexpressions will be non-zero */
8683 len_u += (len_a - i_a) + (len_b - i_b);
8686 /* Set result to final length, which can change the pointer to array_u, so
8688 if (len_u != _invlist_len(u)) {
8689 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8691 array_u = invlist_array(u);
8694 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8695 * the other) ended with everything above it not in its set. That means
8696 * that the remaining part of the union is precisely the same as the
8697 * non-exhausted list, so can just copy it unchanged. (If both list were
8698 * exhausted at the same time, then the operations below will be both 0.)
8701 IV copy_count; /* At most one will have a non-zero copy count */
8702 if ((copy_count = len_a - i_a) > 0) {
8703 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8705 else if ((copy_count = len_b - i_b) > 0) {
8706 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8710 /* We may be removing a reference to one of the inputs. If so, the output
8711 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8712 * count decremented) */
8713 if (a == *output || b == *output) {
8714 assert(! invlist_is_iterating(*output));
8715 if ((SvTEMP(*output))) {
8719 SvREFCNT_dec_NN(*output);
8729 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8730 const bool complement_b, SV** i)
8732 /* Take the intersection of two inversion lists and point <i> to it. *i
8733 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8734 * the reference count to that list will be decremented if not already a
8735 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8736 * The first list, <a>, may be NULL, in which case an empty list is
8737 * returned. If <complement_b> is TRUE, the result will be the
8738 * intersection of <a> and the complement (or inversion) of <b> instead of
8741 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8742 * Richard Gillam, published by Addison-Wesley, and explained at some
8743 * length there. The preface says to incorporate its examples into your
8744 * code at your own risk. In fact, it had bugs
8746 * The algorithm is like a merge sort, and is essentially the same as the
8750 const UV* array_a; /* a's array */
8752 UV len_a; /* length of a's array */
8755 SV* r; /* the resulting intersection */
8759 UV i_a = 0; /* current index into a's array */
8763 /* running count, as explained in the algorithm source book; items are
8764 * stopped accumulating and are output when the count changes to/from 2.
8765 * The count is incremented when we start a range that's in the set, and
8766 * decremented when we start a range that's not in the set. So its range
8767 * is 0 to 2. Only when the count is 2 is something in the intersection.
8771 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8774 /* Special case if either one is empty */
8775 len_a = (a == NULL) ? 0 : _invlist_len(a);
8776 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8777 bool make_temp = FALSE;
8779 if (len_a != 0 && complement_b) {
8781 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8782 * be empty. Here, also we are using 'b's complement, which hence
8783 * must be every possible code point. Thus the intersection is
8787 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8792 *i = invlist_clone(a);
8794 /* else *i is already 'a' */
8802 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8803 * intersection must be empty */
8805 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8810 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8814 *i = _new_invlist(0);
8822 /* Here both lists exist and are non-empty */
8823 array_a = invlist_array(a);
8824 array_b = invlist_array(b);
8826 /* If are to take the intersection of 'a' with the complement of b, set it
8827 * up so are looking at b's complement. */
8830 /* To complement, we invert: if the first element is 0, remove it. To
8831 * do this, we just pretend the array starts one later */
8832 if (array_b[0] == 0) {
8838 /* But if the first element is not zero, we pretend the list starts
8839 * at the 0 that is always stored immediately before the array. */
8845 /* Size the intersection for the worst case: that the intersection ends up
8846 * fragmenting everything to be completely disjoint */
8847 r= _new_invlist(len_a + len_b);
8849 /* Will contain U+0000 iff both components do */
8850 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8851 && len_b > 0 && array_b[0] == 0);
8853 /* Go through each list item by item, stopping when exhausted one of
8855 while (i_a < len_a && i_b < len_b) {
8856 UV cp; /* The element to potentially add to the intersection's
8858 bool cp_in_set; /* Is it in the input list's set or not */
8860 /* We need to take one or the other of the two inputs for the
8861 * intersection. Since we are merging two sorted lists, we take the
8862 * smaller of the next items. In case of a tie, we take the one that
8863 * is not in its set first (a difference from the union algorithm). If
8864 * we took one in the set first, it would increment the count, possibly
8865 * to 2 which would cause it to be output as starting a range in the
8866 * intersection, and the next time through we would take that same
8867 * number, and output it again as ending the set. By doing it the
8868 * opposite of this, there is no possibility that the count will be
8869 * momentarily incremented to 2. (In a tie and both are in the set or
8870 * both not in the set, it doesn't matter which we take first.) */
8871 if (array_a[i_a] < array_b[i_b]
8872 || (array_a[i_a] == array_b[i_b]
8873 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8875 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8879 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8883 /* Here, have chosen which of the two inputs to look at. Only output
8884 * if the running count changes to/from 2, which marks the
8885 * beginning/end of a range that's in the intersection */
8889 array_r[i_r++] = cp;
8894 array_r[i_r++] = cp;
8900 /* Here, we are finished going through at least one of the lists, which
8901 * means there is something remaining in at most one. We check if the list
8902 * that has been exhausted is positioned such that we are in the middle
8903 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8904 * the ones we care about.) There are four cases:
8905 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8906 * nothing left in the intersection.
8907 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8908 * above 2. What should be output is exactly that which is in the
8909 * non-exhausted set, as everything it has is also in the intersection
8910 * set, and everything it doesn't have can't be in the intersection
8911 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8912 * gets incremented to 2. Like the previous case, the intersection is
8913 * everything that remains in the non-exhausted set.
8914 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8915 * remains 1. And the intersection has nothing more. */
8916 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8917 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8922 /* The final length is what we've output so far plus what else is in the
8923 * intersection. At most one of the subexpressions below will be non-zero
8927 len_r += (len_a - i_a) + (len_b - i_b);
8930 /* Set result to final length, which can change the pointer to array_r, so
8932 if (len_r != _invlist_len(r)) {
8933 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8935 array_r = invlist_array(r);
8938 /* Finish outputting any remaining */
8939 if (count >= 2) { /* At most one will have a non-zero copy count */
8941 if ((copy_count = len_a - i_a) > 0) {
8942 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8944 else if ((copy_count = len_b - i_b) > 0) {
8945 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8949 /* We may be removing a reference to one of the inputs. If so, the output
8950 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8951 * count decremented) */
8952 if (a == *i || b == *i) {
8953 assert(! invlist_is_iterating(*i));
8958 SvREFCNT_dec_NN(*i);
8968 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8970 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8971 * set. A pointer to the inversion list is returned. This may actually be
8972 * a new list, in which case the passed in one has been destroyed. The
8973 * passed in inversion list can be NULL, in which case a new one is created
8974 * with just the one range in it */
8979 if (invlist == NULL) {
8980 invlist = _new_invlist(2);
8984 len = _invlist_len(invlist);
8987 /* If comes after the final entry actually in the list, can just append it
8990 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8991 && start >= invlist_array(invlist)[len - 1]))
8993 _append_range_to_invlist(invlist, start, end);
8997 /* Here, can't just append things, create and return a new inversion list
8998 * which is the union of this range and the existing inversion list */
8999 range_invlist = _new_invlist(2);
9000 _append_range_to_invlist(range_invlist, start, end);
9002 _invlist_union(invlist, range_invlist, &invlist);
9004 /* The temporary can be freed */
9005 SvREFCNT_dec_NN(range_invlist);
9011 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9012 UV** other_elements_ptr)
9014 /* Create and return an inversion list whose contents are to be populated
9015 * by the caller. The caller gives the number of elements (in 'size') and
9016 * the very first element ('element0'). This function will set
9017 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9020 * Obviously there is some trust involved that the caller will properly
9021 * fill in the other elements of the array.
9023 * (The first element needs to be passed in, as the underlying code does
9024 * things differently depending on whether it is zero or non-zero) */
9026 SV* invlist = _new_invlist(size);
9029 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9031 _append_range_to_invlist(invlist, element0, element0);
9032 offset = *get_invlist_offset_addr(invlist);
9034 invlist_set_len(invlist, size, offset);
9035 *other_elements_ptr = invlist_array(invlist) + 1;
9041 PERL_STATIC_INLINE SV*
9042 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9043 return _add_range_to_invlist(invlist, cp, cp);
9046 #ifndef PERL_IN_XSUB_RE
9048 Perl__invlist_invert(pTHX_ SV* const invlist)
9050 /* Complement the input inversion list. This adds a 0 if the list didn't
9051 * have a zero; removes it otherwise. As described above, the data
9052 * structure is set up so that this is very efficient */
9054 PERL_ARGS_ASSERT__INVLIST_INVERT;
9056 assert(! invlist_is_iterating(invlist));
9058 /* The inverse of matching nothing is matching everything */
9059 if (_invlist_len(invlist) == 0) {
9060 _append_range_to_invlist(invlist, 0, UV_MAX);
9064 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9069 PERL_STATIC_INLINE SV*
9070 S_invlist_clone(pTHX_ SV* const invlist)
9073 /* Return a new inversion list that is a copy of the input one, which is
9074 * unchanged. The new list will not be mortal even if the old one was. */
9076 /* Need to allocate extra space to accommodate Perl's addition of a
9077 * trailing NUL to SvPV's, since it thinks they are always strings */
9078 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9079 STRLEN physical_length = SvCUR(invlist);
9080 bool offset = *(get_invlist_offset_addr(invlist));
9082 PERL_ARGS_ASSERT_INVLIST_CLONE;
9084 *(get_invlist_offset_addr(new_invlist)) = offset;
9085 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9086 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9091 PERL_STATIC_INLINE STRLEN*
9092 S_get_invlist_iter_addr(SV* invlist)
9094 /* Return the address of the UV that contains the current iteration
9097 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9099 assert(SvTYPE(invlist) == SVt_INVLIST);
9101 return &(((XINVLIST*) SvANY(invlist))->iterator);
9104 PERL_STATIC_INLINE void
9105 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9107 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9109 *get_invlist_iter_addr(invlist) = 0;
9112 PERL_STATIC_INLINE void
9113 S_invlist_iterfinish(SV* invlist)
9115 /* Terminate iterator for invlist. This is to catch development errors.
9116 * Any iteration that is interrupted before completed should call this
9117 * function. Functions that add code points anywhere else but to the end
9118 * of an inversion list assert that they are not in the middle of an
9119 * iteration. If they were, the addition would make the iteration
9120 * problematical: if the iteration hadn't reached the place where things
9121 * were being added, it would be ok */
9123 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9125 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9129 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9131 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9132 * This call sets in <*start> and <*end>, the next range in <invlist>.
9133 * Returns <TRUE> if successful and the next call will return the next
9134 * range; <FALSE> if was already at the end of the list. If the latter,
9135 * <*start> and <*end> are unchanged, and the next call to this function
9136 * will start over at the beginning of the list */
9138 STRLEN* pos = get_invlist_iter_addr(invlist);
9139 UV len = _invlist_len(invlist);
9142 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9145 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9149 array = invlist_array(invlist);
9151 *start = array[(*pos)++];
9157 *end = array[(*pos)++] - 1;
9163 PERL_STATIC_INLINE bool
9164 S_invlist_is_iterating(SV* const invlist)
9166 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9168 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9171 PERL_STATIC_INLINE UV
9172 S_invlist_highest(SV* const invlist)
9174 /* Returns the highest code point that matches an inversion list. This API
9175 * has an ambiguity, as it returns 0 under either the highest is actually
9176 * 0, or if the list is empty. If this distinction matters to you, check
9177 * for emptiness before calling this function */
9179 UV len = _invlist_len(invlist);
9182 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9188 array = invlist_array(invlist);
9190 /* The last element in the array in the inversion list always starts a
9191 * range that goes to infinity. That range may be for code points that are
9192 * matched in the inversion list, or it may be for ones that aren't
9193 * matched. In the latter case, the highest code point in the set is one
9194 * less than the beginning of this range; otherwise it is the final element
9195 * of this range: infinity */
9196 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9198 : array[len - 1] - 1;
9201 #ifndef PERL_IN_XSUB_RE
9203 Perl__invlist_contents(pTHX_ SV* const invlist)
9205 /* Get the contents of an inversion list into a string SV so that they can
9206 * be printed out. It uses the format traditionally done for debug tracing
9210 SV* output = newSVpvs("\n");
9212 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9214 assert(! invlist_is_iterating(invlist));
9216 invlist_iterinit(invlist);
9217 while (invlist_iternext(invlist, &start, &end)) {
9218 if (end == UV_MAX) {
9219 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9221 else if (end != start) {
9222 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9226 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9234 #ifndef PERL_IN_XSUB_RE
9236 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9237 const char * const indent, SV* const invlist)
9239 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9240 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9241 * the string 'indent'. The output looks like this:
9242 [0] 0x000A .. 0x000D
9244 [4] 0x2028 .. 0x2029
9245 [6] 0x3104 .. INFINITY
9246 * This means that the first range of code points matched by the list are
9247 * 0xA through 0xD; the second range contains only the single code point
9248 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9249 * are used to define each range (except if the final range extends to
9250 * infinity, only a single element is needed). The array index of the
9251 * first element for the corresponding range is given in brackets. */
9256 PERL_ARGS_ASSERT__INVLIST_DUMP;
9258 if (invlist_is_iterating(invlist)) {
9259 Perl_dump_indent(aTHX_ level, file,
9260 "%sCan't dump inversion list because is in middle of iterating\n",
9265 invlist_iterinit(invlist);
9266 while (invlist_iternext(invlist, &start, &end)) {
9267 if (end == UV_MAX) {
9268 Perl_dump_indent(aTHX_ level, file,
9269 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9270 indent, (UV)count, start);
9272 else if (end != start) {
9273 Perl_dump_indent(aTHX_ level, file,
9274 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9275 indent, (UV)count, start, end);
9278 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9279 indent, (UV)count, start);
9286 Perl__load_PL_utf8_foldclosures (pTHX)
9288 assert(! PL_utf8_foldclosures);
9290 /* If the folds haven't been read in, call a fold function
9292 if (! PL_utf8_tofold) {
9293 U8 dummy[UTF8_MAXBYTES_CASE+1];
9295 /* This string is just a short named one above \xff */
9296 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9297 assert(PL_utf8_tofold); /* Verify that worked */
9299 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9303 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9305 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9307 /* Return a boolean as to if the two passed in inversion lists are
9308 * identical. The final argument, if TRUE, says to take the complement of
9309 * the second inversion list before doing the comparison */
9311 const UV* array_a = invlist_array(a);
9312 const UV* array_b = invlist_array(b);
9313 UV len_a = _invlist_len(a);
9314 UV len_b = _invlist_len(b);
9316 UV i = 0; /* current index into the arrays */
9317 bool retval = TRUE; /* Assume are identical until proven otherwise */
9319 PERL_ARGS_ASSERT__INVLISTEQ;
9321 /* If are to compare 'a' with the complement of b, set it
9322 * up so are looking at b's complement. */
9325 /* The complement of nothing is everything, so <a> would have to have
9326 * just one element, starting at zero (ending at infinity) */
9328 return (len_a == 1 && array_a[0] == 0);
9330 else if (array_b[0] == 0) {
9332 /* Otherwise, to complement, we invert. Here, the first element is
9333 * 0, just remove it. To do this, we just pretend the array starts
9341 /* But if the first element is not zero, we pretend the list starts
9342 * at the 0 that is always stored immediately before the array. */
9348 /* Make sure that the lengths are the same, as well as the final element
9349 * before looping through the remainder. (Thus we test the length, final,
9350 * and first elements right off the bat) */
9351 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9354 else for (i = 0; i < len_a - 1; i++) {
9355 if (array_a[i] != array_b[i]) {
9365 #undef HEADER_LENGTH
9366 #undef TO_INTERNAL_SIZE
9367 #undef FROM_INTERNAL_SIZE
9368 #undef INVLIST_VERSION_ID
9370 /* End of inversion list object */
9373 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9375 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9376 * constructs, and updates RExC_flags with them. On input, RExC_parse
9377 * should point to the first flag; it is updated on output to point to the
9378 * final ')' or ':'. There needs to be at least one flag, or this will
9381 /* for (?g), (?gc), and (?o) warnings; warning
9382 about (?c) will warn about (?g) -- japhy */
9384 #define WASTED_O 0x01
9385 #define WASTED_G 0x02
9386 #define WASTED_C 0x04
9387 #define WASTED_GC (WASTED_G|WASTED_C)
9388 I32 wastedflags = 0x00;
9389 U32 posflags = 0, negflags = 0;
9390 U32 *flagsp = &posflags;
9391 char has_charset_modifier = '\0';
9393 bool has_use_defaults = FALSE;
9394 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9395 int x_mod_count = 0;
9397 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9399 /* '^' as an initial flag sets certain defaults */
9400 if (UCHARAT(RExC_parse) == '^') {
9402 has_use_defaults = TRUE;
9403 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9404 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9405 ? REGEX_UNICODE_CHARSET
9406 : REGEX_DEPENDS_CHARSET);
9409 cs = get_regex_charset(RExC_flags);
9410 if (cs == REGEX_DEPENDS_CHARSET
9411 && (RExC_utf8 || RExC_uni_semantics))
9413 cs = REGEX_UNICODE_CHARSET;
9416 while (*RExC_parse) {
9417 /* && strchr("iogcmsx", *RExC_parse) */
9418 /* (?g), (?gc) and (?o) are useless here
9419 and must be globally applied -- japhy */
9420 switch (*RExC_parse) {
9422 /* Code for the imsx flags */
9423 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9425 case LOCALE_PAT_MOD:
9426 if (has_charset_modifier) {
9427 goto excess_modifier;
9429 else if (flagsp == &negflags) {
9432 cs = REGEX_LOCALE_CHARSET;
9433 has_charset_modifier = LOCALE_PAT_MOD;
9435 case UNICODE_PAT_MOD:
9436 if (has_charset_modifier) {
9437 goto excess_modifier;
9439 else if (flagsp == &negflags) {
9442 cs = REGEX_UNICODE_CHARSET;
9443 has_charset_modifier = UNICODE_PAT_MOD;
9445 case ASCII_RESTRICT_PAT_MOD:
9446 if (flagsp == &negflags) {
9449 if (has_charset_modifier) {
9450 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9451 goto excess_modifier;
9453 /* Doubled modifier implies more restricted */
9454 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9457 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9459 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9461 case DEPENDS_PAT_MOD:
9462 if (has_use_defaults) {
9463 goto fail_modifiers;
9465 else if (flagsp == &negflags) {
9468 else if (has_charset_modifier) {
9469 goto excess_modifier;
9472 /* The dual charset means unicode semantics if the
9473 * pattern (or target, not known until runtime) are
9474 * utf8, or something in the pattern indicates unicode
9476 cs = (RExC_utf8 || RExC_uni_semantics)
9477 ? REGEX_UNICODE_CHARSET
9478 : REGEX_DEPENDS_CHARSET;
9479 has_charset_modifier = DEPENDS_PAT_MOD;
9483 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9484 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9486 else if (has_charset_modifier == *(RExC_parse - 1)) {
9487 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9491 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9496 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9499 case ONCE_PAT_MOD: /* 'o' */
9500 case GLOBAL_PAT_MOD: /* 'g' */
9501 if (PASS2 && ckWARN(WARN_REGEXP)) {
9502 const I32 wflagbit = *RExC_parse == 'o'
9505 if (! (wastedflags & wflagbit) ) {
9506 wastedflags |= wflagbit;
9507 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9510 "Useless (%s%c) - %suse /%c modifier",
9511 flagsp == &negflags ? "?-" : "?",
9513 flagsp == &negflags ? "don't " : "",
9520 case CONTINUE_PAT_MOD: /* 'c' */
9521 if (PASS2 && ckWARN(WARN_REGEXP)) {
9522 if (! (wastedflags & WASTED_C) ) {
9523 wastedflags |= WASTED_GC;
9524 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9527 "Useless (%sc) - %suse /gc modifier",
9528 flagsp == &negflags ? "?-" : "?",
9529 flagsp == &negflags ? "don't " : ""
9534 case KEEPCOPY_PAT_MOD: /* 'p' */
9535 if (flagsp == &negflags) {
9537 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9539 *flagsp |= RXf_PMf_KEEPCOPY;
9543 /* A flag is a default iff it is following a minus, so
9544 * if there is a minus, it means will be trying to
9545 * re-specify a default which is an error */
9546 if (has_use_defaults || flagsp == &negflags) {
9547 goto fail_modifiers;
9550 wastedflags = 0; /* reset so (?g-c) warns twice */
9554 RExC_flags |= posflags;
9555 RExC_flags &= ~negflags;
9556 set_regex_charset(&RExC_flags, cs);
9557 if (RExC_flags & RXf_PMf_FOLD) {
9558 RExC_contains_i = 1;
9561 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9567 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9568 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9569 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9570 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9578 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9583 - reg - regular expression, i.e. main body or parenthesized thing
9585 * Caller must absorb opening parenthesis.
9587 * Combining parenthesis handling with the base level of regular expression
9588 * is a trifle forced, but the need to tie the tails of the branches to what
9589 * follows makes it hard to avoid.
9591 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9593 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9595 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9598 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9599 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9600 needs to be restarted.
9601 Otherwise would only return NULL if regbranch() returns NULL, which
9604 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9605 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9606 * 2 is like 1, but indicates that nextchar() has been called to advance
9607 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9608 * this flag alerts us to the need to check for that */
9610 regnode *ret; /* Will be the head of the group. */
9613 regnode *ender = NULL;
9616 U32 oregflags = RExC_flags;
9617 bool have_branch = 0;
9619 I32 freeze_paren = 0;
9620 I32 after_freeze = 0;
9621 I32 num; /* numeric backreferences */
9623 char * parse_start = RExC_parse; /* MJD */
9624 char * const oregcomp_parse = RExC_parse;
9626 GET_RE_DEBUG_FLAGS_DECL;
9628 PERL_ARGS_ASSERT_REG;
9629 DEBUG_PARSE("reg ");
9631 *flagp = 0; /* Tentatively. */
9634 /* Make an OPEN node, if parenthesized. */
9637 /* Under /x, space and comments can be gobbled up between the '(' and
9638 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9639 * intervening space, as the sequence is a token, and a token should be
9641 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9643 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9644 char *start_verb = RExC_parse;
9645 STRLEN verb_len = 0;
9646 char *start_arg = NULL;
9647 unsigned char op = 0;
9649 int internal_argval = 0; /* internal_argval is only useful if
9652 if (has_intervening_patws) {
9654 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9656 while ( *RExC_parse && *RExC_parse != ')' ) {
9657 if ( *RExC_parse == ':' ) {
9658 start_arg = RExC_parse + 1;
9664 verb_len = RExC_parse - start_verb;
9667 while ( *RExC_parse && *RExC_parse != ')' )
9669 if ( *RExC_parse != ')' )
9670 vFAIL("Unterminated verb pattern argument");
9671 if ( RExC_parse == start_arg )
9674 if ( *RExC_parse != ')' )
9675 vFAIL("Unterminated verb pattern");
9678 switch ( *start_verb ) {
9679 case 'A': /* (*ACCEPT) */
9680 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9682 internal_argval = RExC_nestroot;
9685 case 'C': /* (*COMMIT) */
9686 if ( memEQs(start_verb,verb_len,"COMMIT") )
9689 case 'F': /* (*FAIL) */
9690 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9695 case ':': /* (*:NAME) */
9696 case 'M': /* (*MARK:NAME) */
9697 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9702 case 'P': /* (*PRUNE) */
9703 if ( memEQs(start_verb,verb_len,"PRUNE") )
9706 case 'S': /* (*SKIP) */
9707 if ( memEQs(start_verb,verb_len,"SKIP") )
9710 case 'T': /* (*THEN) */
9711 /* [19:06] <TimToady> :: is then */
9712 if ( memEQs(start_verb,verb_len,"THEN") ) {
9714 RExC_seen |= REG_CUTGROUP_SEEN;
9719 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9721 "Unknown verb pattern '%"UTF8f"'",
9722 UTF8fARG(UTF, verb_len, start_verb));
9725 if ( start_arg && internal_argval ) {
9726 vFAIL3("Verb pattern '%.*s' may not have an argument",
9727 verb_len, start_verb);
9728 } else if ( argok < 0 && !start_arg ) {
9729 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9730 verb_len, start_verb);
9732 ret = reganode(pRExC_state, op, internal_argval);
9733 if ( ! internal_argval && ! SIZE_ONLY ) {
9735 SV *sv = newSVpvn( start_arg,
9736 RExC_parse - start_arg);
9737 ARG(ret) = add_data( pRExC_state,
9739 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9746 if (!internal_argval)
9747 RExC_seen |= REG_VERBARG_SEEN;
9748 } else if ( start_arg ) {
9749 vFAIL3("Verb pattern '%.*s' may not have an argument",
9750 verb_len, start_verb);
9752 ret = reg_node(pRExC_state, op);
9754 nextchar(pRExC_state);
9757 else if (*RExC_parse == '?') { /* (?...) */
9758 bool is_logical = 0;
9759 const char * const seqstart = RExC_parse;
9760 const char * endptr;
9761 if (has_intervening_patws) {
9763 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9767 paren = *RExC_parse++;
9768 ret = NULL; /* For look-ahead/behind. */
9771 case 'P': /* (?P...) variants for those used to PCRE/Python */
9772 paren = *RExC_parse++;
9773 if ( paren == '<') /* (?P<...>) named capture */
9775 else if (paren == '>') { /* (?P>name) named recursion */
9776 goto named_recursion;
9778 else if (paren == '=') { /* (?P=...) named backref */
9779 /* this pretty much dupes the code for \k<NAME> in
9780 * regatom(), if you change this make sure you change that
9782 char* name_start = RExC_parse;
9784 SV *sv_dat = reg_scan_name(pRExC_state,
9785 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9786 if (RExC_parse == name_start || *RExC_parse != ')')
9787 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9788 vFAIL2("Sequence %.3s... not terminated",parse_start);
9791 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9792 RExC_rxi->data->data[num]=(void*)sv_dat;
9793 SvREFCNT_inc_simple_void(sv_dat);
9796 ret = reganode(pRExC_state,
9799 : (ASCII_FOLD_RESTRICTED)
9801 : (AT_LEAST_UNI_SEMANTICS)
9809 Set_Node_Offset(ret, parse_start+1);
9810 Set_Node_Cur_Length(ret, parse_start);
9812 nextchar(pRExC_state);
9816 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9817 vFAIL3("Sequence (%.*s...) not recognized",
9818 RExC_parse-seqstart, seqstart);
9820 case '<': /* (?<...) */
9821 if (*RExC_parse == '!')
9823 else if (*RExC_parse != '=')
9829 case '\'': /* (?'...') */
9830 name_start= RExC_parse;
9831 svname = reg_scan_name(pRExC_state,
9832 SIZE_ONLY /* reverse test from the others */
9833 ? REG_RSN_RETURN_NAME
9834 : REG_RSN_RETURN_NULL);
9835 if (RExC_parse == name_start || *RExC_parse != paren)
9836 vFAIL2("Sequence (?%c... not terminated",
9837 paren=='>' ? '<' : paren);
9841 if (!svname) /* shouldn't happen */
9843 "panic: reg_scan_name returned NULL");
9844 if (!RExC_paren_names) {
9845 RExC_paren_names= newHV();
9846 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9848 RExC_paren_name_list= newAV();
9849 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9852 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9854 sv_dat = HeVAL(he_str);
9856 /* croak baby croak */
9858 "panic: paren_name hash element allocation failed");
9859 } else if ( SvPOK(sv_dat) ) {
9860 /* (?|...) can mean we have dupes so scan to check
9861 its already been stored. Maybe a flag indicating
9862 we are inside such a construct would be useful,
9863 but the arrays are likely to be quite small, so
9864 for now we punt -- dmq */
9865 IV count = SvIV(sv_dat);
9866 I32 *pv = (I32*)SvPVX(sv_dat);
9868 for ( i = 0 ; i < count ; i++ ) {
9869 if ( pv[i] == RExC_npar ) {
9875 pv = (I32*)SvGROW(sv_dat,
9876 SvCUR(sv_dat) + sizeof(I32)+1);
9877 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9878 pv[count] = RExC_npar;
9879 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9882 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9883 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9886 SvIV_set(sv_dat, 1);
9889 /* Yes this does cause a memory leak in debugging Perls
9891 if (!av_store(RExC_paren_name_list,
9892 RExC_npar, SvREFCNT_inc(svname)))
9893 SvREFCNT_dec_NN(svname);
9896 /*sv_dump(sv_dat);*/
9898 nextchar(pRExC_state);
9900 goto capturing_parens;
9902 RExC_seen |= REG_LOOKBEHIND_SEEN;
9903 RExC_in_lookbehind++;
9906 case '=': /* (?=...) */
9907 RExC_seen_zerolen++;
9909 case '!': /* (?!...) */
9910 RExC_seen_zerolen++;
9911 if (*RExC_parse == ')') {
9912 ret=reg_node(pRExC_state, OPFAIL);
9913 nextchar(pRExC_state);
9917 case '|': /* (?|...) */
9918 /* branch reset, behave like a (?:...) except that
9919 buffers in alternations share the same numbers */
9921 after_freeze = freeze_paren = RExC_npar;
9923 case ':': /* (?:...) */
9924 case '>': /* (?>...) */
9926 case '$': /* (?$...) */
9927 case '@': /* (?@...) */
9928 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9930 case '0' : /* (?0) */
9931 case 'R' : /* (?R) */
9932 if (*RExC_parse != ')')
9933 FAIL("Sequence (?R) not terminated");
9934 ret = reg_node(pRExC_state, GOSTART);
9935 RExC_seen |= REG_GOSTART_SEEN;
9936 *flagp |= POSTPONED;
9937 nextchar(pRExC_state);
9940 /* named and numeric backreferences */
9941 case '&': /* (?&NAME) */
9942 parse_start = RExC_parse - 1;
9945 SV *sv_dat = reg_scan_name(pRExC_state,
9946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9947 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9949 if (RExC_parse == RExC_end || *RExC_parse != ')')
9950 vFAIL("Sequence (?&... not terminated");
9951 goto gen_recurse_regop;
9952 assert(0); /* NOT REACHED */
9954 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9956 vFAIL("Illegal pattern");
9958 goto parse_recursion;
9960 case '-': /* (?-1) */
9961 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9962 RExC_parse--; /* rewind to let it be handled later */
9966 case '1': case '2': case '3': case '4': /* (?1) */
9967 case '5': case '6': case '7': case '8': case '9':
9971 bool is_neg = FALSE;
9972 parse_start = RExC_parse - 1; /* MJD */
9973 if (*RExC_parse == '-') {
9977 num = grok_atou(RExC_parse, &endptr);
9979 RExC_parse = (char*)endptr;
9981 /* Some limit for num? */
9985 if (*RExC_parse!=')')
9986 vFAIL("Expecting close bracket");
9989 if ( paren == '-' ) {
9991 Diagram of capture buffer numbering.
9992 Top line is the normal capture buffer numbers
9993 Bottom line is the negative indexing as from
9997 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10001 num = RExC_npar + num;
10004 vFAIL("Reference to nonexistent group");
10006 } else if ( paren == '+' ) {
10007 num = RExC_npar + num - 1;
10010 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10012 if (num > (I32)RExC_rx->nparens) {
10014 vFAIL("Reference to nonexistent group");
10016 RExC_recurse_count++;
10017 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10018 "Recurse #%"UVuf" to %"IVdf"\n",
10019 (UV)ARG(ret), (IV)ARG2L(ret)));
10021 RExC_seen |= REG_RECURSE_SEEN;
10022 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10023 Set_Node_Offset(ret, parse_start); /* MJD */
10025 *flagp |= POSTPONED;
10026 nextchar(pRExC_state);
10029 assert(0); /* NOT REACHED */
10031 case '?': /* (??...) */
10033 if (*RExC_parse != '{') {
10035 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10037 "Sequence (%"UTF8f"...) not recognized",
10038 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10041 *flagp |= POSTPONED;
10042 paren = *RExC_parse++;
10044 case '{': /* (?{...}) */
10047 struct reg_code_block *cb;
10049 RExC_seen_zerolen++;
10051 if ( !pRExC_state->num_code_blocks
10052 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10053 || pRExC_state->code_blocks[pRExC_state->code_index].start
10054 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10057 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10058 FAIL("panic: Sequence (?{...}): no code block found\n");
10059 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10061 /* this is a pre-compiled code block (?{...}) */
10062 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10063 RExC_parse = RExC_start + cb->end;
10066 if (cb->src_regex) {
10067 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10068 RExC_rxi->data->data[n] =
10069 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10070 RExC_rxi->data->data[n+1] = (void*)o;
10073 n = add_data(pRExC_state,
10074 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10075 RExC_rxi->data->data[n] = (void*)o;
10078 pRExC_state->code_index++;
10079 nextchar(pRExC_state);
10083 ret = reg_node(pRExC_state, LOGICAL);
10085 eval = reg2Lanode(pRExC_state, EVAL,
10088 /* for later propagation into (??{})
10090 RExC_flags & RXf_PMf_COMPILETIME
10095 REGTAIL(pRExC_state, ret, eval);
10096 /* deal with the length of this later - MJD */
10099 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10100 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10101 Set_Node_Offset(ret, parse_start);
10104 case '(': /* (?(?{...})...) and (?(?=...)...) */
10107 const int DEFINE_len = sizeof("DEFINE") - 1;
10108 if (RExC_parse[0] == '?') { /* (?(?...)) */
10109 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10110 || RExC_parse[1] == '<'
10111 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10115 ret = reg_node(pRExC_state, LOGICAL);
10119 tail = reg(pRExC_state, 1, &flag, depth+1);
10120 if (flag & RESTART_UTF8) {
10121 *flagp = RESTART_UTF8;
10124 REGTAIL(pRExC_state, ret, tail);
10127 /* Fall through to ‘Unknown switch condition’ at the
10128 end of the if/else chain. */
10130 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10131 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10133 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10134 char *name_start= RExC_parse++;
10136 SV *sv_dat=reg_scan_name(pRExC_state,
10137 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10138 if (RExC_parse == name_start || *RExC_parse != ch)
10139 vFAIL2("Sequence (?(%c... not terminated",
10140 (ch == '>' ? '<' : ch));
10143 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10144 RExC_rxi->data->data[num]=(void*)sv_dat;
10145 SvREFCNT_inc_simple_void(sv_dat);
10147 ret = reganode(pRExC_state,NGROUPP,num);
10148 goto insert_if_check_paren;
10150 else if (strnEQ(RExC_parse, "DEFINE",
10151 MIN(DEFINE_len, RExC_end - RExC_parse)))
10153 ret = reganode(pRExC_state,DEFINEP,0);
10154 RExC_parse += DEFINE_len;
10156 goto insert_if_check_paren;
10158 else if (RExC_parse[0] == 'R') {
10161 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10162 parno = grok_atou(RExC_parse, &endptr);
10164 RExC_parse = (char*)endptr;
10165 } else if (RExC_parse[0] == '&') {
10168 sv_dat = reg_scan_name(pRExC_state,
10170 ? REG_RSN_RETURN_NULL
10171 : REG_RSN_RETURN_DATA);
10172 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10174 ret = reganode(pRExC_state,INSUBP,parno);
10175 goto insert_if_check_paren;
10177 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10181 parno = grok_atou(RExC_parse, &endptr);
10183 RExC_parse = (char*)endptr;
10184 ret = reganode(pRExC_state, GROUPP, parno);
10186 insert_if_check_paren:
10187 if (*(tmp = nextchar(pRExC_state)) != ')') {
10188 /* nextchar also skips comments, so undo its work
10189 * and skip over the the next character.
10192 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10193 vFAIL("Switch condition not recognized");
10196 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10197 br = regbranch(pRExC_state, &flags, 1,depth+1);
10199 if (flags & RESTART_UTF8) {
10200 *flagp = RESTART_UTF8;
10203 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10206 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10208 c = *nextchar(pRExC_state);
10209 if (flags&HASWIDTH)
10210 *flagp |= HASWIDTH;
10213 vFAIL("(?(DEFINE)....) does not allow branches");
10215 /* Fake one for optimizer. */
10216 lastbr = reganode(pRExC_state, IFTHEN, 0);
10218 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10219 if (flags & RESTART_UTF8) {
10220 *flagp = RESTART_UTF8;
10223 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10226 REGTAIL(pRExC_state, ret, lastbr);
10227 if (flags&HASWIDTH)
10228 *flagp |= HASWIDTH;
10229 c = *nextchar(pRExC_state);
10234 if (RExC_parse>RExC_end)
10235 vFAIL("Switch (?(condition)... not terminated");
10237 vFAIL("Switch (?(condition)... contains too many branches");
10239 ender = reg_node(pRExC_state, TAIL);
10240 REGTAIL(pRExC_state, br, ender);
10242 REGTAIL(pRExC_state, lastbr, ender);
10243 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10246 REGTAIL(pRExC_state, ret, ender);
10247 RExC_size++; /* XXX WHY do we need this?!!
10248 For large programs it seems to be required
10249 but I can't figure out why. -- dmq*/
10252 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10253 vFAIL("Unknown switch condition (?(...))");
10255 case '[': /* (?[ ... ]) */
10256 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10259 RExC_parse--; /* for vFAIL to print correctly */
10260 vFAIL("Sequence (? incomplete");
10262 default: /* e.g., (?i) */
10265 parse_lparen_question_flags(pRExC_state);
10266 if (UCHARAT(RExC_parse) != ':') {
10267 nextchar(pRExC_state);
10272 nextchar(pRExC_state);
10282 ret = reganode(pRExC_state, OPEN, parno);
10284 if (!RExC_nestroot)
10285 RExC_nestroot = parno;
10286 if (RExC_seen & REG_RECURSE_SEEN
10287 && !RExC_open_parens[parno-1])
10289 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10290 "Setting open paren #%"IVdf" to %d\n",
10291 (IV)parno, REG_NODE_NUM(ret)));
10292 RExC_open_parens[parno-1]= ret;
10295 Set_Node_Length(ret, 1); /* MJD */
10296 Set_Node_Offset(ret, RExC_parse); /* MJD */
10304 /* Pick up the branches, linking them together. */
10305 parse_start = RExC_parse; /* MJD */
10306 br = regbranch(pRExC_state, &flags, 1,depth+1);
10308 /* branch_len = (paren != 0); */
10311 if (flags & RESTART_UTF8) {
10312 *flagp = RESTART_UTF8;
10315 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10317 if (*RExC_parse == '|') {
10318 if (!SIZE_ONLY && RExC_extralen) {
10319 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10322 reginsert(pRExC_state, BRANCH, br, depth+1);
10323 Set_Node_Length(br, paren != 0);
10324 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10328 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10330 else if (paren == ':') {
10331 *flagp |= flags&SIMPLE;
10333 if (is_open) { /* Starts with OPEN. */
10334 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10336 else if (paren != '?') /* Not Conditional */
10338 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10340 while (*RExC_parse == '|') {
10341 if (!SIZE_ONLY && RExC_extralen) {
10342 ender = reganode(pRExC_state, LONGJMP,0);
10344 /* Append to the previous. */
10345 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10348 RExC_extralen += 2; /* Account for LONGJMP. */
10349 nextchar(pRExC_state);
10350 if (freeze_paren) {
10351 if (RExC_npar > after_freeze)
10352 after_freeze = RExC_npar;
10353 RExC_npar = freeze_paren;
10355 br = regbranch(pRExC_state, &flags, 0, depth+1);
10358 if (flags & RESTART_UTF8) {
10359 *flagp = RESTART_UTF8;
10362 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10364 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10366 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10369 if (have_branch || paren != ':') {
10370 /* Make a closing node, and hook it on the end. */
10373 ender = reg_node(pRExC_state, TAIL);
10376 ender = reganode(pRExC_state, CLOSE, parno);
10377 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10378 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10379 "Setting close paren #%"IVdf" to %d\n",
10380 (IV)parno, REG_NODE_NUM(ender)));
10381 RExC_close_parens[parno-1]= ender;
10382 if (RExC_nestroot == parno)
10385 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10386 Set_Node_Length(ender,1); /* MJD */
10392 *flagp &= ~HASWIDTH;
10395 ender = reg_node(pRExC_state, SUCCEED);
10398 ender = reg_node(pRExC_state, END);
10400 assert(!RExC_opend); /* there can only be one! */
10401 RExC_opend = ender;
10405 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10406 SV * const mysv_val1=sv_newmortal();
10407 SV * const mysv_val2=sv_newmortal();
10408 DEBUG_PARSE_MSG("lsbr");
10409 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10410 regprop(RExC_rx, mysv_val2, ender, NULL);
10411 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10412 SvPV_nolen_const(mysv_val1),
10413 (IV)REG_NODE_NUM(lastbr),
10414 SvPV_nolen_const(mysv_val2),
10415 (IV)REG_NODE_NUM(ender),
10416 (IV)(ender - lastbr)
10419 REGTAIL(pRExC_state, lastbr, ender);
10421 if (have_branch && !SIZE_ONLY) {
10422 char is_nothing= 1;
10424 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10426 /* Hook the tails of the branches to the closing node. */
10427 for (br = ret; br; br = regnext(br)) {
10428 const U8 op = PL_regkind[OP(br)];
10429 if (op == BRANCH) {
10430 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10431 if ( OP(NEXTOPER(br)) != NOTHING
10432 || regnext(NEXTOPER(br)) != ender)
10435 else if (op == BRANCHJ) {
10436 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10437 /* for now we always disable this optimisation * /
10438 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10439 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10445 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10446 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10447 SV * const mysv_val1=sv_newmortal();
10448 SV * const mysv_val2=sv_newmortal();
10449 DEBUG_PARSE_MSG("NADA");
10450 regprop(RExC_rx, mysv_val1, ret, NULL);
10451 regprop(RExC_rx, mysv_val2, ender, NULL);
10452 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10453 SvPV_nolen_const(mysv_val1),
10454 (IV)REG_NODE_NUM(ret),
10455 SvPV_nolen_const(mysv_val2),
10456 (IV)REG_NODE_NUM(ender),
10461 if (OP(ender) == TAIL) {
10466 for ( opt= br + 1; opt < ender ; opt++ )
10467 OP(opt)= OPTIMIZED;
10468 NEXT_OFF(br)= ender - br;
10476 static const char parens[] = "=!<,>";
10478 if (paren && (p = strchr(parens, paren))) {
10479 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10480 int flag = (p - parens) > 1;
10483 node = SUSPEND, flag = 0;
10484 reginsert(pRExC_state, node,ret, depth+1);
10485 Set_Node_Cur_Length(ret, parse_start);
10486 Set_Node_Offset(ret, parse_start + 1);
10488 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10492 /* Check for proper termination. */
10494 /* restore original flags, but keep (?p) */
10495 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10496 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10497 RExC_parse = oregcomp_parse;
10498 vFAIL("Unmatched (");
10501 else if (!paren && RExC_parse < RExC_end) {
10502 if (*RExC_parse == ')') {
10504 vFAIL("Unmatched )");
10507 FAIL("Junk on end of regexp"); /* "Can't happen". */
10508 assert(0); /* NOTREACHED */
10511 if (RExC_in_lookbehind) {
10512 RExC_in_lookbehind--;
10514 if (after_freeze > RExC_npar)
10515 RExC_npar = after_freeze;
10520 - regbranch - one alternative of an | operator
10522 * Implements the concatenation operator.
10524 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10528 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10531 regnode *chain = NULL;
10533 I32 flags = 0, c = 0;
10534 GET_RE_DEBUG_FLAGS_DECL;
10536 PERL_ARGS_ASSERT_REGBRANCH;
10538 DEBUG_PARSE("brnc");
10543 if (!SIZE_ONLY && RExC_extralen)
10544 ret = reganode(pRExC_state, BRANCHJ,0);
10546 ret = reg_node(pRExC_state, BRANCH);
10547 Set_Node_Length(ret, 1);
10551 if (!first && SIZE_ONLY)
10552 RExC_extralen += 1; /* BRANCHJ */
10554 *flagp = WORST; /* Tentatively. */
10557 nextchar(pRExC_state);
10558 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10559 flags &= ~TRYAGAIN;
10560 latest = regpiece(pRExC_state, &flags,depth+1);
10561 if (latest == NULL) {
10562 if (flags & TRYAGAIN)
10564 if (flags & RESTART_UTF8) {
10565 *flagp = RESTART_UTF8;
10568 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10570 else if (ret == NULL)
10572 *flagp |= flags&(HASWIDTH|POSTPONED);
10573 if (chain == NULL) /* First piece. */
10574 *flagp |= flags&SPSTART;
10577 REGTAIL(pRExC_state, chain, latest);
10582 if (chain == NULL) { /* Loop ran zero times. */
10583 chain = reg_node(pRExC_state, NOTHING);
10588 *flagp |= flags&SIMPLE;
10595 - regpiece - something followed by possible [*+?]
10597 * Note that the branching code sequences used for ? and the general cases
10598 * of * and + are somewhat optimized: they use the same NOTHING node as
10599 * both the endmarker for their branch list and the body of the last branch.
10600 * It might seem that this node could be dispensed with entirely, but the
10601 * endmarker role is not redundant.
10603 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10605 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10609 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10615 const char * const origparse = RExC_parse;
10617 I32 max = REG_INFTY;
10618 #ifdef RE_TRACK_PATTERN_OFFSETS
10621 const char *maxpos = NULL;
10623 /* Save the original in case we change the emitted regop to a FAIL. */
10624 regnode * const orig_emit = RExC_emit;
10626 GET_RE_DEBUG_FLAGS_DECL;
10628 PERL_ARGS_ASSERT_REGPIECE;
10630 DEBUG_PARSE("piec");
10632 ret = regatom(pRExC_state, &flags,depth+1);
10634 if (flags & (TRYAGAIN|RESTART_UTF8))
10635 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10637 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10643 if (op == '{' && regcurly(RExC_parse)) {
10645 #ifdef RE_TRACK_PATTERN_OFFSETS
10646 parse_start = RExC_parse; /* MJD */
10648 next = RExC_parse + 1;
10649 while (isDIGIT(*next) || *next == ',') {
10650 if (*next == ',') {
10658 if (*next == '}') { /* got one */
10659 const char* endptr;
10663 min = grok_atou(RExC_parse, &endptr);
10664 if (*maxpos == ',')
10667 maxpos = RExC_parse;
10668 max = grok_atou(maxpos, &endptr);
10669 if (!max && *maxpos != '0')
10670 max = REG_INFTY; /* meaning "infinity" */
10671 else if (max >= REG_INFTY)
10672 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10674 nextchar(pRExC_state);
10675 if (max < min) { /* If can't match, warn and optimize to fail
10679 /* We can't back off the size because we have to reserve
10680 * enough space for all the things we are about to throw
10681 * away, but we can shrink it by the ammount we are about
10682 * to re-use here */
10683 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10686 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10687 RExC_emit = orig_emit;
10689 ret = reg_node(pRExC_state, OPFAIL);
10692 else if (min == max
10693 && RExC_parse < RExC_end
10694 && (*RExC_parse == '?' || *RExC_parse == '+'))
10697 ckWARN2reg(RExC_parse + 1,
10698 "Useless use of greediness modifier '%c'",
10701 /* Absorb the modifier, so later code doesn't see nor use
10703 nextchar(pRExC_state);
10707 if ((flags&SIMPLE)) {
10708 RExC_naughty += 2 + RExC_naughty / 2;
10709 reginsert(pRExC_state, CURLY, ret, depth+1);
10710 Set_Node_Offset(ret, parse_start+1); /* MJD */
10711 Set_Node_Cur_Length(ret, parse_start);
10714 regnode * const w = reg_node(pRExC_state, WHILEM);
10717 REGTAIL(pRExC_state, ret, w);
10718 if (!SIZE_ONLY && RExC_extralen) {
10719 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10720 reginsert(pRExC_state, NOTHING,ret, depth+1);
10721 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10723 reginsert(pRExC_state, CURLYX,ret, depth+1);
10725 Set_Node_Offset(ret, parse_start+1);
10726 Set_Node_Length(ret,
10727 op == '{' ? (RExC_parse - parse_start) : 1);
10729 if (!SIZE_ONLY && RExC_extralen)
10730 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10731 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10733 RExC_whilem_seen++, RExC_extralen += 3;
10734 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10741 *flagp |= HASWIDTH;
10743 ARG1_SET(ret, (U16)min);
10744 ARG2_SET(ret, (U16)max);
10746 if (max == REG_INFTY)
10747 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10753 if (!ISMULT1(op)) {
10758 #if 0 /* Now runtime fix should be reliable. */
10760 /* if this is reinstated, don't forget to put this back into perldiag:
10762 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10764 (F) The part of the regexp subject to either the * or + quantifier
10765 could match an empty string. The {#} shows in the regular
10766 expression about where the problem was discovered.
10770 if (!(flags&HASWIDTH) && op != '?')
10771 vFAIL("Regexp *+ operand could be empty");
10774 #ifdef RE_TRACK_PATTERN_OFFSETS
10775 parse_start = RExC_parse;
10777 nextchar(pRExC_state);
10779 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10781 if (op == '*' && (flags&SIMPLE)) {
10782 reginsert(pRExC_state, STAR, ret, depth+1);
10785 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10787 else if (op == '*') {
10791 else if (op == '+' && (flags&SIMPLE)) {
10792 reginsert(pRExC_state, PLUS, ret, depth+1);
10795 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10797 else if (op == '+') {
10801 else if (op == '?') {
10806 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10807 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10808 ckWARN2reg(RExC_parse,
10809 "%"UTF8f" matches null string many times",
10810 UTF8fARG(UTF, (RExC_parse >= origparse
10811 ? RExC_parse - origparse
10814 (void)ReREFCNT_inc(RExC_rx_sv);
10817 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10818 nextchar(pRExC_state);
10819 reginsert(pRExC_state, MINMOD, ret, depth+1);
10820 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10823 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10825 nextchar(pRExC_state);
10826 ender = reg_node(pRExC_state, SUCCEED);
10827 REGTAIL(pRExC_state, ret, ender);
10828 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10830 ender = reg_node(pRExC_state, TAIL);
10831 REGTAIL(pRExC_state, ret, ender);
10834 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10836 vFAIL("Nested quantifiers");
10843 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10844 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10848 /* This is expected to be called by a parser routine that has recognized '\N'
10849 and needs to handle the rest. RExC_parse is expected to point at the first
10850 char following the N at the time of the call. On successful return,
10851 RExC_parse has been updated to point to just after the sequence identified
10852 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10853 have been set appropriately.
10855 The typical case for this is \N{some character name}. This is usually
10856 called while parsing the input, filling in or ready to fill in an EXACTish
10857 node, and the code point for the character should be returned, so that it
10858 can be added to the node, and parsing continued with the next input
10859 character. But it may be that instead of a single character the \N{}
10860 expands to more than one, a named sequence. In this case any following
10861 quantifier applies to the whole sequence, and it is easier, given the code
10862 structure that calls this, to handle it from a different area of the code.
10863 For this reason, the input parameters can be set so that it returns valid
10864 only on one or the other of these cases.
10866 Another possibility is for the input to be an empty \N{}, which for
10867 backwards compatibility we accept, but generate a NOTHING node which should
10868 later get optimized out. This is handled from the area of code which can
10869 handle a named sequence, so if called with the parameters for the other, it
10872 Still another possibility is for the \N to mean [^\n], and not a single
10873 character or explicit sequence at all. This is determined by context.
10874 Again, this is handled from the area of code which can handle a named
10875 sequence, so if called with the parameters for the other, it also fails.
10877 And the final possibility is for the \N to be called from within a bracketed
10878 character class. In this case the [^\n] meaning makes no sense, and so is
10879 an error. Other anomalous situations are left to the calling code to handle.
10881 For non-single-quoted regexes, the tokenizer has attempted to decide which
10882 of the above applies, and in the case of a named sequence, has converted it
10883 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10884 where c1... are the characters in the sequence. For single-quoted regexes,
10885 the tokenizer passes the \N sequence through unchanged; this code will not
10886 attempt to determine this nor expand those, instead raising a syntax error.
10887 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10888 or there is no '}', it signals that this \N occurrence means to match a
10889 non-newline. (This mostly was done because of [perl #56444].)
10891 The API is somewhat convoluted due to historical and the above reasons.
10893 The function raises an error (via vFAIL), and doesn't return for various
10894 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
10895 it returns a count of how many characters were accounted for by it. (This
10896 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
10897 points in the sequence. It sets <node_p>, <valuep>, and/or
10898 <substitute_parse> on success.
10900 If <valuep> is non-null, it means the caller can accept an input sequence
10901 consisting of a just a single code point; <*valuep> is set to the value
10902 of the only or first code point in the input.
10904 If <substitute_parse> is non-null, it means the caller can accept an input
10905 sequence consisting of one or more code points; <*substitute_parse> is a
10906 newly created mortal SV* in this case, containing \x{} escapes representing
10909 Both <valuep> and <substitute_parse> can be non-NULL.
10911 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
10912 that the caller can accept any legal sequence other than a single code
10913 point. To wit, <*node_p> is set as follows:
10914 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
10915 2) \N{}: points to a new NOTHING node; return is 0
10916 3) otherwise: points to a new EXACT node containing the resolved
10917 string; return is the number of code points in the
10918 string. This will never be 1.
10919 Note that failure is returned for single code point sequences if <valuep> is
10920 null and <node_p> is not.
10923 char * endbrace; /* '}' following the name */
10925 char *endchar; /* Points to '.' or '}' ending cur char in the input
10927 bool has_multiple_chars; /* true if the input stream contains a sequence of
10928 more than one character */
10929 bool in_char_class = substitute_parse != NULL;
10930 STRLEN count = 0; /* Number of characters in this sequence */
10932 GET_RE_DEBUG_FLAGS_DECL;
10934 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10936 GET_RE_DEBUG_FLAGS;
10938 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10939 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
10941 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10942 * modifier. The other meaning does not, so use a temporary until we find
10943 * out which we are being called with */
10944 p = (RExC_flags & RXf_PMf_EXTENDED)
10945 ? regpatws(pRExC_state, RExC_parse,
10946 TRUE) /* means recognize comments */
10949 /* Disambiguate between \N meaning a named character versus \N meaning
10950 * [^\n]. The former is assumed when it can't be the latter. */
10951 if (*p != '{' || regcurly(p)) {
10954 /* no bare \N allowed in a charclass */
10955 if (in_char_class) {
10956 vFAIL("\\N in a character class must be a named character: \\N{...}");
10958 return (STRLEN) -1;
10960 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10962 nextchar(pRExC_state);
10963 *node_p = reg_node(pRExC_state, REG_ANY);
10964 *flagp |= HASWIDTH|SIMPLE;
10966 Set_Node_Length(*node_p, 1); /* MJD */
10970 /* Here, we have decided it should be a named character or sequence */
10972 /* The test above made sure that the next real character is a '{', but
10973 * under the /x modifier, it could be separated by space (or a comment and
10974 * \n) and this is not allowed (for consistency with \x{...} and the
10975 * tokenizer handling of \N{NAME}). */
10976 if (*RExC_parse != '{') {
10977 vFAIL("Missing braces on \\N{}");
10980 RExC_parse++; /* Skip past the '{' */
10982 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10983 || ! (endbrace == RExC_parse /* nothing between the {} */
10984 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10986 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10989 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10990 vFAIL("\\N{NAME} must be resolved by the lexer");
10993 if (endbrace == RExC_parse) { /* empty: \N{} */
10995 *node_p = reg_node(pRExC_state,NOTHING);
10997 else if (! in_char_class) {
10998 return (STRLEN) -1;
11000 nextchar(pRExC_state);
11004 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11005 RExC_parse += 2; /* Skip past the 'U+' */
11007 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11009 /* Code points are separated by dots. If none, there is only one code
11010 * point, and is terminated by the brace */
11011 has_multiple_chars = (endchar < endbrace);
11013 /* We get the first code point if we want it, and either there is only one,
11014 * or we can accept both cases of one and more than one */
11015 if (valuep && (substitute_parse || ! has_multiple_chars)) {
11016 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11017 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11018 | PERL_SCAN_DISALLOW_PREFIX
11020 /* No errors in the first pass (See [perl
11021 * #122671].) We let the code below find the
11022 * errors when there are multiple chars. */
11023 | ((SIZE_ONLY || has_multiple_chars)
11024 ? PERL_SCAN_SILENT_ILLDIGIT
11027 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11029 /* The tokenizer should have guaranteed validity, but it's possible to
11030 * bypass it by using single quoting, so check. Don't do the check
11031 * here when there are multiple chars; we do it below anyway. */
11032 if (! has_multiple_chars) {
11033 if (length_of_hex == 0
11034 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11036 RExC_parse += length_of_hex; /* Includes all the valid */
11037 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11038 ? UTF8SKIP(RExC_parse)
11040 /* Guard against malformed utf8 */
11041 if (RExC_parse >= endchar) {
11042 RExC_parse = endchar;
11044 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11047 RExC_parse = endbrace + 1;
11052 /* Here, we should have already handled the case where a single character
11053 * is expected and found. So it is a failure if we aren't expecting
11054 * multiple chars and got them; or didn't get them but wanted them. We
11055 * fail without advancing the parse, so that the caller can try again with
11056 * different acceptance criteria */
11057 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11059 return (STRLEN) -1;
11064 /* What is done here is to convert this to a sub-pattern of the form
11065 * \x{char1}\x{char2}...
11066 * and then either return it in <*substitute_parse> if non-null; or
11067 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
11068 * way, it retains its atomicness, while not having to worry about
11069 * special handling that some code points may have. toke.c has
11070 * converted the original Unicode values to native, so that we can just
11071 * pass on the hex values unchanged. We do have to set a flag to keep
11072 * recoding from happening in the recursion */
11076 char *orig_end = RExC_end;
11079 if (substitute_parse) {
11080 *substitute_parse = newSVpvs("");
11083 substitute_parse = &dummy;
11084 *substitute_parse = newSVpvs("?:");
11086 *substitute_parse = sv_2mortal(*substitute_parse);
11088 while (RExC_parse < endbrace) {
11090 /* Convert to notation the rest of the code understands */
11091 sv_catpv(*substitute_parse, "\\x{");
11092 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11093 sv_catpv(*substitute_parse, "}");
11095 /* Point to the beginning of the next character in the sequence. */
11096 RExC_parse = endchar + 1;
11097 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11101 if (! in_char_class) {
11102 sv_catpv(*substitute_parse, ")");
11105 RExC_parse = SvPV(*substitute_parse, len);
11107 /* Don't allow empty number */
11108 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11109 RExC_parse = endbrace;
11110 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11112 RExC_end = RExC_parse + len;
11114 /* The values are Unicode, and therefore not subject to recoding */
11115 RExC_override_recoding = 1;
11118 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11119 if (flags & RESTART_UTF8) {
11120 *flagp = RESTART_UTF8;
11121 return (STRLEN) -1;
11123 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11126 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11129 RExC_parse = endbrace;
11130 RExC_end = orig_end;
11131 RExC_override_recoding = 0;
11133 nextchar(pRExC_state);
11143 * It returns the code point in utf8 for the value in *encp.
11144 * value: a code value in the source encoding
11145 * encp: a pointer to an Encode object
11147 * If the result from Encode is not a single character,
11148 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11151 S_reg_recode(pTHX_ const char value, SV **encp)
11154 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11155 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11156 const STRLEN newlen = SvCUR(sv);
11157 UV uv = UNICODE_REPLACEMENT;
11159 PERL_ARGS_ASSERT_REG_RECODE;
11163 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11166 if (!newlen || numlen != newlen) {
11167 uv = UNICODE_REPLACEMENT;
11173 PERL_STATIC_INLINE U8
11174 S_compute_EXACTish(RExC_state_t *pRExC_state)
11178 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11184 op = get_regex_charset(RExC_flags);
11185 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11186 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11187 been, so there is no hole */
11190 return op + EXACTF;
11193 PERL_STATIC_INLINE void
11194 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11195 regnode *node, I32* flagp, STRLEN len, UV code_point,
11198 /* This knows the details about sizing an EXACTish node, setting flags for
11199 * it (by setting <*flagp>, and potentially populating it with a single
11202 * If <len> (the length in bytes) is non-zero, this function assumes that
11203 * the node has already been populated, and just does the sizing. In this
11204 * case <code_point> should be the final code point that has already been
11205 * placed into the node. This value will be ignored except that under some
11206 * circumstances <*flagp> is set based on it.
11208 * If <len> is zero, the function assumes that the node is to contain only
11209 * the single character given by <code_point> and calculates what <len>
11210 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11211 * additionally will populate the node's STRING with <code_point> or its
11214 * In both cases <*flagp> is appropriately set
11216 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11217 * 255, must be folded (the former only when the rules indicate it can
11220 * When it does the populating, it looks at the flag 'downgradable'. If
11221 * true with a node that folds, it checks if the single code point
11222 * participates in a fold, and if not downgrades the node to an EXACT.
11223 * This helps the optimizer */
11225 bool len_passed_in = cBOOL(len != 0);
11226 U8 character[UTF8_MAXBYTES_CASE+1];
11228 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11230 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11231 * sizing difference, and is extra work that is thrown away */
11232 if (downgradable && ! PASS2) {
11233 downgradable = FALSE;
11236 if (! len_passed_in) {
11238 if (UNI_IS_INVARIANT(code_point)) {
11239 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11240 *character = (U8) code_point;
11242 else { /* Here is /i and not /l. (toFOLD() is defined on just
11243 ASCII, which isn't the same thing as INVARIANT on
11244 EBCDIC, but it works there, as the extra invariants
11245 fold to themselves) */
11246 *character = toFOLD((U8) code_point);
11248 /* We can downgrade to an EXACT node if this character
11249 * isn't a folding one. Note that this assumes that
11250 * nothing above Latin1 folds to some other invariant than
11251 * one of these alphabetics; otherwise we would also have
11253 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11254 * || ASCII_FOLD_RESTRICTED))
11256 if (downgradable && PL_fold[code_point] == code_point) {
11262 else if (FOLD && (! LOC
11263 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11264 { /* Folding, and ok to do so now */
11265 UV folded = _to_uni_fold_flags(
11269 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11270 ? FOLD_FLAGS_NOMIX_ASCII
11273 && folded == code_point /* This quickly rules out many
11274 cases, avoiding the
11275 _invlist_contains_cp() overhead
11277 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11282 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11284 /* Not folding this cp, and can output it directly */
11285 *character = UTF8_TWO_BYTE_HI(code_point);
11286 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11290 uvchr_to_utf8( character, code_point);
11291 len = UTF8SKIP(character);
11293 } /* Else pattern isn't UTF8. */
11295 *character = (U8) code_point;
11297 } /* Else is folded non-UTF8 */
11298 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11300 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11301 * comments at join_exact()); */
11302 *character = (U8) code_point;
11305 /* Can turn into an EXACT node if we know the fold at compile time,
11306 * and it folds to itself and doesn't particpate in other folds */
11309 && PL_fold_latin1[code_point] == code_point
11310 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11311 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11315 } /* else is Sharp s. May need to fold it */
11316 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11318 *(character + 1) = 's';
11322 *character = LATIN_SMALL_LETTER_SHARP_S;
11328 RExC_size += STR_SZ(len);
11331 RExC_emit += STR_SZ(len);
11332 STR_LEN(node) = len;
11333 if (! len_passed_in) {
11334 Copy((char *) character, STRING(node), len, char);
11338 *flagp |= HASWIDTH;
11340 /* A single character node is SIMPLE, except for the special-cased SHARP S
11342 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11343 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11344 || ! FOLD || ! DEPENDS_SEMANTICS))
11349 /* The OP may not be well defined in PASS1 */
11350 if (PASS2 && OP(node) == EXACTFL) {
11351 RExC_contains_locale = 1;
11356 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11357 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11360 S_backref_value(char *p)
11362 const char* endptr;
11363 UV val = grok_atou(p, &endptr);
11364 if (endptr == p || endptr == NULL || val > I32_MAX)
11371 - regatom - the lowest level
11373 Try to identify anything special at the start of the pattern. If there
11374 is, then handle it as required. This may involve generating a single regop,
11375 such as for an assertion; or it may involve recursing, such as to
11376 handle a () structure.
11378 If the string doesn't start with something special then we gobble up
11379 as much literal text as we can.
11381 Once we have been able to handle whatever type of thing started the
11382 sequence, we return.
11384 Note: we have to be careful with escapes, as they can be both literal
11385 and special, and in the case of \10 and friends, context determines which.
11387 A summary of the code structure is:
11389 switch (first_byte) {
11390 cases for each special:
11391 handle this special;
11394 switch (2nd byte) {
11395 cases for each unambiguous special:
11396 handle this special;
11398 cases for each ambigous special/literal:
11400 if (special) handle here
11402 default: // unambiguously literal:
11405 default: // is a literal char
11408 create EXACTish node for literal;
11409 while (more input and node isn't full) {
11410 switch (input_byte) {
11411 cases for each special;
11412 make sure parse pointer is set so that the next call to
11413 regatom will see this special first
11414 goto loopdone; // EXACTish node terminated by prev. char
11416 append char to EXACTISH node;
11418 get next input byte;
11422 return the generated node;
11424 Specifically there are two separate switches for handling
11425 escape sequences, with the one for handling literal escapes requiring
11426 a dummy entry for all of the special escapes that are actually handled
11429 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11431 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11433 Otherwise does not return NULL.
11437 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11439 regnode *ret = NULL;
11441 char *parse_start = RExC_parse;
11446 GET_RE_DEBUG_FLAGS_DECL;
11448 *flagp = WORST; /* Tentatively. */
11450 DEBUG_PARSE("atom");
11452 PERL_ARGS_ASSERT_REGATOM;
11455 switch ((U8)*RExC_parse) {
11457 RExC_seen_zerolen++;
11458 nextchar(pRExC_state);
11459 if (RExC_flags & RXf_PMf_MULTILINE)
11460 ret = reg_node(pRExC_state, MBOL);
11462 ret = reg_node(pRExC_state, SBOL);
11463 Set_Node_Length(ret, 1); /* MJD */
11466 nextchar(pRExC_state);
11468 RExC_seen_zerolen++;
11469 if (RExC_flags & RXf_PMf_MULTILINE)
11470 ret = reg_node(pRExC_state, MEOL);
11472 ret = reg_node(pRExC_state, SEOL);
11473 Set_Node_Length(ret, 1); /* MJD */
11476 nextchar(pRExC_state);
11477 if (RExC_flags & RXf_PMf_SINGLELINE)
11478 ret = reg_node(pRExC_state, SANY);
11480 ret = reg_node(pRExC_state, REG_ANY);
11481 *flagp |= HASWIDTH|SIMPLE;
11483 Set_Node_Length(ret, 1); /* MJD */
11487 char * const oregcomp_parse = ++RExC_parse;
11488 ret = regclass(pRExC_state, flagp,depth+1,
11489 FALSE, /* means parse the whole char class */
11490 TRUE, /* allow multi-char folds */
11491 FALSE, /* don't silence non-portable warnings. */
11493 if (*RExC_parse != ']') {
11494 RExC_parse = oregcomp_parse;
11495 vFAIL("Unmatched [");
11498 if (*flagp & RESTART_UTF8)
11500 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11503 nextchar(pRExC_state);
11504 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11508 nextchar(pRExC_state);
11509 ret = reg(pRExC_state, 2, &flags,depth+1);
11511 if (flags & TRYAGAIN) {
11512 if (RExC_parse == RExC_end) {
11513 /* Make parent create an empty node if needed. */
11514 *flagp |= TRYAGAIN;
11519 if (flags & RESTART_UTF8) {
11520 *flagp = RESTART_UTF8;
11523 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11526 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11530 if (flags & TRYAGAIN) {
11531 *flagp |= TRYAGAIN;
11534 vFAIL("Internal urp");
11535 /* Supposed to be caught earlier. */
11541 vFAIL("Quantifier follows nothing");
11546 This switch handles escape sequences that resolve to some kind
11547 of special regop and not to literal text. Escape sequnces that
11548 resolve to literal text are handled below in the switch marked
11551 Every entry in this switch *must* have a corresponding entry
11552 in the literal escape switch. However, the opposite is not
11553 required, as the default for this switch is to jump to the
11554 literal text handling code.
11556 switch ((U8)*++RExC_parse) {
11557 /* Special Escapes */
11559 RExC_seen_zerolen++;
11560 ret = reg_node(pRExC_state, SBOL);
11561 /* SBOL is shared with /^/ so we set the flags so we can tell
11562 * /\A/ from /^/ in split. We check ret because first pass we
11563 * have no regop struct to set the flags on. */
11567 goto finish_meta_pat;
11569 ret = reg_node(pRExC_state, GPOS);
11570 RExC_seen |= REG_GPOS_SEEN;
11572 goto finish_meta_pat;
11574 RExC_seen_zerolen++;
11575 ret = reg_node(pRExC_state, KEEPS);
11577 /* XXX:dmq : disabling in-place substitution seems to
11578 * be necessary here to avoid cases of memory corruption, as
11579 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11581 RExC_seen |= REG_LOOKBEHIND_SEEN;
11582 goto finish_meta_pat;
11584 ret = reg_node(pRExC_state, SEOL);
11586 RExC_seen_zerolen++; /* Do not optimize RE away */
11587 goto finish_meta_pat;
11589 ret = reg_node(pRExC_state, EOS);
11591 RExC_seen_zerolen++; /* Do not optimize RE away */
11592 goto finish_meta_pat;
11594 ret = reg_node(pRExC_state, CANY);
11595 RExC_seen |= REG_CANY_SEEN;
11596 *flagp |= HASWIDTH|SIMPLE;
11598 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11600 goto finish_meta_pat;
11602 ret = reg_node(pRExC_state, CLUMP);
11603 *flagp |= HASWIDTH;
11604 goto finish_meta_pat;
11610 arg = ANYOF_WORDCHAR;
11614 RExC_seen_zerolen++;
11615 RExC_seen |= REG_LOOKBEHIND_SEEN;
11616 op = BOUND + get_regex_charset(RExC_flags);
11617 if (op > BOUNDA) { /* /aa is same as /a */
11620 else if (op == BOUNDL) {
11621 RExC_contains_locale = 1;
11623 ret = reg_node(pRExC_state, op);
11624 FLAGS(ret) = get_regex_charset(RExC_flags);
11626 if ((U8) *(RExC_parse + 1) == '{') {
11627 /* diag_listed_as: Use "%s" instead of "%s" */
11628 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11630 goto finish_meta_pat;
11632 RExC_seen_zerolen++;
11633 RExC_seen |= REG_LOOKBEHIND_SEEN;
11634 op = NBOUND + get_regex_charset(RExC_flags);
11635 if (op > NBOUNDA) { /* /aa is same as /a */
11638 else if (op == NBOUNDL) {
11639 RExC_contains_locale = 1;
11641 ret = reg_node(pRExC_state, op);
11642 FLAGS(ret) = get_regex_charset(RExC_flags);
11644 if ((U8) *(RExC_parse + 1) == '{') {
11645 /* diag_listed_as: Use "%s" instead of "%s" */
11646 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11648 goto finish_meta_pat;
11658 ret = reg_node(pRExC_state, LNBREAK);
11659 *flagp |= HASWIDTH|SIMPLE;
11660 goto finish_meta_pat;
11668 goto join_posix_op_known;
11674 arg = ANYOF_VERTWS;
11676 goto join_posix_op_known;
11686 op = POSIXD + get_regex_charset(RExC_flags);
11687 if (op > POSIXA) { /* /aa is same as /a */
11690 else if (op == POSIXL) {
11691 RExC_contains_locale = 1;
11694 join_posix_op_known:
11697 op += NPOSIXD - POSIXD;
11700 ret = reg_node(pRExC_state, op);
11702 FLAGS(ret) = namedclass_to_classnum(arg);
11705 *flagp |= HASWIDTH|SIMPLE;
11709 nextchar(pRExC_state);
11710 Set_Node_Length(ret, 2); /* MJD */
11716 char* parse_start = RExC_parse - 2;
11721 ret = regclass(pRExC_state, flagp,depth+1,
11722 TRUE, /* means just parse this element */
11723 FALSE, /* don't allow multi-char folds */
11724 FALSE, /* don't silence non-portable warnings.
11725 It would be a bug if these returned
11728 /* regclass() can only return RESTART_UTF8 if multi-char folds
11731 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11736 Set_Node_Offset(ret, parse_start + 2);
11737 Set_Node_Cur_Length(ret, parse_start);
11738 nextchar(pRExC_state);
11742 /* Handle \N and \N{NAME} with multiple code points here and not
11743 * below because it can be multicharacter. join_exact() will join
11744 * them up later on. Also this makes sure that things like
11745 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11746 * The options to the grok function call causes it to fail if the
11747 * sequence is just a single code point. We then go treat it as
11748 * just another character in the current EXACT node, and hence it
11749 * gets uniform treatment with all the other characters. The
11750 * special treatment for quantifiers is not needed for such single
11751 * character sequences */
11753 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11756 if (*flagp & RESTART_UTF8)
11762 case 'k': /* Handle \k<NAME> and \k'NAME' */
11765 char ch= RExC_parse[1];
11766 if (ch != '<' && ch != '\'' && ch != '{') {
11768 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11769 vFAIL2("Sequence %.2s... not terminated",parse_start);
11771 /* this pretty much dupes the code for (?P=...) in reg(), if
11772 you change this make sure you change that */
11773 char* name_start = (RExC_parse += 2);
11775 SV *sv_dat = reg_scan_name(pRExC_state,
11776 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11777 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11778 if (RExC_parse == name_start || *RExC_parse != ch)
11779 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11780 vFAIL2("Sequence %.3s... not terminated",parse_start);
11783 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11784 RExC_rxi->data->data[num]=(void*)sv_dat;
11785 SvREFCNT_inc_simple_void(sv_dat);
11789 ret = reganode(pRExC_state,
11792 : (ASCII_FOLD_RESTRICTED)
11794 : (AT_LEAST_UNI_SEMANTICS)
11800 *flagp |= HASWIDTH;
11802 /* override incorrect value set in reganode MJD */
11803 Set_Node_Offset(ret, parse_start+1);
11804 Set_Node_Cur_Length(ret, parse_start);
11805 nextchar(pRExC_state);
11811 case '1': case '2': case '3': case '4':
11812 case '5': case '6': case '7': case '8': case '9':
11817 if (*RExC_parse == 'g') {
11821 if (*RExC_parse == '{') {
11825 if (*RExC_parse == '-') {
11829 if (hasbrace && !isDIGIT(*RExC_parse)) {
11830 if (isrel) RExC_parse--;
11832 goto parse_named_seq;
11835 num = S_backref_value(RExC_parse);
11837 vFAIL("Reference to invalid group 0");
11838 else if (num == I32_MAX) {
11839 if (isDIGIT(*RExC_parse))
11840 vFAIL("Reference to nonexistent group");
11842 vFAIL("Unterminated \\g... pattern");
11846 num = RExC_npar - num;
11848 vFAIL("Reference to nonexistent or unclosed group");
11852 num = S_backref_value(RExC_parse);
11853 /* bare \NNN might be backref or octal - if it is larger than or equal
11854 * RExC_npar then it is assumed to be and octal escape.
11855 * Note RExC_npar is +1 from the actual number of parens*/
11856 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11857 && *RExC_parse != '8' && *RExC_parse != '9'))
11859 /* Probably a character specified in octal, e.g. \35 */
11864 /* at this point RExC_parse definitely points to a backref
11867 #ifdef RE_TRACK_PATTERN_OFFSETS
11868 char * const parse_start = RExC_parse - 1; /* MJD */
11870 while (isDIGIT(*RExC_parse))
11873 if (*RExC_parse != '}')
11874 vFAIL("Unterminated \\g{...} pattern");
11878 if (num > (I32)RExC_rx->nparens)
11879 vFAIL("Reference to nonexistent group");
11882 ret = reganode(pRExC_state,
11885 : (ASCII_FOLD_RESTRICTED)
11887 : (AT_LEAST_UNI_SEMANTICS)
11893 *flagp |= HASWIDTH;
11895 /* override incorrect value set in reganode MJD */
11896 Set_Node_Offset(ret, parse_start+1);
11897 Set_Node_Cur_Length(ret, parse_start);
11899 nextchar(pRExC_state);
11904 if (RExC_parse >= RExC_end)
11905 FAIL("Trailing \\");
11908 /* Do not generate "unrecognized" warnings here, we fall
11909 back into the quick-grab loop below */
11916 if (RExC_flags & RXf_PMf_EXTENDED) {
11917 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11918 if (RExC_parse < RExC_end)
11925 parse_start = RExC_parse - 1;
11934 #define MAX_NODE_STRING_SIZE 127
11935 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11937 U8 upper_parse = MAX_NODE_STRING_SIZE;
11938 U8 node_type = compute_EXACTish(pRExC_state);
11939 bool next_is_quantifier;
11940 char * oldp = NULL;
11942 /* We can convert EXACTF nodes to EXACTFU if they contain only
11943 * characters that match identically regardless of the target
11944 * string's UTF8ness. The reason to do this is that EXACTF is not
11945 * trie-able, EXACTFU is.
11947 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11948 * contain only above-Latin1 characters (hence must be in UTF8),
11949 * which don't participate in folds with Latin1-range characters,
11950 * as the latter's folds aren't known until runtime. (We don't
11951 * need to figure this out until pass 2) */
11952 bool maybe_exactfu = PASS2
11953 && (node_type == EXACTF || node_type == EXACTFL);
11955 /* If a folding node contains only code points that don't
11956 * participate in folds, it can be changed into an EXACT node,
11957 * which allows the optimizer more things to look for */
11960 ret = reg_node(pRExC_state, node_type);
11962 /* In pass1, folded, we use a temporary buffer instead of the
11963 * actual node, as the node doesn't exist yet */
11964 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11970 /* We do the EXACTFish to EXACT node only if folding. (And we
11971 * don't need to figure this out until pass 2) */
11972 maybe_exact = FOLD && PASS2;
11974 /* XXX The node can hold up to 255 bytes, yet this only goes to
11975 * 127. I (khw) do not know why. Keeping it somewhat less than
11976 * 255 allows us to not have to worry about overflow due to
11977 * converting to utf8 and fold expansion, but that value is
11978 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11979 * split up by this limit into a single one using the real max of
11980 * 255. Even at 127, this breaks under rare circumstances. If
11981 * folding, we do not want to split a node at a character that is a
11982 * non-final in a multi-char fold, as an input string could just
11983 * happen to want to match across the node boundary. The join
11984 * would solve that problem if the join actually happens. But a
11985 * series of more than two nodes in a row each of 127 would cause
11986 * the first join to succeed to get to 254, but then there wouldn't
11987 * be room for the next one, which could at be one of those split
11988 * multi-char folds. I don't know of any fool-proof solution. One
11989 * could back off to end with only a code point that isn't such a
11990 * non-final, but it is possible for there not to be any in the
11992 for (p = RExC_parse - 1;
11993 len < upper_parse && p < RExC_end;
11998 if (RExC_flags & RXf_PMf_EXTENDED)
11999 p = regpatws(pRExC_state, p,
12000 TRUE); /* means recognize comments */
12011 /* Literal Escapes Switch
12013 This switch is meant to handle escape sequences that
12014 resolve to a literal character.
12016 Every escape sequence that represents something
12017 else, like an assertion or a char class, is handled
12018 in the switch marked 'Special Escapes' above in this
12019 routine, but also has an entry here as anything that
12020 isn't explicitly mentioned here will be treated as
12021 an unescaped equivalent literal.
12024 switch ((U8)*++p) {
12025 /* These are all the special escapes. */
12026 case 'A': /* Start assertion */
12027 case 'b': case 'B': /* Word-boundary assertion*/
12028 case 'C': /* Single char !DANGEROUS! */
12029 case 'd': case 'D': /* digit class */
12030 case 'g': case 'G': /* generic-backref, pos assertion */
12031 case 'h': case 'H': /* HORIZWS */
12032 case 'k': case 'K': /* named backref, keep marker */
12033 case 'p': case 'P': /* Unicode property */
12034 case 'R': /* LNBREAK */
12035 case 's': case 'S': /* space class */
12036 case 'v': case 'V': /* VERTWS */
12037 case 'w': case 'W': /* word class */
12038 case 'X': /* eXtended Unicode "combining
12039 character sequence" */
12040 case 'z': case 'Z': /* End of line/string assertion */
12044 /* Anything after here is an escape that resolves to a
12045 literal. (Except digits, which may or may not)
12051 case 'N': /* Handle a single-code point named character. */
12052 /* The options cause it to fail if a multiple code
12053 * point sequence. Handle those in the switch() above
12055 RExC_parse = p + 1;
12056 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12062 if (*flagp & RESTART_UTF8)
12063 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12064 RExC_parse = p = oldp;
12068 if (ender > 0xff) {
12085 ender = ESC_NATIVE;
12095 const char* error_msg;
12097 bool valid = grok_bslash_o(&p,
12100 PASS2, /* out warnings */
12101 FALSE, /* not strict */
12102 TRUE, /* Output warnings
12107 RExC_parse = p; /* going to die anyway; point
12108 to exact spot of failure */
12112 if (PL_encoding && ender < 0x100) {
12113 goto recode_encoding;
12115 if (ender > 0xff) {
12122 UV result = UV_MAX; /* initialize to erroneous
12124 const char* error_msg;
12126 bool valid = grok_bslash_x(&p,
12129 PASS2, /* out warnings */
12130 FALSE, /* not strict */
12131 TRUE, /* Output warnings
12136 RExC_parse = p; /* going to die anyway; point
12137 to exact spot of failure */
12142 if (PL_encoding && ender < 0x100) {
12143 goto recode_encoding;
12145 if (ender > 0xff) {
12152 ender = grok_bslash_c(*p++, PASS2);
12154 case '8': case '9': /* must be a backreference */
12157 case '1': case '2': case '3':case '4':
12158 case '5': case '6': case '7':
12159 /* When we parse backslash escapes there is ambiguity
12160 * between backreferences and octal escapes. Any escape
12161 * from \1 - \9 is a backreference, any multi-digit
12162 * escape which does not start with 0 and which when
12163 * evaluated as decimal could refer to an already
12164 * parsed capture buffer is a backslash. Anything else
12167 * Note this implies that \118 could be interpreted as
12168 * 118 OR as "\11" . "8" depending on whether there
12169 * were 118 capture buffers defined already in the
12172 /* NOTE, RExC_npar is 1 more than the actual number of
12173 * parens we have seen so far, hence the < RExC_npar below. */
12175 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12176 { /* Not to be treated as an octal constant, go
12184 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12186 ender = grok_oct(p, &numlen, &flags, NULL);
12187 if (ender > 0xff) {
12191 if (PASS2 /* like \08, \178 */
12194 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12196 reg_warn_non_literal_string(
12198 form_short_octal_warning(p, numlen));
12201 if (PL_encoding && ender < 0x100)
12202 goto recode_encoding;
12205 if (! RExC_override_recoding) {
12206 SV* enc = PL_encoding;
12207 ender = reg_recode((const char)(U8)ender, &enc);
12209 ckWARNreg(p, "Invalid escape in the specified encoding");
12215 FAIL("Trailing \\");
12218 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12219 /* Include any { following the alpha to emphasize
12220 * that it could be part of an escape at some point
12222 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12223 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12225 goto normal_default;
12226 } /* End of switch on '\' */
12229 /* Currently we don't warn when the lbrace is at the start
12230 * of a construct. This catches it in the middle of a
12231 * literal string, or when its the first thing after
12232 * something like "\b" */
12234 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12236 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12239 default: /* A literal character */
12241 if (UTF8_IS_START(*p) && UTF) {
12243 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12244 &numlen, UTF8_ALLOW_DEFAULT);
12250 } /* End of switch on the literal */
12252 /* Here, have looked at the literal character and <ender>
12253 * contains its ordinal, <p> points to the character after it
12256 if ( RExC_flags & RXf_PMf_EXTENDED)
12257 p = regpatws(pRExC_state, p,
12258 TRUE); /* means recognize comments */
12260 /* If the next thing is a quantifier, it applies to this
12261 * character only, which means that this character has to be in
12262 * its own node and can't just be appended to the string in an
12263 * existing node, so if there are already other characters in
12264 * the node, close the node with just them, and set up to do
12265 * this character again next time through, when it will be the
12266 * only thing in its new node */
12267 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12273 if (! FOLD /* The simple case, just append the literal */
12274 || (LOC /* Also don't fold for tricky chars under /l */
12275 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12278 const STRLEN unilen = reguni(pRExC_state, ender, s);
12284 /* The loop increments <len> each time, as all but this
12285 * path (and one other) through it add a single byte to
12286 * the EXACTish node. But this one has changed len to
12287 * be the correct final value, so subtract one to
12288 * cancel out the increment that follows */
12292 REGC((char)ender, s++);
12295 /* Can get here if folding only if is one of the /l
12296 * characters whose fold depends on the locale. The
12297 * occurrence of any of these indicate that we can't
12298 * simplify things */
12300 maybe_exact = FALSE;
12301 maybe_exactfu = FALSE;
12306 /* See comments for join_exact() as to why we fold this
12307 * non-UTF at compile time */
12308 || (node_type == EXACTFU
12309 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12311 /* Here, are folding and are not UTF-8 encoded; therefore
12312 * the character must be in the range 0-255, and is not /l
12313 * (Not /l because we already handled these under /l in
12314 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12315 if (IS_IN_SOME_FOLD_L1(ender)) {
12316 maybe_exact = FALSE;
12318 /* See if the character's fold differs between /d and
12319 * /u. This includes the multi-char fold SHARP S to
12322 && (PL_fold[ender] != PL_fold_latin1[ender]
12323 || ender == LATIN_SMALL_LETTER_SHARP_S
12325 && isALPHA_FOLD_EQ(ender, 's')
12326 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12328 maybe_exactfu = FALSE;
12332 /* Even when folding, we store just the input character, as
12333 * we have an array that finds its fold quickly */
12334 *(s++) = (char) ender;
12336 else { /* FOLD and UTF */
12337 /* Unlike the non-fold case, we do actually have to
12338 * calculate the results here in pass 1. This is for two
12339 * reasons, the folded length may be longer than the
12340 * unfolded, and we have to calculate how many EXACTish
12341 * nodes it will take; and we may run out of room in a node
12342 * in the middle of a potential multi-char fold, and have
12343 * to back off accordingly. (Hence we can't use REGC for
12344 * the simple case just below.) */
12347 if (isASCII(ender)) {
12348 folded = toFOLD(ender);
12349 *(s)++ = (U8) folded;
12354 folded = _to_uni_fold_flags(
12358 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12359 ? FOLD_FLAGS_NOMIX_ASCII
12363 /* The loop increments <len> each time, as all but this
12364 * path (and one other) through it add a single byte to
12365 * the EXACTish node. But this one has changed len to
12366 * be the correct final value, so subtract one to
12367 * cancel out the increment that follows */
12368 len += foldlen - 1;
12370 /* If this node only contains non-folding code points so
12371 * far, see if this new one is also non-folding */
12373 if (folded != ender) {
12374 maybe_exact = FALSE;
12377 /* Here the fold is the original; we have to check
12378 * further to see if anything folds to it */
12379 if (_invlist_contains_cp(PL_utf8_foldable,
12382 maybe_exact = FALSE;
12389 if (next_is_quantifier) {
12391 /* Here, the next input is a quantifier, and to get here,
12392 * the current character is the only one in the node.
12393 * Also, here <len> doesn't include the final byte for this
12399 } /* End of loop through literal characters */
12401 /* Here we have either exhausted the input or ran out of room in
12402 * the node. (If we encountered a character that can't be in the
12403 * node, transfer is made directly to <loopdone>, and so we
12404 * wouldn't have fallen off the end of the loop.) In the latter
12405 * case, we artificially have to split the node into two, because
12406 * we just don't have enough space to hold everything. This
12407 * creates a problem if the final character participates in a
12408 * multi-character fold in the non-final position, as a match that
12409 * should have occurred won't, due to the way nodes are matched,
12410 * and our artificial boundary. So back off until we find a non-
12411 * problematic character -- one that isn't at the beginning or
12412 * middle of such a fold. (Either it doesn't participate in any
12413 * folds, or appears only in the final position of all the folds it
12414 * does participate in.) A better solution with far fewer false
12415 * positives, and that would fill the nodes more completely, would
12416 * be to actually have available all the multi-character folds to
12417 * test against, and to back-off only far enough to be sure that
12418 * this node isn't ending with a partial one. <upper_parse> is set
12419 * further below (if we need to reparse the node) to include just
12420 * up through that final non-problematic character that this code
12421 * identifies, so when it is set to less than the full node, we can
12422 * skip the rest of this */
12423 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12425 const STRLEN full_len = len;
12427 assert(len >= MAX_NODE_STRING_SIZE);
12429 /* Here, <s> points to the final byte of the final character.
12430 * Look backwards through the string until find a non-
12431 * problematic character */
12435 /* This has no multi-char folds to non-UTF characters */
12436 if (ASCII_FOLD_RESTRICTED) {
12440 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12444 if (! PL_NonL1NonFinalFold) {
12445 PL_NonL1NonFinalFold = _new_invlist_C_array(
12446 NonL1_Perl_Non_Final_Folds_invlist);
12449 /* Point to the first byte of the final character */
12450 s = (char *) utf8_hop((U8 *) s, -1);
12452 while (s >= s0) { /* Search backwards until find
12453 non-problematic char */
12454 if (UTF8_IS_INVARIANT(*s)) {
12456 /* There are no ascii characters that participate
12457 * in multi-char folds under /aa. In EBCDIC, the
12458 * non-ascii invariants are all control characters,
12459 * so don't ever participate in any folds. */
12460 if (ASCII_FOLD_RESTRICTED
12461 || ! IS_NON_FINAL_FOLD(*s))
12466 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12467 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12473 else if (! _invlist_contains_cp(
12474 PL_NonL1NonFinalFold,
12475 valid_utf8_to_uvchr((U8 *) s, NULL)))
12480 /* Here, the current character is problematic in that
12481 * it does occur in the non-final position of some
12482 * fold, so try the character before it, but have to
12483 * special case the very first byte in the string, so
12484 * we don't read outside the string */
12485 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12486 } /* End of loop backwards through the string */
12488 /* If there were only problematic characters in the string,
12489 * <s> will point to before s0, in which case the length
12490 * should be 0, otherwise include the length of the
12491 * non-problematic character just found */
12492 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12495 /* Here, have found the final character, if any, that is
12496 * non-problematic as far as ending the node without splitting
12497 * it across a potential multi-char fold. <len> contains the
12498 * number of bytes in the node up-to and including that
12499 * character, or is 0 if there is no such character, meaning
12500 * the whole node contains only problematic characters. In
12501 * this case, give up and just take the node as-is. We can't
12506 /* If the node ends in an 's' we make sure it stays EXACTF,
12507 * as if it turns into an EXACTFU, it could later get
12508 * joined with another 's' that would then wrongly match
12510 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12512 maybe_exactfu = FALSE;
12516 /* Here, the node does contain some characters that aren't
12517 * problematic. If one such is the final character in the
12518 * node, we are done */
12519 if (len == full_len) {
12522 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12524 /* If the final character is problematic, but the
12525 * penultimate is not, back-off that last character to
12526 * later start a new node with it */
12531 /* Here, the final non-problematic character is earlier
12532 * in the input than the penultimate character. What we do
12533 * is reparse from the beginning, going up only as far as
12534 * this final ok one, thus guaranteeing that the node ends
12535 * in an acceptable character. The reason we reparse is
12536 * that we know how far in the character is, but we don't
12537 * know how to correlate its position with the input parse.
12538 * An alternate implementation would be to build that
12539 * correlation as we go along during the original parse,
12540 * but that would entail extra work for every node, whereas
12541 * this code gets executed only when the string is too
12542 * large for the node, and the final two characters are
12543 * problematic, an infrequent occurrence. Yet another
12544 * possible strategy would be to save the tail of the
12545 * string, and the next time regatom is called, initialize
12546 * with that. The problem with this is that unless you
12547 * back off one more character, you won't be guaranteed
12548 * regatom will get called again, unless regbranch,
12549 * regpiece ... are also changed. If you do back off that
12550 * extra character, so that there is input guaranteed to
12551 * force calling regatom, you can't handle the case where
12552 * just the first character in the node is acceptable. I
12553 * (khw) decided to try this method which doesn't have that
12554 * pitfall; if performance issues are found, we can do a
12555 * combination of the current approach plus that one */
12561 } /* End of verifying node ends with an appropriate char */
12563 loopdone: /* Jumped to when encounters something that shouldn't be in
12566 /* I (khw) don't know if you can get here with zero length, but the
12567 * old code handled this situation by creating a zero-length EXACT
12568 * node. Might as well be NOTHING instead */
12574 /* If 'maybe_exact' is still set here, means there are no
12575 * code points in the node that participate in folds;
12576 * similarly for 'maybe_exactfu' and code points that match
12577 * differently depending on UTF8ness of the target string
12578 * (for /u), or depending on locale for /l */
12582 else if (maybe_exactfu) {
12586 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12587 FALSE /* Don't look to see if could
12588 be turned into an EXACT
12589 node, as we have already
12594 RExC_parse = p - 1;
12595 Set_Node_Cur_Length(ret, parse_start);
12596 nextchar(pRExC_state);
12598 /* len is STRLEN which is unsigned, need to copy to signed */
12601 vFAIL("Internal disaster");
12604 } /* End of label 'defchar:' */
12606 } /* End of giant switch on input character */
12612 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12614 /* Returns the next non-pattern-white space, non-comment character (the
12615 * latter only if 'recognize_comment is true) in the string p, which is
12616 * ended by RExC_end. See also reg_skipcomment */
12617 const char *e = RExC_end;
12619 PERL_ARGS_ASSERT_REGPATWS;
12623 if ((len = is_PATWS_safe(p, e, UTF))) {
12626 else if (recognize_comment && *p == '#') {
12627 p = reg_skipcomment(pRExC_state, p);
12636 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12638 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12639 * sets up the bitmap and any flags, removing those code points from the
12640 * inversion list, setting it to NULL should it become completely empty */
12642 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12643 assert(PL_regkind[OP(node)] == ANYOF);
12645 ANYOF_BITMAP_ZERO(node);
12646 if (*invlist_ptr) {
12648 /* This gets set if we actually need to modify things */
12649 bool change_invlist = FALSE;
12653 /* Start looking through *invlist_ptr */
12654 invlist_iterinit(*invlist_ptr);
12655 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12659 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12660 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12662 else if (end >= NUM_ANYOF_CODE_POINTS) {
12663 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12666 /* Quit if are above what we should change */
12667 if (start >= NUM_ANYOF_CODE_POINTS) {
12671 change_invlist = TRUE;
12673 /* Set all the bits in the range, up to the max that we are doing */
12674 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12676 : NUM_ANYOF_CODE_POINTS - 1;
12677 for (i = start; i <= (int) high; i++) {
12678 if (! ANYOF_BITMAP_TEST(node, i)) {
12679 ANYOF_BITMAP_SET(node, i);
12683 invlist_iterfinish(*invlist_ptr);
12685 /* Done with loop; remove any code points that are in the bitmap from
12686 * *invlist_ptr; similarly for code points above the bitmap if we have
12687 * a flag to match all of them anyways */
12688 if (change_invlist) {
12689 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12691 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12692 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12695 /* If have completely emptied it, remove it completely */
12696 if (_invlist_len(*invlist_ptr) == 0) {
12697 SvREFCNT_dec_NN(*invlist_ptr);
12698 *invlist_ptr = NULL;
12703 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12704 Character classes ([:foo:]) can also be negated ([:^foo:]).
12705 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12706 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12707 but trigger failures because they are currently unimplemented. */
12709 #define POSIXCC_DONE(c) ((c) == ':')
12710 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12711 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12713 PERL_STATIC_INLINE I32
12714 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12716 I32 namedclass = OOB_NAMEDCLASS;
12718 PERL_ARGS_ASSERT_REGPPOSIXCC;
12720 if (value == '[' && RExC_parse + 1 < RExC_end &&
12721 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12722 POSIXCC(UCHARAT(RExC_parse)))
12724 const char c = UCHARAT(RExC_parse);
12725 char* const s = RExC_parse++;
12727 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12729 if (RExC_parse == RExC_end) {
12732 /* Try to give a better location for the error (than the end of
12733 * the string) by looking for the matching ']' */
12735 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12738 vFAIL2("Unmatched '%c' in POSIX class", c);
12740 /* Grandfather lone [:, [=, [. */
12744 const char* const t = RExC_parse++; /* skip over the c */
12747 if (UCHARAT(RExC_parse) == ']') {
12748 const char *posixcc = s + 1;
12749 RExC_parse++; /* skip over the ending ] */
12752 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12753 const I32 skip = t - posixcc;
12755 /* Initially switch on the length of the name. */
12758 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12759 this is the Perl \w
12761 namedclass = ANYOF_WORDCHAR;
12764 /* Names all of length 5. */
12765 /* alnum alpha ascii blank cntrl digit graph lower
12766 print punct space upper */
12767 /* Offset 4 gives the best switch position. */
12768 switch (posixcc[4]) {
12770 if (memEQ(posixcc, "alph", 4)) /* alpha */
12771 namedclass = ANYOF_ALPHA;
12774 if (memEQ(posixcc, "spac", 4)) /* space */
12775 namedclass = ANYOF_PSXSPC;
12778 if (memEQ(posixcc, "grap", 4)) /* graph */
12779 namedclass = ANYOF_GRAPH;
12782 if (memEQ(posixcc, "asci", 4)) /* ascii */
12783 namedclass = ANYOF_ASCII;
12786 if (memEQ(posixcc, "blan", 4)) /* blank */
12787 namedclass = ANYOF_BLANK;
12790 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12791 namedclass = ANYOF_CNTRL;
12794 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12795 namedclass = ANYOF_ALPHANUMERIC;
12798 if (memEQ(posixcc, "lowe", 4)) /* lower */
12799 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12800 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12801 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12804 if (memEQ(posixcc, "digi", 4)) /* digit */
12805 namedclass = ANYOF_DIGIT;
12806 else if (memEQ(posixcc, "prin", 4)) /* print */
12807 namedclass = ANYOF_PRINT;
12808 else if (memEQ(posixcc, "punc", 4)) /* punct */
12809 namedclass = ANYOF_PUNCT;
12814 if (memEQ(posixcc, "xdigit", 6))
12815 namedclass = ANYOF_XDIGIT;
12819 if (namedclass == OOB_NAMEDCLASS)
12821 "POSIX class [:%"UTF8f":] unknown",
12822 UTF8fARG(UTF, t - s - 1, s + 1));
12824 /* The #defines are structured so each complement is +1 to
12825 * the normal one */
12829 assert (posixcc[skip] == ':');
12830 assert (posixcc[skip+1] == ']');
12831 } else if (!SIZE_ONLY) {
12832 /* [[=foo=]] and [[.foo.]] are still future. */
12834 /* adjust RExC_parse so the warning shows after
12835 the class closes */
12836 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12838 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12841 /* Maternal grandfather:
12842 * "[:" ending in ":" but not in ":]" */
12844 vFAIL("Unmatched '[' in POSIX class");
12847 /* Grandfather lone [:, [=, [. */
12857 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12859 /* This applies some heuristics at the current parse position (which should
12860 * be at a '[') to see if what follows might be intended to be a [:posix:]
12861 * class. It returns true if it really is a posix class, of course, but it
12862 * also can return true if it thinks that what was intended was a posix
12863 * class that didn't quite make it.
12865 * It will return true for
12867 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12868 * ')' indicating the end of the (?[
12869 * [:any garbage including %^&$ punctuation:]
12871 * This is designed to be called only from S_handle_regex_sets; it could be
12872 * easily adapted to be called from the spot at the beginning of regclass()
12873 * that checks to see in a normal bracketed class if the surrounding []
12874 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12875 * change long-standing behavior, so I (khw) didn't do that */
12876 char* p = RExC_parse + 1;
12877 char first_char = *p;
12879 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12881 assert(*(p - 1) == '[');
12883 if (! POSIXCC(first_char)) {
12888 while (p < RExC_end && isWORDCHAR(*p)) p++;
12890 if (p >= RExC_end) {
12894 if (p - RExC_parse > 2 /* Got at least 1 word character */
12895 && (*p == first_char
12896 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12901 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12904 && p - RExC_parse > 2 /* [:] evaluates to colon;
12905 [::] is a bad posix class. */
12906 && first_char == *(p - 1));
12910 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12911 I32 *flagp, U32 depth,
12912 char * const oregcomp_parse)
12914 /* Handle the (?[...]) construct to do set operations */
12917 UV start, end; /* End points of code point ranges */
12919 char *save_end, *save_parse;
12924 const bool save_fold = FOLD;
12926 GET_RE_DEBUG_FLAGS_DECL;
12928 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12931 vFAIL("(?[...]) not valid in locale");
12933 RExC_uni_semantics = 1;
12935 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12936 * (such as EXACT). Thus we can skip most everything if just sizing. We
12937 * call regclass to handle '[]' so as to not have to reinvent its parsing
12938 * rules here (throwing away the size it computes each time). And, we exit
12939 * upon an unescaped ']' that isn't one ending a regclass. To do both
12940 * these things, we need to realize that something preceded by a backslash
12941 * is escaped, so we have to keep track of backslashes */
12943 Perl_ck_warner_d(aTHX_
12944 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12945 "The regex_sets feature is experimental" REPORT_LOCATION,
12946 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12948 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12949 RExC_precomp + (RExC_parse - RExC_precomp)));
12952 UV depth = 0; /* how many nested (?[...]) constructs */
12954 while (RExC_parse < RExC_end) {
12955 SV* current = NULL;
12956 RExC_parse = regpatws(pRExC_state, RExC_parse,
12957 TRUE); /* means recognize comments */
12958 switch (*RExC_parse) {
12960 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12965 /* Skip the next byte (which could cause us to end up in
12966 * the middle of a UTF-8 character, but since none of those
12967 * are confusable with anything we currently handle in this
12968 * switch (invariants all), it's safe. We'll just hit the
12969 * default: case next time and keep on incrementing until
12970 * we find one of the invariants we do handle. */
12975 /* If this looks like it is a [:posix:] class, leave the
12976 * parse pointer at the '[' to fool regclass() into
12977 * thinking it is part of a '[[:posix:]]'. That function
12978 * will use strict checking to force a syntax error if it
12979 * doesn't work out to a legitimate class */
12980 bool is_posix_class
12981 = could_it_be_a_POSIX_class(pRExC_state);
12982 if (! is_posix_class) {
12986 /* regclass() can only return RESTART_UTF8 if multi-char
12987 folds are allowed. */
12988 if (!regclass(pRExC_state, flagp,depth+1,
12989 is_posix_class, /* parse the whole char
12990 class only if not a
12992 FALSE, /* don't allow multi-char folds */
12993 TRUE, /* silence non-portable warnings. */
12995 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12998 /* function call leaves parse pointing to the ']', except
12999 * if we faked it */
13000 if (is_posix_class) {
13004 SvREFCNT_dec(current); /* In case it returned something */
13009 if (depth--) break;
13011 if (RExC_parse < RExC_end
13012 && *RExC_parse == ')')
13014 node = reganode(pRExC_state, ANYOF, 0);
13015 RExC_size += ANYOF_SKIP;
13016 nextchar(pRExC_state);
13017 Set_Node_Length(node,
13018 RExC_parse - oregcomp_parse + 1); /* MJD */
13027 FAIL("Syntax error in (?[...])");
13030 /* Pass 2 only after this. Everything in this construct is a
13031 * metacharacter. Operands begin with either a '\' (for an escape
13032 * sequence), or a '[' for a bracketed character class. Any other
13033 * character should be an operator, or parenthesis for grouping. Both
13034 * types of operands are handled by calling regclass() to parse them. It
13035 * is called with a parameter to indicate to return the computed inversion
13036 * list. The parsing here is implemented via a stack. Each entry on the
13037 * stack is a single character representing one of the operators, or the
13038 * '('; or else a pointer to an operand inversion list. */
13040 #define IS_OPERAND(a) (! SvIOK(a))
13042 /* The stack starts empty. It is a syntax error if the first thing parsed
13043 * is a binary operator; everything else is pushed on the stack. When an
13044 * operand is parsed, the top of the stack is examined. If it is a binary
13045 * operator, the item before it should be an operand, and both are replaced
13046 * by the result of doing that operation on the new operand and the one on
13047 * the stack. Thus a sequence of binary operands is reduced to a single
13048 * one before the next one is parsed.
13050 * A unary operator may immediately follow a binary in the input, for
13053 * When an operand is parsed and the top of the stack is a unary operator,
13054 * the operation is performed, and then the stack is rechecked to see if
13055 * this new operand is part of a binary operation; if so, it is handled as
13058 * A '(' is simply pushed on the stack; it is valid only if the stack is
13059 * empty, or the top element of the stack is an operator or another '('
13060 * (for which the parenthesized expression will become an operand). By the
13061 * time the corresponding ')' is parsed everything in between should have
13062 * been parsed and evaluated to a single operand (or else is a syntax
13063 * error), and is handled as a regular operand */
13065 sv_2mortal((SV *)(stack = newAV()));
13067 while (RExC_parse < RExC_end) {
13068 I32 top_index = av_tindex(stack);
13070 SV* current = NULL;
13072 /* Skip white space */
13073 RExC_parse = regpatws(pRExC_state, RExC_parse,
13074 TRUE /* means recognize comments */ );
13075 if (RExC_parse >= RExC_end) {
13076 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13078 if ((curchar = UCHARAT(RExC_parse)) == ']') {
13085 if (av_tindex(stack) >= 0 /* This makes sure that we can
13086 safely subtract 1 from
13087 RExC_parse in the next clause.
13088 If we have something on the
13089 stack, we have parsed something
13091 && UCHARAT(RExC_parse - 1) == '('
13092 && RExC_parse < RExC_end)
13094 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13095 * This happens when we have some thing like
13097 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13099 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13101 * Here we would be handling the interpolated
13102 * '$thai_or_lao'. We handle this by a recursive call to
13103 * ourselves which returns the inversion list the
13104 * interpolated expression evaluates to. We use the flags
13105 * from the interpolated pattern. */
13106 U32 save_flags = RExC_flags;
13107 const char * const save_parse = ++RExC_parse;
13109 parse_lparen_question_flags(pRExC_state);
13111 if (RExC_parse == save_parse /* Makes sure there was at
13112 least one flag (or this
13113 embedding wasn't compiled)
13115 || RExC_parse >= RExC_end - 4
13116 || UCHARAT(RExC_parse) != ':'
13117 || UCHARAT(++RExC_parse) != '('
13118 || UCHARAT(++RExC_parse) != '?'
13119 || UCHARAT(++RExC_parse) != '[')
13122 /* In combination with the above, this moves the
13123 * pointer to the point just after the first erroneous
13124 * character (or if there are no flags, to where they
13125 * should have been) */
13126 if (RExC_parse >= RExC_end - 4) {
13127 RExC_parse = RExC_end;
13129 else if (RExC_parse != save_parse) {
13130 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13132 vFAIL("Expecting '(?flags:(?[...'");
13135 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13136 depth+1, oregcomp_parse);
13138 /* Here, 'current' contains the embedded expression's
13139 * inversion list, and RExC_parse points to the trailing
13140 * ']'; the next character should be the ')' which will be
13141 * paired with the '(' that has been put on the stack, so
13142 * the whole embedded expression reduces to '(operand)' */
13145 RExC_flags = save_flags;
13146 goto handle_operand;
13151 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13152 vFAIL("Unexpected character");
13155 /* regclass() can only return RESTART_UTF8 if multi-char
13156 folds are allowed. */
13157 if (!regclass(pRExC_state, flagp,depth+1,
13158 TRUE, /* means parse just the next thing */
13159 FALSE, /* don't allow multi-char folds */
13160 FALSE, /* don't silence non-portable warnings. */
13162 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13164 /* regclass() will return with parsing just the \ sequence,
13165 * leaving the parse pointer at the next thing to parse */
13167 goto handle_operand;
13169 case '[': /* Is a bracketed character class */
13171 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13173 if (! is_posix_class) {
13177 /* regclass() can only return RESTART_UTF8 if multi-char
13178 folds are allowed. */
13179 if(!regclass(pRExC_state, flagp,depth+1,
13180 is_posix_class, /* parse the whole char class
13181 only if not a posix class */
13182 FALSE, /* don't allow multi-char folds */
13183 FALSE, /* don't silence non-portable warnings. */
13185 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13187 /* function call leaves parse pointing to the ']', except if we
13189 if (is_posix_class) {
13193 goto handle_operand;
13202 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13203 || ! IS_OPERAND(*top_ptr))
13206 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13208 av_push(stack, newSVuv(curchar));
13212 av_push(stack, newSVuv(curchar));
13216 if (top_index >= 0) {
13217 top_ptr = av_fetch(stack, top_index, FALSE);
13219 if (IS_OPERAND(*top_ptr)) {
13221 vFAIL("Unexpected '(' with no preceding operator");
13224 av_push(stack, newSVuv(curchar));
13231 || ! (current = av_pop(stack))
13232 || ! IS_OPERAND(current)
13233 || ! (lparen = av_pop(stack))
13234 || IS_OPERAND(lparen)
13235 || SvUV(lparen) != '(')
13237 SvREFCNT_dec(current);
13239 vFAIL("Unexpected ')'");
13242 SvREFCNT_dec_NN(lparen);
13249 /* Here, we have an operand to process, in 'current' */
13251 if (top_index < 0) { /* Just push if stack is empty */
13252 av_push(stack, current);
13255 SV* top = av_pop(stack);
13257 char current_operator;
13259 if (IS_OPERAND(top)) {
13260 SvREFCNT_dec_NN(top);
13261 SvREFCNT_dec_NN(current);
13262 vFAIL("Operand with no preceding operator");
13264 current_operator = (char) SvUV(top);
13265 switch (current_operator) {
13266 case '(': /* Push the '(' back on followed by the new
13268 av_push(stack, top);
13269 av_push(stack, current);
13270 SvREFCNT_inc(top); /* Counters the '_dec' done
13271 just after the 'break', so
13272 it doesn't get wrongly freed
13277 _invlist_invert(current);
13279 /* Unlike binary operators, the top of the stack,
13280 * now that this unary one has been popped off, may
13281 * legally be an operator, and we now have operand
13284 SvREFCNT_dec_NN(top);
13285 goto handle_operand;
13288 prev = av_pop(stack);
13289 _invlist_intersection(prev,
13292 av_push(stack, current);
13297 prev = av_pop(stack);
13298 _invlist_union(prev, current, ¤t);
13299 av_push(stack, current);
13303 prev = av_pop(stack);;
13304 _invlist_subtract(prev, current, ¤t);
13305 av_push(stack, current);
13308 case '^': /* The union minus the intersection */
13314 prev = av_pop(stack);
13315 _invlist_union(prev, current, &u);
13316 _invlist_intersection(prev, current, &i);
13317 /* _invlist_subtract will overwrite current
13318 without freeing what it already contains */
13320 _invlist_subtract(u, i, ¤t);
13321 av_push(stack, current);
13322 SvREFCNT_dec_NN(i);
13323 SvREFCNT_dec_NN(u);
13324 SvREFCNT_dec_NN(element);
13329 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13331 SvREFCNT_dec_NN(top);
13332 SvREFCNT_dec(prev);
13336 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13339 if (av_tindex(stack) < 0 /* Was empty */
13340 || ((final = av_pop(stack)) == NULL)
13341 || ! IS_OPERAND(final)
13342 || av_tindex(stack) >= 0) /* More left on stack */
13344 vFAIL("Incomplete expression within '(?[ ])'");
13347 /* Here, 'final' is the resultant inversion list from evaluating the
13348 * expression. Return it if so requested */
13349 if (return_invlist) {
13350 *return_invlist = final;
13354 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13355 * expecting a string of ranges and individual code points */
13356 invlist_iterinit(final);
13357 result_string = newSVpvs("");
13358 while (invlist_iternext(final, &start, &end)) {
13359 if (start == end) {
13360 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13363 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13368 save_parse = RExC_parse;
13369 RExC_parse = SvPV(result_string, len);
13370 save_end = RExC_end;
13371 RExC_end = RExC_parse + len;
13373 /* We turn off folding around the call, as the class we have constructed
13374 * already has all folding taken into consideration, and we don't want
13375 * regclass() to add to that */
13376 RExC_flags &= ~RXf_PMf_FOLD;
13377 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13379 node = regclass(pRExC_state, flagp,depth+1,
13380 FALSE, /* means parse the whole char class */
13381 FALSE, /* don't allow multi-char folds */
13382 TRUE, /* silence non-portable warnings. The above may very
13383 well have generated non-portable code points, but
13384 they're valid on this machine */
13387 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13390 RExC_flags |= RXf_PMf_FOLD;
13392 RExC_parse = save_parse + 1;
13393 RExC_end = save_end;
13394 SvREFCNT_dec_NN(final);
13395 SvREFCNT_dec_NN(result_string);
13397 nextchar(pRExC_state);
13398 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13404 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13406 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13407 * innocent-looking character class, like /[ks]/i won't have to go out to
13408 * disk to find the possible matches.
13410 * This should be called only for a Latin1-range code points, cp, which is
13411 * known to be involved in a simple fold with other code points above
13412 * Latin1. It would give false results if /aa has been specified.
13413 * Multi-char folds are outside the scope of this, and must be handled
13416 * XXX It would be better to generate these via regen, in case a new
13417 * version of the Unicode standard adds new mappings, though that is not
13418 * really likely, and may be caught by the default: case of the switch
13421 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13423 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13429 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13433 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13436 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13437 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13439 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13440 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13441 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13443 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13444 *invlist = add_cp_to_invlist(*invlist,
13445 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13447 case LATIN_SMALL_LETTER_SHARP_S:
13448 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13451 /* Use deprecated warning to increase the chances of this being
13454 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13461 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13463 /* This adds the string scalar <multi_string> to the array
13464 * <multi_char_matches>. <multi_string> is known to have exactly
13465 * <cp_count> code points in it. This is used when constructing a
13466 * bracketed character class and we find something that needs to match more
13467 * than a single character.
13469 * <multi_char_matches> is actually an array of arrays. Each top-level
13470 * element is an array that contains all the strings known so far that are
13471 * the same length. And that length (in number of code points) is the same
13472 * as the index of the top-level array. Hence, the [2] element is an
13473 * array, each element thereof is a string containing TWO code points;
13474 * while element [3] is for strings of THREE characters, and so on. Since
13475 * this is for multi-char strings there can never be a [0] nor [1] element.
13477 * When we rewrite the character class below, we will do so such that the
13478 * longest strings are written first, so that it prefers the longest
13479 * matching strings first. This is done even if it turns out that any
13480 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13481 * Christiansen has agreed that this is ok. This makes the test for the
13482 * ligature 'ffi' come before the test for 'ff', for example */
13485 AV** this_array_ptr;
13487 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13489 if (! multi_char_matches) {
13490 multi_char_matches = newAV();
13493 if (av_exists(multi_char_matches, cp_count)) {
13494 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13495 this_array = *this_array_ptr;
13498 this_array = newAV();
13499 av_store(multi_char_matches, cp_count,
13502 av_push(this_array, multi_string);
13504 return multi_char_matches;
13507 /* The names of properties whose definitions are not known at compile time are
13508 * stored in this SV, after a constant heading. So if the length has been
13509 * changed since initialization, then there is a run-time definition. */
13510 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13511 (SvCUR(listsv) != initial_listsv_len)
13514 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13515 const bool stop_at_1, /* Just parse the next thing, don't
13516 look for a full character class */
13517 bool allow_multi_folds,
13518 const bool silence_non_portable, /* Don't output warnings
13521 SV** ret_invlist) /* Return an inversion list, not a node */
13523 /* parse a bracketed class specification. Most of these will produce an
13524 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13525 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13526 * under /i with multi-character folds: it will be rewritten following the
13527 * paradigm of this example, where the <multi-fold>s are characters which
13528 * fold to multiple character sequences:
13529 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13530 * gets effectively rewritten as:
13531 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13532 * reg() gets called (recursively) on the rewritten version, and this
13533 * function will return what it constructs. (Actually the <multi-fold>s
13534 * aren't physically removed from the [abcdefghi], it's just that they are
13535 * ignored in the recursion by means of a flag:
13536 * <RExC_in_multi_char_class>.)
13538 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13539 * characters, with the corresponding bit set if that character is in the
13540 * list. For characters above this, a range list or swash is used. There
13541 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13542 * determinable at compile time
13544 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13545 * to be restarted. This can only happen if ret_invlist is non-NULL.
13548 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13550 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13553 IV namedclass = OOB_NAMEDCLASS;
13554 char *rangebegin = NULL;
13555 bool need_class = 0;
13557 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13558 than just initialized. */
13559 SV* properties = NULL; /* Code points that match \p{} \P{} */
13560 SV* posixes = NULL; /* Code points that match classes like [:word:],
13561 extended beyond the Latin1 range. These have to
13562 be kept separate from other code points for much
13563 of this function because their handling is
13564 different under /i, and for most classes under
13566 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13567 separate for a while from the non-complemented
13568 versions because of complications with /d
13570 UV element_count = 0; /* Number of distinct elements in the class.
13571 Optimizations may be possible if this is tiny */
13572 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13573 character; used under /i */
13575 char * stop_ptr = RExC_end; /* where to stop parsing */
13576 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13578 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13580 /* Unicode properties are stored in a swash; this holds the current one
13581 * being parsed. If this swash is the only above-latin1 component of the
13582 * character class, an optimization is to pass it directly on to the
13583 * execution engine. Otherwise, it is set to NULL to indicate that there
13584 * are other things in the class that have to be dealt with at execution
13586 SV* swash = NULL; /* Code points that match \p{} \P{} */
13588 /* Set if a component of this character class is user-defined; just passed
13589 * on to the engine */
13590 bool has_user_defined_property = FALSE;
13592 /* inversion list of code points this node matches only when the target
13593 * string is in UTF-8. (Because is under /d) */
13594 SV* depends_list = NULL;
13596 /* Inversion list of code points this node matches regardless of things
13597 * like locale, folding, utf8ness of the target string */
13598 SV* cp_list = NULL;
13600 /* Like cp_list, but code points on this list need to be checked for things
13601 * that fold to/from them under /i */
13602 SV* cp_foldable_list = NULL;
13604 /* Like cp_list, but code points on this list are valid only when the
13605 * runtime locale is UTF-8 */
13606 SV* only_utf8_locale_list = NULL;
13609 /* In a range, counts how many 0-2 of the ends of it came from literals,
13610 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13611 UV literal_endpoint = 0;
13613 bool invert = FALSE; /* Is this class to be complemented */
13615 bool warn_super = ALWAYS_WARN_SUPER;
13617 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13618 case we need to change the emitted regop to an EXACT. */
13619 const char * orig_parse = RExC_parse;
13620 const SSize_t orig_size = RExC_size;
13621 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13622 GET_RE_DEBUG_FLAGS_DECL;
13624 PERL_ARGS_ASSERT_REGCLASS;
13626 PERL_UNUSED_ARG(depth);
13629 DEBUG_PARSE("clas");
13631 /* Assume we are going to generate an ANYOF node. */
13632 ret = reganode(pRExC_state, ANYOF, 0);
13635 RExC_size += ANYOF_SKIP;
13636 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13639 ANYOF_FLAGS(ret) = 0;
13641 RExC_emit += ANYOF_SKIP;
13642 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13643 initial_listsv_len = SvCUR(listsv);
13644 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13648 RExC_parse = regpatws(pRExC_state, RExC_parse,
13649 FALSE /* means don't recognize comments */ );
13652 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13655 allow_multi_folds = FALSE;
13658 RExC_parse = regpatws(pRExC_state, RExC_parse,
13659 FALSE /* means don't recognize comments */ );
13663 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13664 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13665 const char *s = RExC_parse;
13666 const char c = *s++;
13668 while (isWORDCHAR(*s))
13670 if (*s && c == *s && s[1] == ']') {
13671 SAVEFREESV(RExC_rx_sv);
13673 "POSIX syntax [%c %c] belongs inside character classes",
13675 (void)ReREFCNT_inc(RExC_rx_sv);
13679 /* If the caller wants us to just parse a single element, accomplish this
13680 * by faking the loop ending condition */
13681 if (stop_at_1 && RExC_end > RExC_parse) {
13682 stop_ptr = RExC_parse + 1;
13685 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13686 if (UCHARAT(RExC_parse) == ']')
13687 goto charclassloop;
13690 if (RExC_parse >= stop_ptr) {
13695 RExC_parse = regpatws(pRExC_state, RExC_parse,
13696 FALSE /* means don't recognize comments */ );
13699 if (UCHARAT(RExC_parse) == ']') {
13705 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13706 save_value = value;
13707 save_prevvalue = prevvalue;
13710 rangebegin = RExC_parse;
13714 value = utf8n_to_uvchr((U8*)RExC_parse,
13715 RExC_end - RExC_parse,
13716 &numlen, UTF8_ALLOW_DEFAULT);
13717 RExC_parse += numlen;
13720 value = UCHARAT(RExC_parse++);
13723 && RExC_parse < RExC_end
13724 && POSIXCC(UCHARAT(RExC_parse)))
13726 namedclass = regpposixcc(pRExC_state, value, strict);
13728 else if (value != '\\') {
13730 literal_endpoint++;
13734 /* Is a backslash; get the code point of the char after it */
13735 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13736 value = utf8n_to_uvchr((U8*)RExC_parse,
13737 RExC_end - RExC_parse,
13738 &numlen, UTF8_ALLOW_DEFAULT);
13739 RExC_parse += numlen;
13742 value = UCHARAT(RExC_parse++);
13744 /* Some compilers cannot handle switching on 64-bit integer
13745 * values, therefore value cannot be an UV. Yes, this will
13746 * be a problem later if we want switch on Unicode.
13747 * A similar issue a little bit later when switching on
13748 * namedclass. --jhi */
13750 /* If the \ is escaping white space when white space is being
13751 * skipped, it means that that white space is wanted literally, and
13752 * is already in 'value'. Otherwise, need to translate the escape
13753 * into what it signifies. */
13754 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13756 case 'w': namedclass = ANYOF_WORDCHAR; break;
13757 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13758 case 's': namedclass = ANYOF_SPACE; break;
13759 case 'S': namedclass = ANYOF_NSPACE; break;
13760 case 'd': namedclass = ANYOF_DIGIT; break;
13761 case 'D': namedclass = ANYOF_NDIGIT; break;
13762 case 'v': namedclass = ANYOF_VERTWS; break;
13763 case 'V': namedclass = ANYOF_NVERTWS; break;
13764 case 'h': namedclass = ANYOF_HORIZWS; break;
13765 case 'H': namedclass = ANYOF_NHORIZWS; break;
13766 case 'N': /* Handle \N{NAME} in class */
13769 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13770 flagp, depth, &as_text);
13771 if (*flagp & RESTART_UTF8)
13772 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13773 if (cp_count != 1) { /* The typical case drops through */
13774 assert(cp_count != (STRLEN) -1);
13775 if (cp_count == 0) {
13777 RExC_parse++; /* Position after the "}" */
13778 vFAIL("Zero length \\N{}");
13781 ckWARNreg(RExC_parse,
13782 "Ignoring zero length \\N{} in character class");
13785 else { /* cp_count > 1 */
13786 if (! RExC_in_multi_char_class) {
13787 if (invert || range || *RExC_parse == '-') {
13790 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13793 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13798 = add_multi_match(multi_char_matches,
13802 break; /* <value> contains the first code
13803 point. Drop out of the switch to
13806 } /* End of cp_count != 1 */
13808 /* This element should not be processed further in this
13811 value = save_value;
13812 prevvalue = save_prevvalue;
13813 continue; /* Back to top of loop to get next char */
13815 /* Here, is a single code point, and <value> contains it */
13817 /* We consider named characters to be literal characters */
13818 literal_endpoint++;
13827 /* We will handle any undefined properties ourselves */
13828 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13829 /* And we actually would prefer to get
13830 * the straight inversion list of the
13831 * swash, since we will be accessing it
13832 * anyway, to save a little time */
13833 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13835 if (RExC_parse >= RExC_end)
13836 vFAIL2("Empty \\%c{}", (U8)value);
13837 if (*RExC_parse == '{') {
13838 const U8 c = (U8)value;
13839 e = strchr(RExC_parse++, '}');
13841 vFAIL2("Missing right brace on \\%c{}", c);
13842 while (isSPACE(*RExC_parse))
13844 if (e == RExC_parse)
13845 vFAIL2("Empty \\%c{}", c);
13846 n = e - RExC_parse;
13847 while (isSPACE(*(RExC_parse + n - 1)))
13858 if (UCHARAT(RExC_parse) == '^') {
13861 /* toggle. (The rhs xor gets the single bit that
13862 * differs between P and p; the other xor inverts just
13864 value ^= 'P' ^ 'p';
13866 while (isSPACE(*RExC_parse)) {
13871 /* Try to get the definition of the property into
13872 * <invlist>. If /i is in effect, the effective property
13873 * will have its name be <__NAME_i>. The design is
13874 * discussed in commit
13875 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13876 name = savepv(Perl_form(aTHX_
13878 (FOLD) ? "__" : "",
13884 /* Look up the property name, and get its swash and
13885 * inversion list, if the property is found */
13887 SvREFCNT_dec_NN(swash);
13889 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13892 NULL, /* No inversion list */
13895 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13896 HV* curpkg = (IN_PERL_COMPILETIME)
13898 : CopSTASH(PL_curcop);
13900 SvREFCNT_dec_NN(swash);
13904 /* Here didn't find it. It could be a user-defined
13905 * property that will be available at run-time. If we
13906 * accept only compile-time properties, is an error;
13907 * otherwise add it to the list for run-time look up */
13909 RExC_parse = e + 1;
13911 "Property '%"UTF8f"' is unknown",
13912 UTF8fARG(UTF, n, name));
13915 /* If the property name doesn't already have a package
13916 * name, add the current one to it so that it can be
13917 * referred to outside it. [perl #121777] */
13918 if (curpkg && ! instr(name, "::")) {
13919 char* pkgname = HvNAME(curpkg);
13920 if (strNE(pkgname, "main")) {
13921 char* full_name = Perl_form(aTHX_
13925 n = strlen(full_name);
13927 name = savepvn(full_name, n);
13930 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13931 (value == 'p' ? '+' : '!'),
13932 UTF8fARG(UTF, n, name));
13933 has_user_defined_property = TRUE;
13935 /* We don't know yet, so have to assume that the
13936 * property could match something in the Latin1 range,
13937 * hence something that isn't utf8. Note that this
13938 * would cause things in <depends_list> to match
13939 * inappropriately, except that any \p{}, including
13940 * this one forces Unicode semantics, which means there
13941 * is no <depends_list> */
13943 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
13947 /* Here, did get the swash and its inversion list. If
13948 * the swash is from a user-defined property, then this
13949 * whole character class should be regarded as such */
13950 if (swash_init_flags
13951 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13953 has_user_defined_property = TRUE;
13956 /* We warn on matching an above-Unicode code point
13957 * if the match would return true, except don't
13958 * warn for \p{All}, which has exactly one element
13960 (_invlist_contains_cp(invlist, 0x110000)
13961 && (! (_invlist_len(invlist) == 1
13962 && *invlist_array(invlist) == 0)))
13968 /* Invert if asking for the complement */
13969 if (value == 'P') {
13970 _invlist_union_complement_2nd(properties,
13974 /* The swash can't be used as-is, because we've
13975 * inverted things; delay removing it to here after
13976 * have copied its invlist above */
13977 SvREFCNT_dec_NN(swash);
13981 _invlist_union(properties, invlist, &properties);
13986 RExC_parse = e + 1;
13987 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13990 /* \p means they want Unicode semantics */
13991 RExC_uni_semantics = 1;
13994 case 'n': value = '\n'; break;
13995 case 'r': value = '\r'; break;
13996 case 't': value = '\t'; break;
13997 case 'f': value = '\f'; break;
13998 case 'b': value = '\b'; break;
13999 case 'e': value = ESC_NATIVE; break;
14000 case 'a': value = '\a'; break;
14002 RExC_parse--; /* function expects to be pointed at the 'o' */
14004 const char* error_msg;
14005 bool valid = grok_bslash_o(&RExC_parse,
14008 PASS2, /* warnings only in
14011 silence_non_portable,
14017 if (PL_encoding && value < 0x100) {
14018 goto recode_encoding;
14022 RExC_parse--; /* function expects to be pointed at the 'x' */
14024 const char* error_msg;
14025 bool valid = grok_bslash_x(&RExC_parse,
14028 PASS2, /* Output warnings */
14030 silence_non_portable,
14036 if (PL_encoding && value < 0x100)
14037 goto recode_encoding;
14040 value = grok_bslash_c(*RExC_parse++, PASS2);
14042 case '0': case '1': case '2': case '3': case '4':
14043 case '5': case '6': case '7':
14045 /* Take 1-3 octal digits */
14046 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14047 numlen = (strict) ? 4 : 3;
14048 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14049 RExC_parse += numlen;
14052 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14053 vFAIL("Need exactly 3 octal digits");
14055 else if (! SIZE_ONLY /* like \08, \178 */
14057 && RExC_parse < RExC_end
14058 && isDIGIT(*RExC_parse)
14059 && ckWARN(WARN_REGEXP))
14061 SAVEFREESV(RExC_rx_sv);
14062 reg_warn_non_literal_string(
14064 form_short_octal_warning(RExC_parse, numlen));
14065 (void)ReREFCNT_inc(RExC_rx_sv);
14068 if (PL_encoding && value < 0x100)
14069 goto recode_encoding;
14073 if (! RExC_override_recoding) {
14074 SV* enc = PL_encoding;
14075 value = reg_recode((const char)(U8)value, &enc);
14078 vFAIL("Invalid escape in the specified encoding");
14081 ckWARNreg(RExC_parse,
14082 "Invalid escape in the specified encoding");
14088 /* Allow \_ to not give an error */
14089 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14091 vFAIL2("Unrecognized escape \\%c in character class",
14095 SAVEFREESV(RExC_rx_sv);
14096 ckWARN2reg(RExC_parse,
14097 "Unrecognized escape \\%c in character class passed through",
14099 (void)ReREFCNT_inc(RExC_rx_sv);
14103 } /* End of switch on char following backslash */
14104 } /* end of handling backslash escape sequences */
14106 /* Here, we have the current token in 'value' */
14108 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14111 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14112 * literal, as is the character that began the false range, i.e.
14113 * the 'a' in the examples */
14116 const int w = (RExC_parse >= rangebegin)
14117 ? RExC_parse - rangebegin
14121 "False [] range \"%"UTF8f"\"",
14122 UTF8fARG(UTF, w, rangebegin));
14125 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14126 ckWARN2reg(RExC_parse,
14127 "False [] range \"%"UTF8f"\"",
14128 UTF8fARG(UTF, w, rangebegin));
14129 (void)ReREFCNT_inc(RExC_rx_sv);
14130 cp_list = add_cp_to_invlist(cp_list, '-');
14131 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14136 range = 0; /* this was not a true range */
14137 element_count += 2; /* So counts for three values */
14140 classnum = namedclass_to_classnum(namedclass);
14142 if (LOC && namedclass < ANYOF_POSIXL_MAX
14143 #ifndef HAS_ISASCII
14144 && classnum != _CC_ASCII
14147 /* What the Posix classes (like \w, [:space:]) match in locale
14148 * isn't knowable under locale until actual match time. Room
14149 * must be reserved (one time per outer bracketed class) to
14150 * store such classes. The space will contain a bit for each
14151 * named class that is to be matched against. This isn't
14152 * needed for \p{} and pseudo-classes, as they are not affected
14153 * by locale, and hence are dealt with separately */
14154 if (! need_class) {
14157 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14160 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14162 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14163 ANYOF_POSIXL_ZERO(ret);
14166 /* Coverity thinks it is possible for this to be negative; both
14167 * jhi and khw think it's not, but be safer */
14168 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14169 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14171 /* See if it already matches the complement of this POSIX
14173 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14174 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14178 posixl_matches_all = TRUE;
14179 break; /* No need to continue. Since it matches both
14180 e.g., \w and \W, it matches everything, and the
14181 bracketed class can be optimized into qr/./s */
14184 /* Add this class to those that should be checked at runtime */
14185 ANYOF_POSIXL_SET(ret, namedclass);
14187 /* The above-Latin1 characters are not subject to locale rules.
14188 * Just add them, in the second pass, to the
14189 * unconditionally-matched list */
14191 SV* scratch_list = NULL;
14193 /* Get the list of the above-Latin1 code points this
14195 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14196 PL_XPosix_ptrs[classnum],
14198 /* Odd numbers are complements, like
14199 * NDIGIT, NASCII, ... */
14200 namedclass % 2 != 0,
14202 /* Checking if 'cp_list' is NULL first saves an extra
14203 * clone. Its reference count will be decremented at the
14204 * next union, etc, or if this is the only instance, at the
14205 * end of the routine */
14207 cp_list = scratch_list;
14210 _invlist_union(cp_list, scratch_list, &cp_list);
14211 SvREFCNT_dec_NN(scratch_list);
14213 continue; /* Go get next character */
14216 else if (! SIZE_ONLY) {
14218 /* Here, not in pass1 (in that pass we skip calculating the
14219 * contents of this class), and is /l, or is a POSIX class for
14220 * which /l doesn't matter (or is a Unicode property, which is
14221 * skipped here). */
14222 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14223 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14225 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14226 * nor /l make a difference in what these match,
14227 * therefore we just add what they match to cp_list. */
14228 if (classnum != _CC_VERTSPACE) {
14229 assert( namedclass == ANYOF_HORIZWS
14230 || namedclass == ANYOF_NHORIZWS);
14232 /* It turns out that \h is just a synonym for
14234 classnum = _CC_BLANK;
14237 _invlist_union_maybe_complement_2nd(
14239 PL_XPosix_ptrs[classnum],
14240 namedclass % 2 != 0, /* Complement if odd
14241 (NHORIZWS, NVERTWS)
14246 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14247 complement and use nposixes */
14248 SV** posixes_ptr = namedclass % 2 == 0
14251 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14252 _invlist_union_maybe_complement_2nd(
14255 namedclass % 2 != 0,
14259 } /* end of namedclass \blah */
14262 RExC_parse = regpatws(pRExC_state, RExC_parse,
14263 FALSE /* means don't recognize comments */ );
14266 /* If 'range' is set, 'value' is the ending of a range--check its
14267 * validity. (If value isn't a single code point in the case of a
14268 * range, we should have figured that out above in the code that
14269 * catches false ranges). Later, we will handle each individual code
14270 * point in the range. If 'range' isn't set, this could be the
14271 * beginning of a range, so check for that by looking ahead to see if
14272 * the next real character to be processed is the range indicator--the
14276 if (prevvalue > value) /* b-a */ {
14277 const int w = RExC_parse - rangebegin;
14279 "Invalid [] range \"%"UTF8f"\"",
14280 UTF8fARG(UTF, w, rangebegin));
14281 range = 0; /* not a valid range */
14285 prevvalue = value; /* save the beginning of the potential range */
14286 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14287 && *RExC_parse == '-')
14289 char* next_char_ptr = RExC_parse + 1;
14290 if (skip_white) { /* Get the next real char after the '-' */
14291 next_char_ptr = regpatws(pRExC_state,
14293 FALSE); /* means don't recognize
14297 /* If the '-' is at the end of the class (just before the ']',
14298 * it is a literal minus; otherwise it is a range */
14299 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14300 RExC_parse = next_char_ptr;
14302 /* a bad range like \w-, [:word:]- ? */
14303 if (namedclass > OOB_NAMEDCLASS) {
14304 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14305 const int w = RExC_parse >= rangebegin
14306 ? RExC_parse - rangebegin
14309 vFAIL4("False [] range \"%*.*s\"",
14314 "False [] range \"%*.*s\"",
14319 cp_list = add_cp_to_invlist(cp_list, '-');
14323 range = 1; /* yeah, it's a range! */
14324 continue; /* but do it the next time */
14329 if (namedclass > OOB_NAMEDCLASS) {
14333 /* Here, we have a single value, and <prevvalue> is the beginning of
14334 * the range, if any; or <value> if not */
14336 /* non-Latin1 code point implies unicode semantics. Must be set in
14337 * pass1 so is there for the whole of pass 2 */
14339 RExC_uni_semantics = 1;
14342 /* Ready to process either the single value, or the completed range.
14343 * For single-valued non-inverted ranges, we consider the possibility
14344 * of multi-char folds. (We made a conscious decision to not do this
14345 * for the other cases because it can often lead to non-intuitive
14346 * results. For example, you have the peculiar case that:
14347 * "s s" =~ /^[^\xDF]+$/i => Y
14348 * "ss" =~ /^[^\xDF]+$/i => N
14350 * See [perl #89750] */
14351 if (FOLD && allow_multi_folds && value == prevvalue) {
14352 if (value == LATIN_SMALL_LETTER_SHARP_S
14353 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14356 /* Here <value> is indeed a multi-char fold. Get what it is */
14358 U8 foldbuf[UTF8_MAXBYTES_CASE];
14361 UV folded = _to_uni_fold_flags(
14365 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14366 ? FOLD_FLAGS_NOMIX_ASCII
14370 /* Here, <folded> should be the first character of the
14371 * multi-char fold of <value>, with <foldbuf> containing the
14372 * whole thing. But, if this fold is not allowed (because of
14373 * the flags), <fold> will be the same as <value>, and should
14374 * be processed like any other character, so skip the special
14376 if (folded != value) {
14378 /* Skip if we are recursed, currently parsing the class
14379 * again. Otherwise add this character to the list of
14380 * multi-char folds. */
14381 if (! RExC_in_multi_char_class) {
14382 STRLEN cp_count = utf8_length(foldbuf,
14383 foldbuf + foldlen);
14384 SV* multi_fold = sv_2mortal(newSVpvs(""));
14386 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14389 = add_multi_match(multi_char_matches,
14395 /* This element should not be processed further in this
14398 value = save_value;
14399 prevvalue = save_prevvalue;
14405 /* Deal with this element of the class */
14408 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14411 SV* this_range = _new_invlist(1);
14412 _append_range_to_invlist(this_range, prevvalue, value);
14414 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14415 * If this range was specified using something like 'i-j', we want
14416 * to include only the 'i' and the 'j', and not anything in
14417 * between, so exclude non-ASCII, non-alphabetics from it.
14418 * However, if the range was specified with something like
14419 * [\x89-\x91] or [\x89-j], all code points within it should be
14420 * included. literal_endpoint==2 means both ends of the range used
14421 * a literal character, not \x{foo} */
14422 if (literal_endpoint == 2
14423 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14424 || (isUPPER_A(prevvalue) && isUPPER_A(value))))
14426 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14429 /* Since 'this_range' now only contains ascii, the intersection
14430 * of it with anything will still yield only ascii */
14431 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14434 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14435 literal_endpoint = 0;
14436 SvREFCNT_dec_NN(this_range);
14440 range = 0; /* this range (if it was one) is done now */
14441 } /* End of loop through all the text within the brackets */
14443 /* If anything in the class expands to more than one character, we have to
14444 * deal with them by building up a substitute parse string, and recursively
14445 * calling reg() on it, instead of proceeding */
14446 if (multi_char_matches) {
14447 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14450 char *save_end = RExC_end;
14451 char *save_parse = RExC_parse;
14452 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14457 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14458 because too confusing */
14460 sv_catpv(substitute_parse, "(?:");
14464 /* Look at the longest folds first */
14465 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14467 if (av_exists(multi_char_matches, cp_count)) {
14468 AV** this_array_ptr;
14471 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14473 while ((this_sequence = av_pop(*this_array_ptr)) !=
14476 if (! first_time) {
14477 sv_catpv(substitute_parse, "|");
14479 first_time = FALSE;
14481 sv_catpv(substitute_parse, SvPVX(this_sequence));
14486 /* If the character class contains anything else besides these
14487 * multi-character folds, have to include it in recursive parsing */
14488 if (element_count) {
14489 sv_catpv(substitute_parse, "|[");
14490 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14491 sv_catpv(substitute_parse, "]");
14494 sv_catpv(substitute_parse, ")");
14497 /* This is a way to get the parse to skip forward a whole named
14498 * sequence instead of matching the 2nd character when it fails the
14500 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14504 RExC_parse = SvPV(substitute_parse, len);
14505 RExC_end = RExC_parse + len;
14506 RExC_in_multi_char_class = 1;
14507 RExC_override_recoding = 1;
14508 RExC_emit = (regnode *)orig_emit;
14510 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14512 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14514 RExC_parse = save_parse;
14515 RExC_end = save_end;
14516 RExC_in_multi_char_class = 0;
14517 RExC_override_recoding = 0;
14518 SvREFCNT_dec_NN(multi_char_matches);
14522 /* Here, we've gone through the entire class and dealt with multi-char
14523 * folds. We are now in a position that we can do some checks to see if we
14524 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14525 * Currently we only do two checks:
14526 * 1) is in the unlikely event that the user has specified both, eg. \w and
14527 * \W under /l, then the class matches everything. (This optimization
14528 * is done only to make the optimizer code run later work.)
14529 * 2) if the character class contains only a single element (including a
14530 * single range), we see if there is an equivalent node for it.
14531 * Other checks are possible */
14532 if (! ret_invlist /* Can't optimize if returning the constructed
14534 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14539 if (UNLIKELY(posixl_matches_all)) {
14542 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14543 \w or [:digit:] or \p{foo}
14546 /* All named classes are mapped into POSIXish nodes, with its FLAG
14547 * argument giving which class it is */
14548 switch ((I32)namedclass) {
14549 case ANYOF_UNIPROP:
14552 /* These don't depend on the charset modifiers. They always
14553 * match under /u rules */
14554 case ANYOF_NHORIZWS:
14555 case ANYOF_HORIZWS:
14556 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14559 case ANYOF_NVERTWS:
14564 /* The actual POSIXish node for all the rest depends on the
14565 * charset modifier. The ones in the first set depend only on
14566 * ASCII or, if available on this platform, locale */
14570 op = (LOC) ? POSIXL : POSIXA;
14581 /* under /a could be alpha */
14583 if (ASCII_RESTRICTED) {
14584 namedclass = ANYOF_ALPHA + (namedclass % 2);
14592 /* The rest have more possibilities depending on the charset.
14593 * We take advantage of the enum ordering of the charset
14594 * modifiers to get the exact node type, */
14596 op = POSIXD + get_regex_charset(RExC_flags);
14597 if (op > POSIXA) { /* /aa is same as /a */
14602 /* The odd numbered ones are the complements of the
14603 * next-lower even number one */
14604 if (namedclass % 2 == 1) {
14608 arg = namedclass_to_classnum(namedclass);
14612 else if (value == prevvalue) {
14614 /* Here, the class consists of just a single code point */
14617 if (! LOC && value == '\n') {
14618 op = REG_ANY; /* Optimize [^\n] */
14619 *flagp |= HASWIDTH|SIMPLE;
14623 else if (value < 256 || UTF) {
14625 /* Optimize a single value into an EXACTish node, but not if it
14626 * would require converting the pattern to UTF-8. */
14627 op = compute_EXACTish(pRExC_state);
14629 } /* Otherwise is a range */
14630 else if (! LOC) { /* locale could vary these */
14631 if (prevvalue == '0') {
14632 if (value == '9') {
14637 else if (prevvalue == 'A') {
14640 && literal_endpoint == 2
14643 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14647 else if (prevvalue == 'a') {
14650 && literal_endpoint == 2
14653 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14659 /* Here, we have changed <op> away from its initial value iff we found
14660 * an optimization */
14663 /* Throw away this ANYOF regnode, and emit the calculated one,
14664 * which should correspond to the beginning, not current, state of
14666 const char * cur_parse = RExC_parse;
14667 RExC_parse = (char *)orig_parse;
14671 /* To get locale nodes to not use the full ANYOF size would
14672 * require moving the code above that writes the portions
14673 * of it that aren't in other nodes to after this point.
14674 * e.g. ANYOF_POSIXL_SET */
14675 RExC_size = orig_size;
14679 RExC_emit = (regnode *)orig_emit;
14680 if (PL_regkind[op] == POSIXD) {
14681 if (op == POSIXL) {
14682 RExC_contains_locale = 1;
14685 op += NPOSIXD - POSIXD;
14690 ret = reg_node(pRExC_state, op);
14692 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14696 *flagp |= HASWIDTH|SIMPLE;
14698 else if (PL_regkind[op] == EXACT) {
14699 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14700 TRUE /* downgradable to EXACT */
14704 RExC_parse = (char *) cur_parse;
14706 SvREFCNT_dec(posixes);
14707 SvREFCNT_dec(nposixes);
14708 SvREFCNT_dec(cp_list);
14709 SvREFCNT_dec(cp_foldable_list);
14716 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14718 /* If folding, we calculate all characters that could fold to or from the
14719 * ones already on the list */
14720 if (cp_foldable_list) {
14722 UV start, end; /* End points of code point ranges */
14724 SV* fold_intersection = NULL;
14727 /* Our calculated list will be for Unicode rules. For locale
14728 * matching, we have to keep a separate list that is consulted at
14729 * runtime only when the locale indicates Unicode rules. For
14730 * non-locale, we just use to the general list */
14732 use_list = &only_utf8_locale_list;
14735 use_list = &cp_list;
14738 /* Only the characters in this class that participate in folds need
14739 * be checked. Get the intersection of this class and all the
14740 * possible characters that are foldable. This can quickly narrow
14741 * down a large class */
14742 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14743 &fold_intersection);
14745 /* The folds for all the Latin1 characters are hard-coded into this
14746 * program, but we have to go out to disk to get the others. */
14747 if (invlist_highest(cp_foldable_list) >= 256) {
14749 /* This is a hash that for a particular fold gives all
14750 * characters that are involved in it */
14751 if (! PL_utf8_foldclosures) {
14752 _load_PL_utf8_foldclosures();
14756 /* Now look at the foldable characters in this class individually */
14757 invlist_iterinit(fold_intersection);
14758 while (invlist_iternext(fold_intersection, &start, &end)) {
14761 /* Look at every character in the range */
14762 for (j = start; j <= end; j++) {
14763 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14769 if (IS_IN_SOME_FOLD_L1(j)) {
14771 /* ASCII is always matched; non-ASCII is matched
14772 * only under Unicode rules (which could happen
14773 * under /l if the locale is a UTF-8 one */
14774 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14775 *use_list = add_cp_to_invlist(*use_list,
14776 PL_fold_latin1[j]);
14780 add_cp_to_invlist(depends_list,
14781 PL_fold_latin1[j]);
14785 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14786 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14788 add_above_Latin1_folds(pRExC_state,
14795 /* Here is an above Latin1 character. We don't have the
14796 * rules hard-coded for it. First, get its fold. This is
14797 * the simple fold, as the multi-character folds have been
14798 * handled earlier and separated out */
14799 _to_uni_fold_flags(j, foldbuf, &foldlen,
14800 (ASCII_FOLD_RESTRICTED)
14801 ? FOLD_FLAGS_NOMIX_ASCII
14804 /* Single character fold of above Latin1. Add everything in
14805 * its fold closure to the list that this node should match.
14806 * The fold closures data structure is a hash with the keys
14807 * being the UTF-8 of every character that is folded to, like
14808 * 'k', and the values each an array of all code points that
14809 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14810 * Multi-character folds are not included */
14811 if ((listp = hv_fetch(PL_utf8_foldclosures,
14812 (char *) foldbuf, foldlen, FALSE)))
14814 AV* list = (AV*) *listp;
14816 for (k = 0; k <= av_tindex(list); k++) {
14817 SV** c_p = av_fetch(list, k, FALSE);
14823 /* /aa doesn't allow folds between ASCII and non- */
14824 if ((ASCII_FOLD_RESTRICTED
14825 && (isASCII(c) != isASCII(j))))
14830 /* Folds under /l which cross the 255/256 boundary
14831 * are added to a separate list. (These are valid
14832 * only when the locale is UTF-8.) */
14833 if (c < 256 && LOC) {
14834 *use_list = add_cp_to_invlist(*use_list, c);
14838 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14840 cp_list = add_cp_to_invlist(cp_list, c);
14843 /* Similarly folds involving non-ascii Latin1
14844 * characters under /d are added to their list */
14845 depends_list = add_cp_to_invlist(depends_list,
14852 SvREFCNT_dec_NN(fold_intersection);
14855 /* Now that we have finished adding all the folds, there is no reason
14856 * to keep the foldable list separate */
14857 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14858 SvREFCNT_dec_NN(cp_foldable_list);
14861 /* And combine the result (if any) with any inversion list from posix
14862 * classes. The lists are kept separate up to now because we don't want to
14863 * fold the classes (folding of those is automatically handled by the swash
14864 * fetching code) */
14865 if (posixes || nposixes) {
14866 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14867 /* Under /a and /aa, nothing above ASCII matches these */
14868 _invlist_intersection(posixes,
14869 PL_XPosix_ptrs[_CC_ASCII],
14873 if (DEPENDS_SEMANTICS) {
14874 /* Under /d, everything in the upper half of the Latin1 range
14875 * matches these complements */
14876 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
14878 else if (AT_LEAST_ASCII_RESTRICTED) {
14879 /* Under /a and /aa, everything above ASCII matches these
14881 _invlist_union_complement_2nd(nposixes,
14882 PL_XPosix_ptrs[_CC_ASCII],
14886 _invlist_union(posixes, nposixes, &posixes);
14887 SvREFCNT_dec_NN(nposixes);
14890 posixes = nposixes;
14893 if (! DEPENDS_SEMANTICS) {
14895 _invlist_union(cp_list, posixes, &cp_list);
14896 SvREFCNT_dec_NN(posixes);
14903 /* Under /d, we put into a separate list the Latin1 things that
14904 * match only when the target string is utf8 */
14905 SV* nonascii_but_latin1_properties = NULL;
14906 _invlist_intersection(posixes, PL_UpperLatin1,
14907 &nonascii_but_latin1_properties);
14908 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14911 _invlist_union(cp_list, posixes, &cp_list);
14912 SvREFCNT_dec_NN(posixes);
14918 if (depends_list) {
14919 _invlist_union(depends_list, nonascii_but_latin1_properties,
14921 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14924 depends_list = nonascii_but_latin1_properties;
14929 /* And combine the result (if any) with any inversion list from properties.
14930 * The lists are kept separate up to now so that we can distinguish the two
14931 * in regards to matching above-Unicode. A run-time warning is generated
14932 * if a Unicode property is matched against a non-Unicode code point. But,
14933 * we allow user-defined properties to match anything, without any warning,
14934 * and we also suppress the warning if there is a portion of the character
14935 * class that isn't a Unicode property, and which matches above Unicode, \W
14936 * or [\x{110000}] for example.
14937 * (Note that in this case, unlike the Posix one above, there is no
14938 * <depends_list>, because having a Unicode property forces Unicode
14943 /* If it matters to the final outcome, see if a non-property
14944 * component of the class matches above Unicode. If so, the
14945 * warning gets suppressed. This is true even if just a single
14946 * such code point is specified, as though not strictly correct if
14947 * another such code point is matched against, the fact that they
14948 * are using above-Unicode code points indicates they should know
14949 * the issues involved */
14951 warn_super = ! (invert
14952 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14955 _invlist_union(properties, cp_list, &cp_list);
14956 SvREFCNT_dec_NN(properties);
14959 cp_list = properties;
14963 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14967 /* Here, we have calculated what code points should be in the character
14970 * Now we can see about various optimizations. Fold calculation (which we
14971 * did above) needs to take place before inversion. Otherwise /[^k]/i
14972 * would invert to include K, which under /i would match k, which it
14973 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14974 * folded until runtime */
14976 /* If we didn't do folding, it's because some information isn't available
14977 * until runtime; set the run-time fold flag for these. (We don't have to
14978 * worry about properties folding, as that is taken care of by the swash
14979 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14980 * locales, or the class matches at least one 0-255 range code point */
14982 if (only_utf8_locale_list) {
14983 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14985 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14988 invlist_iterinit(cp_list);
14989 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14990 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14992 invlist_iterfinish(cp_list);
14996 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14997 * at compile time. Besides not inverting folded locale now, we can't
14998 * invert if there are things such as \w, which aren't known until runtime
15002 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15004 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15006 _invlist_invert(cp_list);
15008 /* Any swash can't be used as-is, because we've inverted things */
15010 SvREFCNT_dec_NN(swash);
15014 /* Clear the invert flag since have just done it here */
15019 *ret_invlist = cp_list;
15020 SvREFCNT_dec(swash);
15022 /* Discard the generated node */
15024 RExC_size = orig_size;
15027 RExC_emit = orig_emit;
15032 /* Some character classes are equivalent to other nodes. Such nodes take
15033 * up less room and generally fewer operations to execute than ANYOF nodes.
15034 * Above, we checked for and optimized into some such equivalents for
15035 * certain common classes that are easy to test. Getting to this point in
15036 * the code means that the class didn't get optimized there. Since this
15037 * code is only executed in Pass 2, it is too late to save space--it has
15038 * been allocated in Pass 1, and currently isn't given back. But turning
15039 * things into an EXACTish node can allow the optimizer to join it to any
15040 * adjacent such nodes. And if the class is equivalent to things like /./,
15041 * expensive run-time swashes can be avoided. Now that we have more
15042 * complete information, we can find things necessarily missed by the
15043 * earlier code. I (khw) am not sure how much to look for here. It would
15044 * be easy, but perhaps too slow, to check any candidates against all the
15045 * node types they could possibly match using _invlistEQ(). */
15050 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15051 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15053 /* We don't optimize if we are supposed to make sure all non-Unicode
15054 * code points raise a warning, as only ANYOF nodes have this check.
15056 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15059 U8 op = END; /* The optimzation node-type */
15060 const char * cur_parse= RExC_parse;
15062 invlist_iterinit(cp_list);
15063 if (! invlist_iternext(cp_list, &start, &end)) {
15065 /* Here, the list is empty. This happens, for example, when a
15066 * Unicode property is the only thing in the character class, and
15067 * it doesn't match anything. (perluniprops.pod notes such
15070 *flagp |= HASWIDTH|SIMPLE;
15072 else if (start == end) { /* The range is a single code point */
15073 if (! invlist_iternext(cp_list, &start, &end)
15075 /* Don't do this optimization if it would require changing
15076 * the pattern to UTF-8 */
15077 && (start < 256 || UTF))
15079 /* Here, the list contains a single code point. Can optimize
15080 * into an EXACTish node */
15089 /* A locale node under folding with one code point can be
15090 * an EXACTFL, as its fold won't be calculated until
15096 /* Here, we are generally folding, but there is only one
15097 * code point to match. If we have to, we use an EXACT
15098 * node, but it would be better for joining with adjacent
15099 * nodes in the optimization pass if we used the same
15100 * EXACTFish node that any such are likely to be. We can
15101 * do this iff the code point doesn't participate in any
15102 * folds. For example, an EXACTF of a colon is the same as
15103 * an EXACT one, since nothing folds to or from a colon. */
15105 if (IS_IN_SOME_FOLD_L1(value)) {
15110 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15115 /* If we haven't found the node type, above, it means we
15116 * can use the prevailing one */
15118 op = compute_EXACTish(pRExC_state);
15123 else if (start == 0) {
15124 if (end == UV_MAX) {
15126 *flagp |= HASWIDTH|SIMPLE;
15129 else if (end == '\n' - 1
15130 && invlist_iternext(cp_list, &start, &end)
15131 && start == '\n' + 1 && end == UV_MAX)
15134 *flagp |= HASWIDTH|SIMPLE;
15138 invlist_iterfinish(cp_list);
15141 RExC_parse = (char *)orig_parse;
15142 RExC_emit = (regnode *)orig_emit;
15144 ret = reg_node(pRExC_state, op);
15146 RExC_parse = (char *)cur_parse;
15148 if (PL_regkind[op] == EXACT) {
15149 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15150 TRUE /* downgradable to EXACT */
15154 SvREFCNT_dec_NN(cp_list);
15159 /* Here, <cp_list> contains all the code points we can determine at
15160 * compile time that match under all conditions. Go through it, and
15161 * for things that belong in the bitmap, put them there, and delete from
15162 * <cp_list>. While we are at it, see if everything above 255 is in the
15163 * list, and if so, set a flag to speed up execution */
15165 populate_ANYOF_from_invlist(ret, &cp_list);
15168 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15171 /* Here, the bitmap has been populated with all the Latin1 code points that
15172 * always match. Can now add to the overall list those that match only
15173 * when the target string is UTF-8 (<depends_list>). */
15174 if (depends_list) {
15176 _invlist_union(cp_list, depends_list, &cp_list);
15177 SvREFCNT_dec_NN(depends_list);
15180 cp_list = depends_list;
15182 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15185 /* If there is a swash and more than one element, we can't use the swash in
15186 * the optimization below. */
15187 if (swash && element_count > 1) {
15188 SvREFCNT_dec_NN(swash);
15192 /* Note that the optimization of using 'swash' if it is the only thing in
15193 * the class doesn't have us change swash at all, so it can include things
15194 * that are also in the bitmap; otherwise we have purposely deleted that
15195 * duplicate information */
15196 set_ANYOF_arg(pRExC_state, ret, cp_list,
15197 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15199 only_utf8_locale_list,
15200 swash, has_user_defined_property);
15202 *flagp |= HASWIDTH|SIMPLE;
15204 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15205 RExC_contains_locale = 1;
15211 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15214 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15215 regnode* const node,
15217 SV* const runtime_defns,
15218 SV* const only_utf8_locale_list,
15220 const bool has_user_defined_property)
15222 /* Sets the arg field of an ANYOF-type node 'node', using information about
15223 * the node passed-in. If there is nothing outside the node's bitmap, the
15224 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15225 * the count returned by add_data(), having allocated and stored an array,
15226 * av, that that count references, as follows:
15227 * av[0] stores the character class description in its textual form.
15228 * This is used later (regexec.c:Perl_regclass_swash()) to
15229 * initialize the appropriate swash, and is also useful for dumping
15230 * the regnode. This is set to &PL_sv_undef if the textual
15231 * description is not needed at run-time (as happens if the other
15232 * elements completely define the class)
15233 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15234 * computed from av[0]. But if no further computation need be done,
15235 * the swash is stored here now (and av[0] is &PL_sv_undef).
15236 * av[2] stores the inversion list of code points that match only if the
15237 * current locale is UTF-8
15238 * av[3] stores the cp_list inversion list for use in addition or instead
15239 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15240 * (Otherwise everything needed is already in av[0] and av[1])
15241 * av[4] is set if any component of the class is from a user-defined
15242 * property; used only if av[3] exists */
15246 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15248 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15249 assert(! (ANYOF_FLAGS(node)
15250 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15251 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15252 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15255 AV * const av = newAV();
15258 assert(ANYOF_FLAGS(node)
15259 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15260 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15262 av_store(av, 0, (runtime_defns)
15263 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15266 av_store(av, 1, swash);
15267 SvREFCNT_dec_NN(cp_list);
15270 av_store(av, 1, &PL_sv_undef);
15272 av_store(av, 3, cp_list);
15273 av_store(av, 4, newSVuv(has_user_defined_property));
15277 if (only_utf8_locale_list) {
15278 av_store(av, 2, only_utf8_locale_list);
15281 av_store(av, 2, &PL_sv_undef);
15284 rv = newRV_noinc(MUTABLE_SV(av));
15285 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15286 RExC_rxi->data->data[n] = (void*)rv;
15291 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15293 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15294 const regnode* node,
15297 SV** only_utf8_locale_ptr,
15301 /* For internal core use only.
15302 * Returns the swash for the input 'node' in the regex 'prog'.
15303 * If <doinit> is 'true', will attempt to create the swash if not already
15305 * If <listsvp> is non-null, will return the printable contents of the
15306 * swash. This can be used to get debugging information even before the
15307 * swash exists, by calling this function with 'doinit' set to false, in
15308 * which case the components that will be used to eventually create the
15309 * swash are returned (in a printable form).
15310 * If <exclude_list> is not NULL, it is an inversion list of things to
15311 * exclude from what's returned in <listsvp>.
15312 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15313 * that, in spite of this function's name, the swash it returns may include
15314 * the bitmap data as well */
15317 SV *si = NULL; /* Input swash initialization string */
15318 SV* invlist = NULL;
15320 RXi_GET_DECL(prog,progi);
15321 const struct reg_data * const data = prog ? progi->data : NULL;
15323 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15325 assert(ANYOF_FLAGS(node)
15326 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15327 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15329 if (data && data->count) {
15330 const U32 n = ARG(node);
15332 if (data->what[n] == 's') {
15333 SV * const rv = MUTABLE_SV(data->data[n]);
15334 AV * const av = MUTABLE_AV(SvRV(rv));
15335 SV **const ary = AvARRAY(av);
15336 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15338 si = *ary; /* ary[0] = the string to initialize the swash with */
15340 /* Elements 3 and 4 are either both present or both absent. [3] is
15341 * any inversion list generated at compile time; [4] indicates if
15342 * that inversion list has any user-defined properties in it. */
15343 if (av_tindex(av) >= 2) {
15344 if (only_utf8_locale_ptr
15346 && ary[2] != &PL_sv_undef)
15348 *only_utf8_locale_ptr = ary[2];
15351 assert(only_utf8_locale_ptr);
15352 *only_utf8_locale_ptr = NULL;
15355 if (av_tindex(av) >= 3) {
15357 if (SvUV(ary[4])) {
15358 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15366 /* Element [1] is reserved for the set-up swash. If already there,
15367 * return it; if not, create it and store it there */
15368 if (ary[1] && SvROK(ary[1])) {
15371 else if (doinit && ((si && si != &PL_sv_undef)
15372 || (invlist && invlist != &PL_sv_undef))) {
15374 sw = _core_swash_init("utf8", /* the utf8 package */
15378 0, /* not from tr/// */
15380 &swash_init_flags);
15381 (void)av_store(av, 1, sw);
15386 /* If requested, return a printable version of what this swash matches */
15388 SV* matches_string = newSVpvs("");
15390 /* The swash should be used, if possible, to get the data, as it
15391 * contains the resolved data. But this function can be called at
15392 * compile-time, before everything gets resolved, in which case we
15393 * return the currently best available information, which is the string
15394 * that will eventually be used to do that resolving, 'si' */
15395 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15396 && (si && si != &PL_sv_undef))
15398 sv_catsv(matches_string, si);
15401 /* Add the inversion list to whatever we have. This may have come from
15402 * the swash, or from an input parameter */
15404 if (exclude_list) {
15405 SV* clone = invlist_clone(invlist);
15406 _invlist_subtract(clone, exclude_list, &clone);
15407 sv_catsv(matches_string, _invlist_contents(clone));
15408 SvREFCNT_dec_NN(clone);
15411 sv_catsv(matches_string, _invlist_contents(invlist));
15414 *listsvp = matches_string;
15419 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15421 /* reg_skipcomment()
15423 Absorbs an /x style # comment from the input stream,
15424 returning a pointer to the first character beyond the comment, or if the
15425 comment terminates the pattern without anything following it, this returns
15426 one past the final character of the pattern (in other words, RExC_end) and
15427 sets the REG_RUN_ON_COMMENT_SEEN flag.
15429 Note it's the callers responsibility to ensure that we are
15430 actually in /x mode
15434 PERL_STATIC_INLINE char*
15435 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15437 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15441 while (p < RExC_end) {
15442 if (*(++p) == '\n') {
15447 /* we ran off the end of the pattern without ending the comment, so we have
15448 * to add an \n when wrapping */
15449 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15455 Advances the parse position, and optionally absorbs
15456 "whitespace" from the inputstream.
15458 Without /x "whitespace" means (?#...) style comments only,
15459 with /x this means (?#...) and # comments and whitespace proper.
15461 Returns the RExC_parse point from BEFORE the scan occurs.
15463 This is the /x friendly way of saying RExC_parse++.
15467 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15469 char* const retval = RExC_parse++;
15471 PERL_ARGS_ASSERT_NEXTCHAR;
15474 if (RExC_end - RExC_parse >= 3
15475 && *RExC_parse == '('
15476 && RExC_parse[1] == '?'
15477 && RExC_parse[2] == '#')
15479 while (*RExC_parse != ')') {
15480 if (RExC_parse == RExC_end)
15481 FAIL("Sequence (?#... not terminated");
15487 if (RExC_flags & RXf_PMf_EXTENDED) {
15488 char * p = regpatws(pRExC_state, RExC_parse,
15489 TRUE); /* means recognize comments */
15490 if (p != RExC_parse) {
15500 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15502 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15503 * space. In pass1, it aligns and increments RExC_size; in pass2,
15506 regnode * const ret = RExC_emit;
15507 GET_RE_DEBUG_FLAGS_DECL;
15509 PERL_ARGS_ASSERT_REGNODE_GUTS;
15511 assert(extra_size >= regarglen[op]);
15514 SIZE_ALIGN(RExC_size);
15515 RExC_size += 1 + extra_size;
15518 if (RExC_emit >= RExC_emit_bound)
15519 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15520 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15522 NODE_ALIGN_FILL(ret);
15523 #ifndef RE_TRACK_PATTERN_OFFSETS
15524 PERL_UNUSED_ARG(name);
15526 if (RExC_offsets) { /* MJD */
15528 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15531 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15532 ? "Overwriting end of array!\n" : "OK",
15533 (UV)(RExC_emit - RExC_emit_start),
15534 (UV)(RExC_parse - RExC_start),
15535 (UV)RExC_offsets[0]));
15536 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15543 - reg_node - emit a node
15545 STATIC regnode * /* Location. */
15546 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15548 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15550 PERL_ARGS_ASSERT_REG_NODE;
15552 assert(regarglen[op] == 0);
15555 regnode *ptr = ret;
15556 FILL_ADVANCE_NODE(ptr, op);
15563 - reganode - emit a node with an argument
15565 STATIC regnode * /* Location. */
15566 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15568 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15570 PERL_ARGS_ASSERT_REGANODE;
15572 assert(regarglen[op] == 1);
15575 regnode *ptr = ret;
15576 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15583 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15585 /* emit a node with U32 and I32 arguments */
15587 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15589 PERL_ARGS_ASSERT_REG2LANODE;
15591 assert(regarglen[op] == 2);
15594 regnode *ptr = ret;
15595 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15602 - reguni - emit (if appropriate) a Unicode character
15604 PERL_STATIC_INLINE STRLEN
15605 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15607 PERL_ARGS_ASSERT_REGUNI;
15609 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15613 - reginsert - insert an operator in front of already-emitted operand
15615 * Means relocating the operand.
15618 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15623 const int offset = regarglen[(U8)op];
15624 const int size = NODE_STEP_REGNODE + offset;
15625 GET_RE_DEBUG_FLAGS_DECL;
15627 PERL_ARGS_ASSERT_REGINSERT;
15628 PERL_UNUSED_CONTEXT;
15629 PERL_UNUSED_ARG(depth);
15630 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15631 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15640 if (RExC_open_parens) {
15642 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15643 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15644 if ( RExC_open_parens[paren] >= opnd ) {
15645 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15646 RExC_open_parens[paren] += size;
15648 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15650 if ( RExC_close_parens[paren] >= opnd ) {
15651 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15652 RExC_close_parens[paren] += size;
15654 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15659 while (src > opnd) {
15660 StructCopy(--src, --dst, regnode);
15661 #ifdef RE_TRACK_PATTERN_OFFSETS
15662 if (RExC_offsets) { /* MJD 20010112 */
15664 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15668 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15669 ? "Overwriting end of array!\n" : "OK",
15670 (UV)(src - RExC_emit_start),
15671 (UV)(dst - RExC_emit_start),
15672 (UV)RExC_offsets[0]));
15673 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15674 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15680 place = opnd; /* Op node, where operand used to be. */
15681 #ifdef RE_TRACK_PATTERN_OFFSETS
15682 if (RExC_offsets) { /* MJD */
15684 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15688 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15689 ? "Overwriting end of array!\n" : "OK",
15690 (UV)(place - RExC_emit_start),
15691 (UV)(RExC_parse - RExC_start),
15692 (UV)RExC_offsets[0]));
15693 Set_Node_Offset(place, RExC_parse);
15694 Set_Node_Length(place, 1);
15697 src = NEXTOPER(place);
15698 FILL_ADVANCE_NODE(place, op);
15699 Zero(src, offset, regnode);
15703 - regtail - set the next-pointer at the end of a node chain of p to val.
15704 - SEE ALSO: regtail_study
15706 /* TODO: All three parms should be const */
15708 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15709 const regnode *val,U32 depth)
15712 GET_RE_DEBUG_FLAGS_DECL;
15714 PERL_ARGS_ASSERT_REGTAIL;
15716 PERL_UNUSED_ARG(depth);
15722 /* Find last node. */
15725 regnode * const temp = regnext(scan);
15727 SV * const mysv=sv_newmortal();
15728 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15729 regprop(RExC_rx, mysv, scan, NULL);
15730 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15731 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15732 (temp == NULL ? "->" : ""),
15733 (temp == NULL ? PL_reg_name[OP(val)] : "")
15741 if (reg_off_by_arg[OP(scan)]) {
15742 ARG_SET(scan, val - scan);
15745 NEXT_OFF(scan) = val - scan;
15751 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15752 - Look for optimizable sequences at the same time.
15753 - currently only looks for EXACT chains.
15755 This is experimental code. The idea is to use this routine to perform
15756 in place optimizations on branches and groups as they are constructed,
15757 with the long term intention of removing optimization from study_chunk so
15758 that it is purely analytical.
15760 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15761 to control which is which.
15764 /* TODO: All four parms should be const */
15767 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15768 const regnode *val,U32 depth)
15772 #ifdef EXPERIMENTAL_INPLACESCAN
15775 GET_RE_DEBUG_FLAGS_DECL;
15777 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15783 /* Find last node. */
15787 regnode * const temp = regnext(scan);
15788 #ifdef EXPERIMENTAL_INPLACESCAN
15789 if (PL_regkind[OP(scan)] == EXACT) {
15790 bool unfolded_multi_char; /* Unexamined in this routine */
15791 if (join_exact(pRExC_state, scan, &min,
15792 &unfolded_multi_char, 1, val, depth+1))
15797 switch (OP(scan)) {
15800 case EXACTFA_NO_TRIE:
15805 if( exact == PSEUDO )
15807 else if ( exact != OP(scan) )
15816 SV * const mysv=sv_newmortal();
15817 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15818 regprop(RExC_rx, mysv, scan, NULL);
15819 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15820 SvPV_nolen_const(mysv),
15821 REG_NODE_NUM(scan),
15822 PL_reg_name[exact]);
15829 SV * const mysv_val=sv_newmortal();
15830 DEBUG_PARSE_MSG("");
15831 regprop(RExC_rx, mysv_val, val, NULL);
15832 PerlIO_printf(Perl_debug_log,
15833 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15834 SvPV_nolen_const(mysv_val),
15835 (IV)REG_NODE_NUM(val),
15839 if (reg_off_by_arg[OP(scan)]) {
15840 ARG_SET(scan, val - scan);
15843 NEXT_OFF(scan) = val - scan;
15851 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15856 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15861 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15863 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15864 if (flags & (1<<bit)) {
15865 if (!set++ && lead)
15866 PerlIO_printf(Perl_debug_log, "%s",lead);
15867 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15872 PerlIO_printf(Perl_debug_log, "\n");
15874 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15879 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15885 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15887 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15888 if (flags & (1<<bit)) {
15889 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15892 if (!set++ && lead)
15893 PerlIO_printf(Perl_debug_log, "%s",lead);
15894 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15897 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15898 if (!set++ && lead) {
15899 PerlIO_printf(Perl_debug_log, "%s",lead);
15902 case REGEX_UNICODE_CHARSET:
15903 PerlIO_printf(Perl_debug_log, "UNICODE");
15905 case REGEX_LOCALE_CHARSET:
15906 PerlIO_printf(Perl_debug_log, "LOCALE");
15908 case REGEX_ASCII_RESTRICTED_CHARSET:
15909 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15911 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15912 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15915 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15921 PerlIO_printf(Perl_debug_log, "\n");
15923 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15929 Perl_regdump(pTHX_ const regexp *r)
15932 SV * const sv = sv_newmortal();
15933 SV *dsv= sv_newmortal();
15934 RXi_GET_DECL(r,ri);
15935 GET_RE_DEBUG_FLAGS_DECL;
15937 PERL_ARGS_ASSERT_REGDUMP;
15939 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15941 /* Header fields of interest. */
15942 if (r->anchored_substr) {
15943 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15944 RE_SV_DUMPLEN(r->anchored_substr), 30);
15945 PerlIO_printf(Perl_debug_log,
15946 "anchored %s%s at %"IVdf" ",
15947 s, RE_SV_TAIL(r->anchored_substr),
15948 (IV)r->anchored_offset);
15949 } else if (r->anchored_utf8) {
15950 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15951 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15952 PerlIO_printf(Perl_debug_log,
15953 "anchored utf8 %s%s at %"IVdf" ",
15954 s, RE_SV_TAIL(r->anchored_utf8),
15955 (IV)r->anchored_offset);
15957 if (r->float_substr) {
15958 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15959 RE_SV_DUMPLEN(r->float_substr), 30);
15960 PerlIO_printf(Perl_debug_log,
15961 "floating %s%s at %"IVdf"..%"UVuf" ",
15962 s, RE_SV_TAIL(r->float_substr),
15963 (IV)r->float_min_offset, (UV)r->float_max_offset);
15964 } else if (r->float_utf8) {
15965 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15966 RE_SV_DUMPLEN(r->float_utf8), 30);
15967 PerlIO_printf(Perl_debug_log,
15968 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15969 s, RE_SV_TAIL(r->float_utf8),
15970 (IV)r->float_min_offset, (UV)r->float_max_offset);
15972 if (r->check_substr || r->check_utf8)
15973 PerlIO_printf(Perl_debug_log,
15975 (r->check_substr == r->float_substr
15976 && r->check_utf8 == r->float_utf8
15977 ? "(checking floating" : "(checking anchored"));
15978 if (r->intflags & PREGf_NOSCAN)
15979 PerlIO_printf(Perl_debug_log, " noscan");
15980 if (r->extflags & RXf_CHECK_ALL)
15981 PerlIO_printf(Perl_debug_log, " isall");
15982 if (r->check_substr || r->check_utf8)
15983 PerlIO_printf(Perl_debug_log, ") ");
15985 if (ri->regstclass) {
15986 regprop(r, sv, ri->regstclass, NULL);
15987 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15989 if (r->intflags & PREGf_ANCH) {
15990 PerlIO_printf(Perl_debug_log, "anchored");
15991 if (r->intflags & PREGf_ANCH_MBOL)
15992 PerlIO_printf(Perl_debug_log, "(MBOL)");
15993 if (r->intflags & PREGf_ANCH_SBOL)
15994 PerlIO_printf(Perl_debug_log, "(SBOL)");
15995 if (r->intflags & PREGf_ANCH_GPOS)
15996 PerlIO_printf(Perl_debug_log, "(GPOS)");
15997 PerlIO_putc(Perl_debug_log, ' ');
15999 if (r->intflags & PREGf_GPOS_SEEN)
16000 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16001 if (r->intflags & PREGf_SKIP)
16002 PerlIO_printf(Perl_debug_log, "plus ");
16003 if (r->intflags & PREGf_IMPLICIT)
16004 PerlIO_printf(Perl_debug_log, "implicit ");
16005 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16006 if (r->extflags & RXf_EVAL_SEEN)
16007 PerlIO_printf(Perl_debug_log, "with eval ");
16008 PerlIO_printf(Perl_debug_log, "\n");
16010 regdump_extflags("r->extflags: ",r->extflags);
16011 regdump_intflags("r->intflags: ",r->intflags);
16014 PERL_ARGS_ASSERT_REGDUMP;
16015 PERL_UNUSED_CONTEXT;
16016 PERL_UNUSED_ARG(r);
16017 #endif /* DEBUGGING */
16021 - regprop - printable representation of opcode, with run time support
16025 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
16030 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16031 static const char * const anyofs[] = {
16032 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16033 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16034 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16035 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16036 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
16037 || _CC_VERTSPACE != 16
16038 #error Need to adjust order of anyofs[]
16075 RXi_GET_DECL(prog,progi);
16076 GET_RE_DEBUG_FLAGS_DECL;
16078 PERL_ARGS_ASSERT_REGPROP;
16082 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16083 /* It would be nice to FAIL() here, but this may be called from
16084 regexec.c, and it would be hard to supply pRExC_state. */
16085 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16086 (int)OP(o), (int)REGNODE_MAX);
16087 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16089 k = PL_regkind[OP(o)];
16092 sv_catpvs(sv, " ");
16093 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16094 * is a crude hack but it may be the best for now since
16095 * we have no flag "this EXACTish node was UTF-8"
16097 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16098 PERL_PV_ESCAPE_UNI_DETECT |
16099 PERL_PV_ESCAPE_NONASCII |
16100 PERL_PV_PRETTY_ELLIPSES |
16101 PERL_PV_PRETTY_LTGT |
16102 PERL_PV_PRETTY_NOCLEAR
16104 } else if (k == TRIE) {
16105 /* print the details of the trie in dumpuntil instead, as
16106 * progi->data isn't available here */
16107 const char op = OP(o);
16108 const U32 n = ARG(o);
16109 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16110 (reg_ac_data *)progi->data->data[n] :
16112 const reg_trie_data * const trie
16113 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16115 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16116 DEBUG_TRIE_COMPILE_r(
16117 Perl_sv_catpvf(aTHX_ sv,
16118 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16119 (UV)trie->startstate,
16120 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16121 (UV)trie->wordcount,
16124 (UV)TRIE_CHARCOUNT(trie),
16125 (UV)trie->uniquecharcount
16128 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16129 sv_catpvs(sv, "[");
16130 (void) put_charclass_bitmap_innards(sv,
16131 (IS_ANYOF_TRIE(op))
16133 : TRIE_BITMAP(trie),
16135 sv_catpvs(sv, "]");
16138 } else if (k == CURLY) {
16139 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16140 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16141 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16143 else if (k == WHILEM && o->flags) /* Ordinal/of */
16144 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16145 else if (k == REF || k == OPEN || k == CLOSE
16146 || k == GROUPP || OP(o)==ACCEPT)
16148 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16149 if ( RXp_PAREN_NAMES(prog) ) {
16150 if ( k != REF || (OP(o) < NREF)) {
16151 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16152 SV **name= av_fetch(list, ARG(o), 0 );
16154 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16157 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
16158 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16159 I32 *nums=(I32*)SvPVX(sv_dat);
16160 SV **name= av_fetch(list, nums[0], 0 );
16163 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16164 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16165 (n ? "," : ""), (IV)nums[n]);
16167 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16171 if ( k == REF && reginfo) {
16172 U32 n = ARG(o); /* which paren pair */
16173 I32 ln = prog->offs[n].start;
16174 if (prog->lastparen < n || ln == -1)
16175 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16176 else if (ln == prog->offs[n].end)
16177 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16179 const char *s = reginfo->strbeg + ln;
16180 Perl_sv_catpvf(aTHX_ sv, ": ");
16181 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16182 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16185 } else if (k == GOSUB)
16186 /* Paren and offset */
16187 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16188 else if (k == VERB) {
16190 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16191 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16192 } else if (k == LOGICAL)
16193 /* 2: embedded, otherwise 1 */
16194 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16195 else if (k == ANYOF) {
16196 const U8 flags = ANYOF_FLAGS(o);
16198 SV* bitmap_invlist; /* Will hold what the bit map contains */
16201 if (flags & ANYOF_LOCALE_FLAGS)
16202 sv_catpvs(sv, "{loc}");
16203 if (flags & ANYOF_LOC_FOLD)
16204 sv_catpvs(sv, "{i}");
16205 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16206 if (flags & ANYOF_INVERT)
16207 sv_catpvs(sv, "^");
16209 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16211 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16214 /* output any special charclass tests (used entirely under use
16216 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16218 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16219 if (ANYOF_POSIXL_TEST(o,i)) {
16220 sv_catpv(sv, anyofs[i]);
16226 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16227 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16228 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16232 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16233 if (flags & ANYOF_INVERT)
16234 /*make sure the invert info is in each */
16235 sv_catpvs(sv, "^");
16238 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16239 sv_catpvs(sv, "{non-utf8-latin1-all}");
16242 /* output information about the unicode matching */
16243 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16244 sv_catpvs(sv, "{above_bitmap_all}");
16245 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16246 SV *lv; /* Set if there is something outside the bit map. */
16247 bool byte_output = FALSE; /* If something in the bitmap has
16249 SV *only_utf8_locale;
16251 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16252 * is used to guarantee that nothing in the bitmap gets
16254 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16255 &lv, &only_utf8_locale,
16257 if (lv && lv != &PL_sv_undef) {
16258 char *s = savesvpv(lv);
16259 char * const origs = s;
16261 while (*s && *s != '\n')
16265 const char * const t = ++s;
16267 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16268 sv_catpvs(sv, "{outside bitmap}");
16271 sv_catpvs(sv, "{utf8}");
16275 sv_catpvs(sv, " ");
16281 /* Truncate very long output */
16282 if (s - origs > 256) {
16283 Perl_sv_catpvf(aTHX_ sv,
16285 (int) (s - origs - 1),
16291 else if (*s == '\t') {
16305 SvREFCNT_dec_NN(lv);
16308 if ((flags & ANYOF_LOC_FOLD)
16309 && only_utf8_locale
16310 && only_utf8_locale != &PL_sv_undef)
16313 int max_entries = 256;
16315 sv_catpvs(sv, "{utf8 locale}");
16316 invlist_iterinit(only_utf8_locale);
16317 while (invlist_iternext(only_utf8_locale,
16319 put_range(sv, start, end, FALSE);
16321 if (max_entries < 0) {
16322 sv_catpvs(sv, "...");
16326 invlist_iterfinish(only_utf8_locale);
16330 SvREFCNT_dec(bitmap_invlist);
16333 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16335 else if (k == POSIXD || k == NPOSIXD) {
16336 U8 index = FLAGS(o) * 2;
16337 if (index < C_ARRAY_LENGTH(anyofs)) {
16338 if (*anyofs[index] != '[') {
16341 sv_catpv(sv, anyofs[index]);
16342 if (*anyofs[index] != '[') {
16347 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16350 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16351 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16352 else if (OP(o) == SBOL)
16353 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16355 PERL_UNUSED_CONTEXT;
16356 PERL_UNUSED_ARG(sv);
16357 PERL_UNUSED_ARG(o);
16358 PERL_UNUSED_ARG(prog);
16359 PERL_UNUSED_ARG(reginfo);
16360 #endif /* DEBUGGING */
16366 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16367 { /* Assume that RE_INTUIT is set */
16368 struct regexp *const prog = ReANY(r);
16369 GET_RE_DEBUG_FLAGS_DECL;
16371 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16372 PERL_UNUSED_CONTEXT;
16376 const char * const s = SvPV_nolen_const(prog->check_substr
16377 ? prog->check_substr : prog->check_utf8);
16379 if (!PL_colorset) reginitcolors();
16380 PerlIO_printf(Perl_debug_log,
16381 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16383 prog->check_substr ? "" : "utf8 ",
16384 PL_colors[5],PL_colors[0],
16387 (strlen(s) > 60 ? "..." : ""));
16390 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16396 handles refcounting and freeing the perl core regexp structure. When
16397 it is necessary to actually free the structure the first thing it
16398 does is call the 'free' method of the regexp_engine associated to
16399 the regexp, allowing the handling of the void *pprivate; member
16400 first. (This routine is not overridable by extensions, which is why
16401 the extensions free is called first.)
16403 See regdupe and regdupe_internal if you change anything here.
16405 #ifndef PERL_IN_XSUB_RE
16407 Perl_pregfree(pTHX_ REGEXP *r)
16413 Perl_pregfree2(pTHX_ REGEXP *rx)
16415 struct regexp *const r = ReANY(rx);
16416 GET_RE_DEBUG_FLAGS_DECL;
16418 PERL_ARGS_ASSERT_PREGFREE2;
16420 if (r->mother_re) {
16421 ReREFCNT_dec(r->mother_re);
16423 CALLREGFREE_PVT(rx); /* free the private data */
16424 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16425 Safefree(r->xpv_len_u.xpvlenu_pv);
16428 SvREFCNT_dec(r->anchored_substr);
16429 SvREFCNT_dec(r->anchored_utf8);
16430 SvREFCNT_dec(r->float_substr);
16431 SvREFCNT_dec(r->float_utf8);
16432 Safefree(r->substrs);
16434 RX_MATCH_COPY_FREE(rx);
16435 #ifdef PERL_ANY_COW
16436 SvREFCNT_dec(r->saved_copy);
16439 SvREFCNT_dec(r->qr_anoncv);
16440 rx->sv_u.svu_rx = 0;
16445 This is a hacky workaround to the structural issue of match results
16446 being stored in the regexp structure which is in turn stored in
16447 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16448 could be PL_curpm in multiple contexts, and could require multiple
16449 result sets being associated with the pattern simultaneously, such
16450 as when doing a recursive match with (??{$qr})
16452 The solution is to make a lightweight copy of the regexp structure
16453 when a qr// is returned from the code executed by (??{$qr}) this
16454 lightweight copy doesn't actually own any of its data except for
16455 the starp/end and the actual regexp structure itself.
16461 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16463 struct regexp *ret;
16464 struct regexp *const r = ReANY(rx);
16465 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16467 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16470 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16472 SvOK_off((SV *)ret_x);
16474 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16475 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16476 made both spots point to the same regexp body.) */
16477 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16478 assert(!SvPVX(ret_x));
16479 ret_x->sv_u.svu_rx = temp->sv_any;
16480 temp->sv_any = NULL;
16481 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16482 SvREFCNT_dec_NN(temp);
16483 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16484 ing below will not set it. */
16485 SvCUR_set(ret_x, SvCUR(rx));
16488 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16489 sv_force_normal(sv) is called. */
16491 ret = ReANY(ret_x);
16493 SvFLAGS(ret_x) |= SvUTF8(rx);
16494 /* We share the same string buffer as the original regexp, on which we
16495 hold a reference count, incremented when mother_re is set below.
16496 The string pointer is copied here, being part of the regexp struct.
16498 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16499 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16501 const I32 npar = r->nparens+1;
16502 Newx(ret->offs, npar, regexp_paren_pair);
16503 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16506 Newx(ret->substrs, 1, struct reg_substr_data);
16507 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16509 SvREFCNT_inc_void(ret->anchored_substr);
16510 SvREFCNT_inc_void(ret->anchored_utf8);
16511 SvREFCNT_inc_void(ret->float_substr);
16512 SvREFCNT_inc_void(ret->float_utf8);
16514 /* check_substr and check_utf8, if non-NULL, point to either their
16515 anchored or float namesakes, and don't hold a second reference. */
16517 RX_MATCH_COPIED_off(ret_x);
16518 #ifdef PERL_ANY_COW
16519 ret->saved_copy = NULL;
16521 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16522 SvREFCNT_inc_void(ret->qr_anoncv);
16528 /* regfree_internal()
16530 Free the private data in a regexp. This is overloadable by
16531 extensions. Perl takes care of the regexp structure in pregfree(),
16532 this covers the *pprivate pointer which technically perl doesn't
16533 know about, however of course we have to handle the
16534 regexp_internal structure when no extension is in use.
16536 Note this is called before freeing anything in the regexp
16541 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16543 struct regexp *const r = ReANY(rx);
16544 RXi_GET_DECL(r,ri);
16545 GET_RE_DEBUG_FLAGS_DECL;
16547 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16553 SV *dsv= sv_newmortal();
16554 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16555 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16556 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16557 PL_colors[4],PL_colors[5],s);
16560 #ifdef RE_TRACK_PATTERN_OFFSETS
16562 Safefree(ri->u.offsets); /* 20010421 MJD */
16564 if (ri->code_blocks) {
16566 for (n = 0; n < ri->num_code_blocks; n++)
16567 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16568 Safefree(ri->code_blocks);
16572 int n = ri->data->count;
16575 /* If you add a ->what type here, update the comment in regcomp.h */
16576 switch (ri->data->what[n]) {
16582 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16585 Safefree(ri->data->data[n]);
16591 { /* Aho Corasick add-on structure for a trie node.
16592 Used in stclass optimization only */
16594 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16595 #ifdef USE_ITHREADS
16599 refcount = --aho->refcount;
16602 PerlMemShared_free(aho->states);
16603 PerlMemShared_free(aho->fail);
16604 /* do this last!!!! */
16605 PerlMemShared_free(ri->data->data[n]);
16606 /* we should only ever get called once, so
16607 * assert as much, and also guard the free
16608 * which /might/ happen twice. At the least
16609 * it will make code anlyzers happy and it
16610 * doesn't cost much. - Yves */
16611 assert(ri->regstclass);
16612 if (ri->regstclass) {
16613 PerlMemShared_free(ri->regstclass);
16614 ri->regstclass = 0;
16621 /* trie structure. */
16623 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16624 #ifdef USE_ITHREADS
16628 refcount = --trie->refcount;
16631 PerlMemShared_free(trie->charmap);
16632 PerlMemShared_free(trie->states);
16633 PerlMemShared_free(trie->trans);
16635 PerlMemShared_free(trie->bitmap);
16637 PerlMemShared_free(trie->jump);
16638 PerlMemShared_free(trie->wordinfo);
16639 /* do this last!!!! */
16640 PerlMemShared_free(ri->data->data[n]);
16645 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16646 ri->data->what[n]);
16649 Safefree(ri->data->what);
16650 Safefree(ri->data);
16656 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16657 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16658 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16661 re_dup - duplicate a regexp.
16663 This routine is expected to clone a given regexp structure. It is only
16664 compiled under USE_ITHREADS.
16666 After all of the core data stored in struct regexp is duplicated
16667 the regexp_engine.dupe method is used to copy any private data
16668 stored in the *pprivate pointer. This allows extensions to handle
16669 any duplication it needs to do.
16671 See pregfree() and regfree_internal() if you change anything here.
16673 #if defined(USE_ITHREADS)
16674 #ifndef PERL_IN_XSUB_RE
16676 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16680 const struct regexp *r = ReANY(sstr);
16681 struct regexp *ret = ReANY(dstr);
16683 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16685 npar = r->nparens+1;
16686 Newx(ret->offs, npar, regexp_paren_pair);
16687 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16689 if (ret->substrs) {
16690 /* Do it this way to avoid reading from *r after the StructCopy().
16691 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16692 cache, it doesn't matter. */
16693 const bool anchored = r->check_substr
16694 ? r->check_substr == r->anchored_substr
16695 : r->check_utf8 == r->anchored_utf8;
16696 Newx(ret->substrs, 1, struct reg_substr_data);
16697 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16699 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16700 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16701 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16702 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16704 /* check_substr and check_utf8, if non-NULL, point to either their
16705 anchored or float namesakes, and don't hold a second reference. */
16707 if (ret->check_substr) {
16709 assert(r->check_utf8 == r->anchored_utf8);
16710 ret->check_substr = ret->anchored_substr;
16711 ret->check_utf8 = ret->anchored_utf8;
16713 assert(r->check_substr == r->float_substr);
16714 assert(r->check_utf8 == r->float_utf8);
16715 ret->check_substr = ret->float_substr;
16716 ret->check_utf8 = ret->float_utf8;
16718 } else if (ret->check_utf8) {
16720 ret->check_utf8 = ret->anchored_utf8;
16722 ret->check_utf8 = ret->float_utf8;
16727 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16728 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16731 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16733 if (RX_MATCH_COPIED(dstr))
16734 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16736 ret->subbeg = NULL;
16737 #ifdef PERL_ANY_COW
16738 ret->saved_copy = NULL;
16741 /* Whether mother_re be set or no, we need to copy the string. We
16742 cannot refrain from copying it when the storage points directly to
16743 our mother regexp, because that's
16744 1: a buffer in a different thread
16745 2: something we no longer hold a reference on
16746 so we need to copy it locally. */
16747 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16748 ret->mother_re = NULL;
16750 #endif /* PERL_IN_XSUB_RE */
16755 This is the internal complement to regdupe() which is used to copy
16756 the structure pointed to by the *pprivate pointer in the regexp.
16757 This is the core version of the extension overridable cloning hook.
16758 The regexp structure being duplicated will be copied by perl prior
16759 to this and will be provided as the regexp *r argument, however
16760 with the /old/ structures pprivate pointer value. Thus this routine
16761 may override any copying normally done by perl.
16763 It returns a pointer to the new regexp_internal structure.
16767 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16770 struct regexp *const r = ReANY(rx);
16771 regexp_internal *reti;
16773 RXi_GET_DECL(r,ri);
16775 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16779 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16780 char, regexp_internal);
16781 Copy(ri->program, reti->program, len+1, regnode);
16783 reti->num_code_blocks = ri->num_code_blocks;
16784 if (ri->code_blocks) {
16786 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16787 struct reg_code_block);
16788 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16789 struct reg_code_block);
16790 for (n = 0; n < ri->num_code_blocks; n++)
16791 reti->code_blocks[n].src_regex = (REGEXP*)
16792 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16795 reti->code_blocks = NULL;
16797 reti->regstclass = NULL;
16800 struct reg_data *d;
16801 const int count = ri->data->count;
16804 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16805 char, struct reg_data);
16806 Newx(d->what, count, U8);
16809 for (i = 0; i < count; i++) {
16810 d->what[i] = ri->data->what[i];
16811 switch (d->what[i]) {
16812 /* see also regcomp.h and regfree_internal() */
16813 case 'a': /* actually an AV, but the dup function is identical. */
16817 case 'u': /* actually an HV, but the dup function is identical. */
16818 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16821 /* This is cheating. */
16822 Newx(d->data[i], 1, regnode_ssc);
16823 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16824 reti->regstclass = (regnode*)d->data[i];
16827 /* Trie stclasses are readonly and can thus be shared
16828 * without duplication. We free the stclass in pregfree
16829 * when the corresponding reg_ac_data struct is freed.
16831 reti->regstclass= ri->regstclass;
16835 ((reg_trie_data*)ri->data->data[i])->refcount++;
16840 d->data[i] = ri->data->data[i];
16843 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16844 ri->data->what[i]);
16853 reti->name_list_idx = ri->name_list_idx;
16855 #ifdef RE_TRACK_PATTERN_OFFSETS
16856 if (ri->u.offsets) {
16857 Newx(reti->u.offsets, 2*len+1, U32);
16858 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16861 SetProgLen(reti,len);
16864 return (void*)reti;
16867 #endif /* USE_ITHREADS */
16869 #ifndef PERL_IN_XSUB_RE
16872 - regnext - dig the "next" pointer out of a node
16875 Perl_regnext(pTHX_ regnode *p)
16882 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16883 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16884 (int)OP(p), (int)REGNODE_MAX);
16887 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16896 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16899 STRLEN l1 = strlen(pat1);
16900 STRLEN l2 = strlen(pat2);
16903 const char *message;
16905 PERL_ARGS_ASSERT_RE_CROAK2;
16911 Copy(pat1, buf, l1 , char);
16912 Copy(pat2, buf + l1, l2 , char);
16913 buf[l1 + l2] = '\n';
16914 buf[l1 + l2 + 1] = '\0';
16915 va_start(args, pat2);
16916 msv = vmess(buf, &args);
16918 message = SvPV_const(msv,l1);
16921 Copy(message, buf, l1 , char);
16922 /* l1-1 to avoid \n */
16923 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16927 /* Certain characters are output as a sequence with the first being a
16929 #define isBACKSLASHED_PUNCT(c) \
16930 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
16933 S_put_code_point(pTHX_ SV *sv, UV c)
16935 PERL_ARGS_ASSERT_PUT_CODE_POINT;
16938 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
16940 else if (isPRINT(c)) {
16941 const char string = (char) c;
16942 if (isBACKSLASHED_PUNCT(c))
16943 sv_catpvs(sv, "\\");
16944 sv_catpvn(sv, &string, 1);
16947 const char * const mnemonic = cntrl_to_mnemonic((char) c);
16949 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
16952 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
16957 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
16960 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
16962 /* Appends to 'sv' a displayable version of the range of code points from
16963 * 'start' to 'end'. It assumes that only ASCII printables are displayable
16964 * as-is (though some of these will be escaped by put_code_point()). */
16966 const unsigned int min_range_count = 3;
16968 assert(start <= end);
16970 PERL_ARGS_ASSERT_PUT_RANGE;
16972 while (start <= end) {
16974 const char * format;
16976 if (end - start < min_range_count) {
16978 /* Individual chars in short ranges */
16979 for (; start <= end; start++) {
16980 put_code_point(sv, start);
16985 /* If permitted by the input options, and there is a possibility that
16986 * this range contains a printable literal, look to see if there is
16988 if (allow_literals && start <= MAX_PRINT_A) {
16990 /* If the range begin isn't an ASCII printable, effectively split
16991 * the range into two parts:
16992 * 1) the portion before the first such printable,
16994 * and output them separately. */
16995 if (! isPRINT_A(start)) {
16996 UV temp_end = start + 1;
16998 /* There is no point looking beyond the final possible
16999 * printable, in MAX_PRINT_A */
17000 UV max = MIN(end, MAX_PRINT_A);
17002 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17006 /* Here, temp_end points to one beyond the first printable if
17007 * found, or to one beyond 'max' if not. If none found, make
17008 * sure that we use the entire range */
17009 if (temp_end > MAX_PRINT_A) {
17010 temp_end = end + 1;
17013 /* Output the first part of the split range, the part that
17014 * doesn't have printables, with no looking for literals
17015 * (otherwise we would infinitely recurse) */
17016 put_range(sv, start, temp_end - 1, FALSE);
17018 /* The 2nd part of the range (if any) starts here. */
17021 /* We continue instead of dropping down because even if the 2nd
17022 * part is non-empty, it could be so short that we want to
17023 * output it specially, as tested for at the top of this loop.
17028 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17029 * output a sub-range of just the digits or letters, then process
17030 * the remaining portion as usual. */
17031 if (isALPHANUMERIC_A(start)) {
17032 UV mask = (isDIGIT_A(start))
17037 UV temp_end = start + 1;
17039 /* Find the end of the sub-range that includes just the
17040 * characters in the same class as the first character in it */
17041 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17046 /* For short ranges, don't duplicate the code above to output
17047 * them; just call recursively */
17048 if (temp_end - start < min_range_count) {
17049 put_range(sv, start, temp_end, FALSE);
17051 else { /* Output as a range */
17052 put_code_point(sv, start);
17053 sv_catpvs(sv, "-");
17054 put_code_point(sv, temp_end);
17056 start = temp_end + 1;
17060 /* We output any other printables as individual characters */
17061 if (isPUNCT_A(start) || isSPACE_A(start)) {
17062 while (start <= end && (isPUNCT_A(start)
17063 || isSPACE_A(start)))
17065 put_code_point(sv, start);
17070 } /* End of looking for literals */
17072 /* Here is not to output as a literal. Some control characters have
17073 * mnemonic names. Split off any of those at the beginning and end of
17074 * the range to print mnemonically. It isn't possible for many of
17075 * these to be in a row, so this won't overwhelm with output */
17076 while (isMNEMONIC_CNTRL(start) && start <= end) {
17077 put_code_point(sv, start);
17080 if (start < end && isMNEMONIC_CNTRL(end)) {
17082 /* Here, the final character in the range has a mnemonic name.
17083 * Work backwards from the end to find the final non-mnemonic */
17084 UV temp_end = end - 1;
17085 while (isMNEMONIC_CNTRL(temp_end)) {
17089 /* And separately output the range that doesn't have mnemonics */
17090 put_range(sv, start, temp_end, FALSE);
17092 /* Then output the mnemonic trailing controls */
17093 start = temp_end + 1;
17094 while (start <= end) {
17095 put_code_point(sv, start);
17101 /* As a final resort, output the range or subrange as hex. */
17103 this_end = (end < NUM_ANYOF_CODE_POINTS)
17105 : NUM_ANYOF_CODE_POINTS - 1;
17106 format = (this_end < 256)
17107 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17108 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17109 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17115 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17117 /* Appends to 'sv' a displayable version of the innards of the bracketed
17118 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17119 * output anything, and bitmap_invlist, if not NULL, will point to an
17120 * inversion list of what is in the bit map */
17124 unsigned int punct_count = 0;
17125 SV* invlist = NULL;
17126 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17127 bool allow_literals = TRUE;
17129 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17131 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17133 /* Worst case is exactly every-other code point is in the list */
17134 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17136 /* Convert the bit map to an inversion list, keeping track of how many
17137 * ASCII puncts are set, including an extra amount for the backslashed
17139 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17140 if (BITMAP_TEST(bitmap, i)) {
17141 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17142 if (isPUNCT_A(i)) {
17144 if isBACKSLASHED_PUNCT(i) {
17151 /* Nothing to output */
17152 if (_invlist_len(*invlist_ptr) == 0) {
17153 SvREFCNT_dec(invlist);
17157 /* Generally, it is more readable if printable characters are output as
17158 * literals, but if a range (nearly) spans all of them, it's best to output
17159 * it as a single range. This code will use a single range if all but 2
17160 * printables are in it */
17161 invlist_iterinit(*invlist_ptr);
17162 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17164 /* If range starts beyond final printable, it doesn't have any in it */
17165 if (start > MAX_PRINT_A) {
17169 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17170 * all but two, the range must start and end no later than 2 from
17172 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17173 if (end > MAX_PRINT_A) {
17179 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17180 allow_literals = FALSE;
17185 invlist_iterfinish(*invlist_ptr);
17187 /* The legibility of the output depends mostly on how many punctuation
17188 * characters are output. There are 32 possible ASCII ones, and some have
17189 * an additional backslash, bringing it to currently 36, so if any more
17190 * than 18 are to be output, we can instead output it as its complement,
17191 * yielding fewer puncts, and making it more legible. But give some weight
17192 * to the fact that outputting it as a complement is less legible than a
17193 * straight output, so don't complement unless we are somewhat over the 18
17195 if (allow_literals && punct_count > 22) {
17196 sv_catpvs(sv, "^");
17198 /* Add everything remaining to the list, so when we invert it just
17199 * below, it will be excluded */
17200 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17201 _invlist_invert(*invlist_ptr);
17204 /* Here we have figured things out. Output each range */
17205 invlist_iterinit(*invlist_ptr);
17206 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17207 if (start >= NUM_ANYOF_CODE_POINTS) {
17210 put_range(sv, start, end, allow_literals);
17212 invlist_iterfinish(*invlist_ptr);
17217 #define CLEAR_OPTSTART \
17218 if (optstart) STMT_START { \
17219 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17220 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17224 #define DUMPUNTIL(b,e) \
17226 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17228 STATIC const regnode *
17229 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17230 const regnode *last, const regnode *plast,
17231 SV* sv, I32 indent, U32 depth)
17233 U8 op = PSEUDO; /* Arbitrary non-END op. */
17234 const regnode *next;
17235 const regnode *optstart= NULL;
17237 RXi_GET_DECL(r,ri);
17238 GET_RE_DEBUG_FLAGS_DECL;
17240 PERL_ARGS_ASSERT_DUMPUNTIL;
17242 #ifdef DEBUG_DUMPUNTIL
17243 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17244 last ? last-start : 0,plast ? plast-start : 0);
17247 if (plast && plast < last)
17250 while (PL_regkind[op] != END && (!last || node < last)) {
17252 /* While that wasn't END last time... */
17255 if (op == CLOSE || op == WHILEM)
17257 next = regnext((regnode *)node);
17260 if (OP(node) == OPTIMIZED) {
17261 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17268 regprop(r, sv, node, NULL);
17269 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17270 (int)(2*indent + 1), "", SvPVX_const(sv));
17272 if (OP(node) != OPTIMIZED) {
17273 if (next == NULL) /* Next ptr. */
17274 PerlIO_printf(Perl_debug_log, " (0)");
17275 else if (PL_regkind[(U8)op] == BRANCH
17276 && PL_regkind[OP(next)] != BRANCH )
17277 PerlIO_printf(Perl_debug_log, " (FAIL)");
17279 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17280 (void)PerlIO_putc(Perl_debug_log, '\n');
17284 if (PL_regkind[(U8)op] == BRANCHJ) {
17287 const regnode *nnode = (OP(next) == LONGJMP
17288 ? regnext((regnode *)next)
17290 if (last && nnode > last)
17292 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17295 else if (PL_regkind[(U8)op] == BRANCH) {
17297 DUMPUNTIL(NEXTOPER(node), next);
17299 else if ( PL_regkind[(U8)op] == TRIE ) {
17300 const regnode *this_trie = node;
17301 const char op = OP(node);
17302 const U32 n = ARG(node);
17303 const reg_ac_data * const ac = op>=AHOCORASICK ?
17304 (reg_ac_data *)ri->data->data[n] :
17306 const reg_trie_data * const trie =
17307 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17309 AV *const trie_words
17310 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17312 const regnode *nextbranch= NULL;
17315 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17316 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17318 PerlIO_printf(Perl_debug_log, "%*s%s ",
17319 (int)(2*(indent+3)), "",
17321 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17322 SvCUR(*elem_ptr), 60,
17323 PL_colors[0], PL_colors[1],
17325 ? PERL_PV_ESCAPE_UNI
17327 | PERL_PV_PRETTY_ELLIPSES
17328 | PERL_PV_PRETTY_LTGT
17333 U16 dist= trie->jump[word_idx+1];
17334 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17335 (UV)((dist ? this_trie + dist : next) - start));
17338 nextbranch= this_trie + trie->jump[0];
17339 DUMPUNTIL(this_trie + dist, nextbranch);
17341 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17342 nextbranch= regnext((regnode *)nextbranch);
17344 PerlIO_printf(Perl_debug_log, "\n");
17347 if (last && next > last)
17352 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17353 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17354 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17356 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17358 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17360 else if ( op == PLUS || op == STAR) {
17361 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17363 else if (PL_regkind[(U8)op] == ANYOF) {
17364 /* arglen 1 + class block */
17365 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17366 ? ANYOF_POSIXL_SKIP
17368 node = NEXTOPER(node);
17370 else if (PL_regkind[(U8)op] == EXACT) {
17371 /* Literal string, where present. */
17372 node += NODE_SZ_STR(node) - 1;
17373 node = NEXTOPER(node);
17376 node = NEXTOPER(node);
17377 node += regarglen[(U8)op];
17379 if (op == CURLYX || op == OPEN)
17383 #ifdef DEBUG_DUMPUNTIL
17384 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17389 #endif /* DEBUGGING */
17393 * c-indentation-style: bsd
17394 * c-basic-offset: 4
17395 * indent-tabs-mode: nil
17398 * ex: set ts=8 sts=4 sw=4 et: