5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 SSize_t whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
130 implies compiling, so don't emit */
131 regnode emit_dummy; /* placeholder for emit to point to */
132 I32 naughty; /* How bad is this pattern? */
133 I32 sawback; /* Did we see \1, ...? */
135 SSize_t size; /* Code size. */
136 I32 npar; /* Capture buffer count, (OPEN). */
137 I32 cpar; /* Capture buffer count, (CLOSE). */
138 I32 nestroot; /* root parens we are in - used by accept */
141 regnode **open_parens; /* pointers to open parens */
142 regnode **close_parens; /* pointers to close parens */
143 regnode *opend; /* END node in program */
144 I32 utf8; /* whether the pattern is utf8 or not */
145 I32 orig_utf8; /* whether the pattern was originally in utf8 */
146 /* XXX use this for future optimisation of case
147 * where pattern must be upgraded to utf8. */
148 I32 uni_semantics; /* If a d charset modifier should use unicode
149 rules, even if the pattern is not in
151 HV *paren_names; /* Paren names */
153 regnode **recurse; /* Recurse regops */
154 I32 recurse_count; /* Number of recurse regops */
157 I32 override_recoding;
158 I32 in_multi_char_class;
159 struct reg_code_block *code_blocks; /* positions of literal (?{})
161 int num_code_blocks; /* size of code_blocks[] */
162 int code_index; /* next code_blocks[] slot */
164 char *starttry; /* -Dr: where regtry was called. */
165 #define RExC_starttry (pRExC_state->starttry)
167 SV *runtime_code_qr; /* qr with the runtime code blocks */
169 const char *lastparse;
171 AV *paren_name_list; /* idx -> name */
172 #define RExC_lastparse (pRExC_state->lastparse)
173 #define RExC_lastnum (pRExC_state->lastnum)
174 #define RExC_paren_name_list (pRExC_state->paren_name_list)
178 #define RExC_flags (pRExC_state->flags)
179 #define RExC_pm_flags (pRExC_state->pm_flags)
180 #define RExC_precomp (pRExC_state->precomp)
181 #define RExC_rx_sv (pRExC_state->rx_sv)
182 #define RExC_rx (pRExC_state->rx)
183 #define RExC_rxi (pRExC_state->rxi)
184 #define RExC_start (pRExC_state->start)
185 #define RExC_end (pRExC_state->end)
186 #define RExC_parse (pRExC_state->parse)
187 #define RExC_whilem_seen (pRExC_state->whilem_seen)
188 #ifdef RE_TRACK_PATTERN_OFFSETS
189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
191 #define RExC_emit (pRExC_state->emit)
192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
193 #define RExC_emit_start (pRExC_state->emit_start)
194 #define RExC_emit_bound (pRExC_state->emit_bound)
195 #define RExC_naughty (pRExC_state->naughty)
196 #define RExC_sawback (pRExC_state->sawback)
197 #define RExC_seen (pRExC_state->seen)
198 #define RExC_size (pRExC_state->size)
199 #define RExC_npar (pRExC_state->npar)
200 #define RExC_nestroot (pRExC_state->nestroot)
201 #define RExC_extralen (pRExC_state->extralen)
202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203 #define RExC_utf8 (pRExC_state->utf8)
204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206 #define RExC_open_parens (pRExC_state->open_parens)
207 #define RExC_close_parens (pRExC_state->close_parens)
208 #define RExC_opend (pRExC_state->opend)
209 #define RExC_paren_names (pRExC_state->paren_names)
210 #define RExC_recurse (pRExC_state->recurse)
211 #define RExC_recurse_count (pRExC_state->recurse_count)
212 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
213 #define RExC_contains_locale (pRExC_state->contains_locale)
214 #define RExC_override_recoding (pRExC_state->override_recoding)
215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
218 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
219 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
220 ((*s) == '{' && regcurly(s, FALSE)))
223 #undef SPSTART /* dratted cpp namespace... */
226 * Flags to be passed up and down.
228 #define WORST 0 /* Worst case. */
229 #define HASWIDTH 0x01 /* Known to match non-null strings. */
231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
232 * character. (There needs to be a case: in the switch statement in regexec.c
233 * for any node marked SIMPLE.) Note that this is not the same thing as
236 #define SPSTART 0x04 /* Starts with * or + */
237 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
238 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
239 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
243 /* whether trie related optimizations are enabled */
244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245 #define TRIE_STUDY_OPT
246 #define FULL_TRIE_STUDY
252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253 #define PBITVAL(paren) (1 << ((paren) & 7))
254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
258 #define REQUIRE_UTF8 STMT_START { \
260 *flagp = RESTART_UTF8; \
265 /* This converts the named class defined in regcomp.h to its equivalent class
266 * number defined in handy.h. */
267 #define namedclass_to_classnum(class) ((int) ((class) / 2))
268 #define classnum_to_namedclass(classnum) ((classnum) * 2)
270 /* About scan_data_t.
272 During optimisation we recurse through the regexp program performing
273 various inplace (keyhole style) optimisations. In addition study_chunk
274 and scan_commit populate this data structure with information about
275 what strings MUST appear in the pattern. We look for the longest
276 string that must appear at a fixed location, and we look for the
277 longest string that may appear at a floating location. So for instance
282 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
283 strings (because they follow a .* construct). study_chunk will identify
284 both FOO and BAR as being the longest fixed and floating strings respectively.
286 The strings can be composites, for instance
290 will result in a composite fixed substring 'foo'.
292 For each string some basic information is maintained:
294 - offset or min_offset
295 This is the position the string must appear at, or not before.
296 It also implicitly (when combined with minlenp) tells us how many
297 characters must match before the string we are searching for.
298 Likewise when combined with minlenp and the length of the string it
299 tells us how many characters must appear after the string we have
303 Only used for floating strings. This is the rightmost point that
304 the string can appear at. If set to SSize_t_MAX it indicates that the
305 string can occur infinitely far to the right.
308 A pointer to the minimum number of characters of the pattern that the
309 string was found inside. This is important as in the case of positive
310 lookahead or positive lookbehind we can have multiple patterns
315 The minimum length of the pattern overall is 3, the minimum length
316 of the lookahead part is 3, but the minimum length of the part that
317 will actually match is 1. So 'FOO's minimum length is 3, but the
318 minimum length for the F is 1. This is important as the minimum length
319 is used to determine offsets in front of and behind the string being
320 looked for. Since strings can be composites this is the length of the
321 pattern at the time it was committed with a scan_commit. Note that
322 the length is calculated by study_chunk, so that the minimum lengths
323 are not known until the full pattern has been compiled, thus the
324 pointer to the value.
328 In the case of lookbehind the string being searched for can be
329 offset past the start point of the final matching string.
330 If this value was just blithely removed from the min_offset it would
331 invalidate some of the calculations for how many chars must match
332 before or after (as they are derived from min_offset and minlen and
333 the length of the string being searched for).
334 When the final pattern is compiled and the data is moved from the
335 scan_data_t structure into the regexp structure the information
336 about lookbehind is factored in, with the information that would
337 have been lost precalculated in the end_shift field for the
340 The fields pos_min and pos_delta are used to store the minimum offset
341 and the delta to the maximum offset at the current point in the pattern.
345 typedef struct scan_data_t {
346 /*I32 len_min; unused */
347 /*I32 len_delta; unused */
351 SSize_t last_end; /* min value, <0 unless valid. */
352 SSize_t last_start_min;
353 SSize_t last_start_max;
354 SV **longest; /* Either &l_fixed, or &l_float. */
355 SV *longest_fixed; /* longest fixed string found in pattern */
356 SSize_t offset_fixed; /* offset where it starts */
357 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
359 SV *longest_float; /* longest floating string found in pattern */
360 SSize_t offset_float_min; /* earliest point in string it can appear */
361 SSize_t offset_float_max; /* latest point in string it can appear */
362 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
363 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
366 SSize_t *last_closep;
367 struct regnode_charclass_class *start_class;
370 /* The below is perhaps overboard, but this allows us to save a test at the
371 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
372 * and 'a' differ by a single bit; the same with the upper and lower case of
373 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
374 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
375 * then inverts it to form a mask, with just a single 0, in the bit position
376 * where the upper- and lowercase differ. XXX There are about 40 other
377 * instances in the Perl core where this micro-optimization could be used.
378 * Should decide if maintenance cost is worse, before changing those
380 * Returns a boolean as to whether or not 'v' is either a lowercase or
381 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
382 * compile-time constant, the generated code is better than some optimizing
383 * compilers figure out, amounting to a mask and test. The results are
384 * meaningless if 'c' is not one of [A-Za-z] */
385 #define isARG2_lower_or_UPPER_ARG1(c, v) \
386 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
389 * Forward declarations for pregcomp()'s friends.
392 static const scan_data_t zero_scan_data =
393 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
395 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
396 #define SF_BEFORE_SEOL 0x0001
397 #define SF_BEFORE_MEOL 0x0002
398 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
399 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
402 # define SF_FIX_SHIFT_EOL (0+2)
403 # define SF_FL_SHIFT_EOL (0+4)
405 # define SF_FIX_SHIFT_EOL (+2)
406 # define SF_FL_SHIFT_EOL (+4)
409 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
410 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
412 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
413 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
414 #define SF_IS_INF 0x0040
415 #define SF_HAS_PAR 0x0080
416 #define SF_IN_PAR 0x0100
417 #define SF_HAS_EVAL 0x0200
418 #define SCF_DO_SUBSTR 0x0400
419 #define SCF_DO_STCLASS_AND 0x0800
420 #define SCF_DO_STCLASS_OR 0x1000
421 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
422 #define SCF_WHILEM_VISITED_POS 0x2000
424 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
425 #define SCF_SEEN_ACCEPT 0x8000
426 #define SCF_TRIE_DOING_RESTUDY 0x10000
428 #define UTF cBOOL(RExC_utf8)
430 /* The enums for all these are ordered so things work out correctly */
431 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
432 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
433 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
434 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
435 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
436 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
437 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
439 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
441 #define OOB_NAMEDCLASS -1
443 /* There is no code point that is out-of-bounds, so this is problematic. But
444 * its only current use is to initialize a variable that is always set before
446 #define OOB_UNICODE 0xDEADBEEF
448 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
449 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
452 /* length of regex to show in messages that don't mark a position within */
453 #define RegexLengthToShowInErrorMessages 127
456 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
457 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
458 * op/pragma/warn/regcomp.
460 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
461 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
463 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
465 #define REPORT_LOCATION_ARGS(offset) \
466 UTF8fARG(UTF, offset, RExC_precomp), \
467 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
470 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
471 * arg. Show regex, up to a maximum length. If it's too long, chop and add
474 #define _FAIL(code) STMT_START { \
475 const char *ellipses = ""; \
476 IV len = RExC_end - RExC_precomp; \
479 SAVEFREESV(RExC_rx_sv); \
480 if (len > RegexLengthToShowInErrorMessages) { \
481 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
482 len = RegexLengthToShowInErrorMessages - 10; \
488 #define FAIL(msg) _FAIL( \
489 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
490 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
492 #define FAIL2(msg,arg) _FAIL( \
493 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
494 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
497 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
499 #define Simple_vFAIL(m) STMT_START { \
500 const IV offset = RExC_parse - RExC_precomp; \
501 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
502 m, REPORT_LOCATION_ARGS(offset)); \
506 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
508 #define vFAIL(m) STMT_START { \
510 SAVEFREESV(RExC_rx_sv); \
515 * Like Simple_vFAIL(), but accepts two arguments.
517 #define Simple_vFAIL2(m,a1) STMT_START { \
518 const IV offset = RExC_parse - RExC_precomp; \
519 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
520 REPORT_LOCATION_ARGS(offset)); \
524 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
526 #define vFAIL2(m,a1) STMT_START { \
528 SAVEFREESV(RExC_rx_sv); \
529 Simple_vFAIL2(m, a1); \
534 * Like Simple_vFAIL(), but accepts three arguments.
536 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
537 const IV offset = RExC_parse - RExC_precomp; \
538 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
539 REPORT_LOCATION_ARGS(offset)); \
543 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
545 #define vFAIL3(m,a1,a2) STMT_START { \
547 SAVEFREESV(RExC_rx_sv); \
548 Simple_vFAIL3(m, a1, a2); \
552 * Like Simple_vFAIL(), but accepts four arguments.
554 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
555 const IV offset = RExC_parse - RExC_precomp; \
556 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
557 REPORT_LOCATION_ARGS(offset)); \
560 #define vFAIL4(m,a1,a2,a3) STMT_START { \
562 SAVEFREESV(RExC_rx_sv); \
563 Simple_vFAIL4(m, a1, a2, a3); \
566 /* A specialized version of vFAIL2 that works with UTF8f */
567 #define vFAIL2utf8f(m, a1) STMT_START { \
568 const IV offset = RExC_parse - RExC_precomp; \
570 SAVEFREESV(RExC_rx_sv); \
571 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
572 REPORT_LOCATION_ARGS(offset)); \
576 /* m is not necessarily a "literal string", in this macro */
577 #define reg_warn_non_literal_string(loc, m) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
580 m, REPORT_LOCATION_ARGS(offset)); \
583 #define ckWARNreg(loc,m) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 REPORT_LOCATION_ARGS(offset)); \
589 #define vWARN_dep(loc, m) STMT_START { \
590 const IV offset = loc - RExC_precomp; \
591 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
592 REPORT_LOCATION_ARGS(offset)); \
595 #define ckWARNdep(loc,m) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
599 REPORT_LOCATION_ARGS(offset)); \
602 #define ckWARNregdep(loc,m) STMT_START { \
603 const IV offset = loc - RExC_precomp; \
604 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
606 REPORT_LOCATION_ARGS(offset)); \
609 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
610 const IV offset = loc - RExC_precomp; \
611 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
613 a1, REPORT_LOCATION_ARGS(offset)); \
616 #define ckWARN2reg(loc, m, a1) STMT_START { \
617 const IV offset = loc - RExC_precomp; \
618 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
619 a1, REPORT_LOCATION_ARGS(offset)); \
622 #define vWARN3(loc, m, a1, a2) STMT_START { \
623 const IV offset = loc - RExC_precomp; \
624 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
625 a1, a2, REPORT_LOCATION_ARGS(offset)); \
628 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
629 const IV offset = loc - RExC_precomp; \
630 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
631 a1, a2, REPORT_LOCATION_ARGS(offset)); \
634 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
635 const IV offset = loc - RExC_precomp; \
636 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
637 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
640 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
641 const IV offset = loc - RExC_precomp; \
642 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
643 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
646 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
647 const IV offset = loc - RExC_precomp; \
648 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
649 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
653 /* Allow for side effects in s */
654 #define REGC(c,s) STMT_START { \
655 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
658 /* Macros for recording node offsets. 20001227 mjd@plover.com
659 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
660 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
661 * Element 0 holds the number n.
662 * Position is 1 indexed.
664 #ifndef RE_TRACK_PATTERN_OFFSETS
665 #define Set_Node_Offset_To_R(node,byte)
666 #define Set_Node_Offset(node,byte)
667 #define Set_Cur_Node_Offset
668 #define Set_Node_Length_To_R(node,len)
669 #define Set_Node_Length(node,len)
670 #define Set_Node_Cur_Length(node,start)
671 #define Node_Offset(n)
672 #define Node_Length(n)
673 #define Set_Node_Offset_Length(node,offset,len)
674 #define ProgLen(ri) ri->u.proglen
675 #define SetProgLen(ri,x) ri->u.proglen = x
677 #define ProgLen(ri) ri->u.offsets[0]
678 #define SetProgLen(ri,x) ri->u.offsets[0] = x
679 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
681 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
682 __LINE__, (int)(node), (int)(byte))); \
684 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
686 RExC_offsets[2*(node)-1] = (byte); \
691 #define Set_Node_Offset(node,byte) \
692 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
693 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
695 #define Set_Node_Length_To_R(node,len) STMT_START { \
697 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
698 __LINE__, (int)(node), (int)(len))); \
700 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
702 RExC_offsets[2*(node)] = (len); \
707 #define Set_Node_Length(node,len) \
708 Set_Node_Length_To_R((node)-RExC_emit_start, len)
709 #define Set_Node_Cur_Length(node, start) \
710 Set_Node_Length(node, RExC_parse - start)
712 /* Get offsets and lengths */
713 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
714 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
716 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
717 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
718 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
722 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
723 #define EXPERIMENTAL_INPLACESCAN
724 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
726 #define DEBUG_STUDYDATA(str,data,depth) \
727 DEBUG_OPTIMISE_MORE_r(if(data){ \
728 PerlIO_printf(Perl_debug_log, \
729 "%*s" str "Pos:%"IVdf"/%"IVdf \
730 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
731 (int)(depth)*2, "", \
732 (IV)((data)->pos_min), \
733 (IV)((data)->pos_delta), \
734 (UV)((data)->flags), \
735 (IV)((data)->whilem_c), \
736 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
737 is_inf ? "INF " : "" \
739 if ((data)->last_found) \
740 PerlIO_printf(Perl_debug_log, \
741 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
742 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
743 SvPVX_const((data)->last_found), \
744 (IV)((data)->last_end), \
745 (IV)((data)->last_start_min), \
746 (IV)((data)->last_start_max), \
747 ((data)->longest && \
748 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
749 SvPVX_const((data)->longest_fixed), \
750 (IV)((data)->offset_fixed), \
751 ((data)->longest && \
752 (data)->longest==&((data)->longest_float)) ? "*" : "", \
753 SvPVX_const((data)->longest_float), \
754 (IV)((data)->offset_float_min), \
755 (IV)((data)->offset_float_max) \
757 PerlIO_printf(Perl_debug_log,"\n"); \
760 /* Mark that we cannot extend a found fixed substring at this point.
761 Update the longest found anchored substring and the longest found
762 floating substrings if needed. */
765 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
766 SSize_t *minlenp, int is_inf)
768 const STRLEN l = CHR_SVLEN(data->last_found);
769 const STRLEN old_l = CHR_SVLEN(*data->longest);
770 GET_RE_DEBUG_FLAGS_DECL;
772 PERL_ARGS_ASSERT_SCAN_COMMIT;
774 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
775 SvSetMagicSV(*data->longest, data->last_found);
776 if (*data->longest == data->longest_fixed) {
777 data->offset_fixed = l ? data->last_start_min : data->pos_min;
778 if (data->flags & SF_BEFORE_EOL)
780 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
782 data->flags &= ~SF_FIX_BEFORE_EOL;
783 data->minlen_fixed=minlenp;
784 data->lookbehind_fixed=0;
786 else { /* *data->longest == data->longest_float */
787 data->offset_float_min = l ? data->last_start_min : data->pos_min;
788 data->offset_float_max = (l
789 ? data->last_start_max
790 : (data->pos_delta == SSize_t_MAX
792 : data->pos_min + data->pos_delta));
794 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
795 data->offset_float_max = SSize_t_MAX;
796 if (data->flags & SF_BEFORE_EOL)
798 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
800 data->flags &= ~SF_FL_BEFORE_EOL;
801 data->minlen_float=minlenp;
802 data->lookbehind_float=0;
805 SvCUR_set(data->last_found, 0);
807 SV * const sv = data->last_found;
808 if (SvUTF8(sv) && SvMAGICAL(sv)) {
809 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
815 data->flags &= ~SF_BEFORE_EOL;
816 DEBUG_STUDYDATA("commit: ",data,0);
819 /* These macros set, clear and test whether the synthetic start class ('ssc',
820 * given by the parameter) matches an empty string (EOS). This uses the
821 * 'next_off' field in the node, to save a bit in the flags field. The ssc
822 * stands alone, so there is never a next_off, so this field is otherwise
823 * unused. The EOS information is used only for compilation, but theoretically
824 * it could be passed on to the execution code. This could be used to store
825 * more than one bit of information, but only this one is currently used. */
826 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
827 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
828 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
830 /* Can match anything (initialization) */
832 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
834 PERL_ARGS_ASSERT_CL_ANYTHING;
836 ANYOF_BITMAP_SETALL(cl);
837 cl->flags = ANYOF_UNICODE_ALL;
840 /* If any portion of the regex is to operate under locale rules,
841 * initialization includes it. The reason this isn't done for all regexes
842 * is that the optimizer was written under the assumption that locale was
843 * all-or-nothing. Given the complexity and lack of documentation in the
844 * optimizer, and that there are inadequate test cases for locale, so many
845 * parts of it may not work properly, it is safest to avoid locale unless
847 if (RExC_contains_locale) {
848 ANYOF_CLASS_SETALL(cl); /* /l uses class */
849 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
852 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
856 /* Can match anything (initialization) */
858 S_cl_is_anything(const struct regnode_charclass_class *cl)
862 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
864 for (value = 0; value < ANYOF_MAX; value += 2)
865 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
867 if (!(cl->flags & ANYOF_UNICODE_ALL))
869 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
874 /* Can match anything (initialization) */
876 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
878 PERL_ARGS_ASSERT_CL_INIT;
880 Zero(cl, 1, struct regnode_charclass_class);
882 cl_anything(pRExC_state, cl);
883 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
886 /* These two functions currently do the exact same thing */
887 #define cl_init_zero cl_init
889 /* 'AND' a given class with another one. Can create false positives. 'cl'
890 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
891 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
893 S_cl_and(struct regnode_charclass_class *cl,
894 const struct regnode_charclass_class *and_with)
896 PERL_ARGS_ASSERT_CL_AND;
898 assert(PL_regkind[and_with->type] == ANYOF);
900 /* I (khw) am not sure all these restrictions are necessary XXX */
901 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
902 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
903 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
904 && !(and_with->flags & ANYOF_LOC_FOLD)
905 && !(cl->flags & ANYOF_LOC_FOLD)) {
908 if (and_with->flags & ANYOF_INVERT)
909 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
910 cl->bitmap[i] &= ~and_with->bitmap[i];
912 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
913 cl->bitmap[i] &= and_with->bitmap[i];
914 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
916 if (and_with->flags & ANYOF_INVERT) {
918 /* Here, the and'ed node is inverted. Get the AND of the flags that
919 * aren't affected by the inversion. Those that are affected are
920 * handled individually below */
921 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
922 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
923 cl->flags |= affected_flags;
925 /* We currently don't know how to deal with things that aren't in the
926 * bitmap, but we know that the intersection is no greater than what
927 * is already in cl, so let there be false positives that get sorted
928 * out after the synthetic start class succeeds, and the node is
929 * matched for real. */
931 /* The inversion of these two flags indicate that the resulting
932 * intersection doesn't have them */
933 if (and_with->flags & ANYOF_UNICODE_ALL) {
934 cl->flags &= ~ANYOF_UNICODE_ALL;
936 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
937 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
940 else { /* and'd node is not inverted */
941 U8 outside_bitmap_but_not_utf8; /* Temp variable */
943 if (! ANYOF_NONBITMAP(and_with)) {
945 /* Here 'and_with' doesn't match anything outside the bitmap
946 * (except possibly ANYOF_UNICODE_ALL), which means the
947 * intersection can't either, except for ANYOF_UNICODE_ALL, in
948 * which case we don't know what the intersection is, but it's no
949 * greater than what cl already has, so can just leave it alone,
950 * with possible false positives */
951 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
952 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
953 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
956 else if (! ANYOF_NONBITMAP(cl)) {
958 /* Here, 'and_with' does match something outside the bitmap, and cl
959 * doesn't have a list of things to match outside the bitmap. If
960 * cl can match all code points above 255, the intersection will
961 * be those above-255 code points that 'and_with' matches. If cl
962 * can't match all Unicode code points, it means that it can't
963 * match anything outside the bitmap (since the 'if' that got us
964 * into this block tested for that), so we leave the bitmap empty.
966 if (cl->flags & ANYOF_UNICODE_ALL) {
967 ARG_SET(cl, ARG(and_with));
969 /* and_with's ARG may match things that don't require UTF8.
970 * And now cl's will too, in spite of this being an 'and'. See
971 * the comments below about the kludge */
972 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
976 /* Here, both 'and_with' and cl match something outside the
977 * bitmap. Currently we do not do the intersection, so just match
978 * whatever cl had at the beginning. */
982 /* Take the intersection of the two sets of flags. However, the
983 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
984 * kludge around the fact that this flag is not treated like the others
985 * which are initialized in cl_anything(). The way the optimizer works
986 * is that the synthetic start class (SSC) is initialized to match
987 * anything, and then the first time a real node is encountered, its
988 * values are AND'd with the SSC's with the result being the values of
989 * the real node. However, there are paths through the optimizer where
990 * the AND never gets called, so those initialized bits are set
991 * inappropriately, which is not usually a big deal, as they just cause
992 * false positives in the SSC, which will just mean a probably
993 * imperceptible slow down in execution. However this bit has a
994 * higher false positive consequence in that it can cause utf8.pm,
995 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
996 * bigger slowdown and also causes significant extra memory to be used.
997 * In order to prevent this, the code now takes a different tack. The
998 * bit isn't set unless some part of the regular expression needs it,
999 * but once set it won't get cleared. This means that these extra
1000 * modules won't get loaded unless there was some path through the
1001 * pattern that would have required them anyway, and so any false
1002 * positives that occur by not ANDing them out when they could be
1003 * aren't as severe as they would be if we treated this bit like all
1005 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
1006 & ANYOF_NONBITMAP_NON_UTF8;
1007 cl->flags &= and_with->flags;
1008 cl->flags |= outside_bitmap_but_not_utf8;
1012 /* 'OR' a given class with another one. Can create false positives. 'cl'
1013 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
1014 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1016 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1018 PERL_ARGS_ASSERT_CL_OR;
1020 if (or_with->flags & ANYOF_INVERT) {
1022 /* Here, the or'd node is to be inverted. This means we take the
1023 * complement of everything not in the bitmap, but currently we don't
1024 * know what that is, so give up and match anything */
1025 if (ANYOF_NONBITMAP(or_with)) {
1026 cl_anything(pRExC_state, cl);
1029 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1030 * <= (B1 | !B2) | (CL1 | !CL2)
1031 * which is wasteful if CL2 is small, but we ignore CL2:
1032 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1033 * XXXX Can we handle case-fold? Unclear:
1034 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1035 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1037 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1038 && !(or_with->flags & ANYOF_LOC_FOLD)
1039 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1042 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1043 cl->bitmap[i] |= ~or_with->bitmap[i];
1044 } /* XXXX: logic is complicated otherwise */
1046 cl_anything(pRExC_state, cl);
1049 /* And, we can just take the union of the flags that aren't affected
1050 * by the inversion */
1051 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1053 /* For the remaining flags:
1054 ANYOF_UNICODE_ALL and inverted means to not match anything above
1055 255, which means that the union with cl should just be
1056 what cl has in it, so can ignore this flag
1057 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1058 is (ASCII) 127-255 to match them, but then invert that, so
1059 the union with cl should just be what cl has in it, so can
1062 } else { /* 'or_with' is not inverted */
1063 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1064 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1065 && (!(or_with->flags & ANYOF_LOC_FOLD)
1066 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1069 /* OR char bitmap and class bitmap separately */
1070 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1071 cl->bitmap[i] |= or_with->bitmap[i];
1072 if (or_with->flags & ANYOF_CLASS) {
1073 ANYOF_CLASS_OR(or_with, cl);
1076 else { /* XXXX: logic is complicated, leave it along for a moment. */
1077 cl_anything(pRExC_state, cl);
1080 if (ANYOF_NONBITMAP(or_with)) {
1082 /* Use the added node's outside-the-bit-map match if there isn't a
1083 * conflict. If there is a conflict (both nodes match something
1084 * outside the bitmap, but what they match outside is not the same
1085 * pointer, and hence not easily compared until XXX we extend
1086 * inversion lists this far), give up and allow the start class to
1087 * match everything outside the bitmap. If that stuff is all above
1088 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1089 if (! ANYOF_NONBITMAP(cl)) {
1090 ARG_SET(cl, ARG(or_with));
1092 else if (ARG(cl) != ARG(or_with)) {
1094 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1095 cl_anything(pRExC_state, cl);
1098 cl->flags |= ANYOF_UNICODE_ALL;
1103 /* Take the union */
1104 cl->flags |= or_with->flags;
1108 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1109 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1110 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1111 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1116 dump_trie(trie,widecharmap,revcharmap)
1117 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1118 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1120 These routines dump out a trie in a somewhat readable format.
1121 The _interim_ variants are used for debugging the interim
1122 tables that are used to generate the final compressed
1123 representation which is what dump_trie expects.
1125 Part of the reason for their existence is to provide a form
1126 of documentation as to how the different representations function.
1131 Dumps the final compressed table form of the trie to Perl_debug_log.
1132 Used for debugging make_trie().
1136 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1137 AV *revcharmap, U32 depth)
1140 SV *sv=sv_newmortal();
1141 int colwidth= widecharmap ? 6 : 4;
1143 GET_RE_DEBUG_FLAGS_DECL;
1145 PERL_ARGS_ASSERT_DUMP_TRIE;
1147 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1148 (int)depth * 2 + 2,"",
1149 "Match","Base","Ofs" );
1151 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1152 SV ** const tmp = av_fetch( revcharmap, state, 0);
1154 PerlIO_printf( Perl_debug_log, "%*s",
1156 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1157 PL_colors[0], PL_colors[1],
1158 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1159 PERL_PV_ESCAPE_FIRSTCHAR
1164 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1165 (int)depth * 2 + 2,"");
1167 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1168 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1169 PerlIO_printf( Perl_debug_log, "\n");
1171 for( state = 1 ; state < trie->statecount ; state++ ) {
1172 const U32 base = trie->states[ state ].trans.base;
1174 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1176 if ( trie->states[ state ].wordnum ) {
1177 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1179 PerlIO_printf( Perl_debug_log, "%6s", "" );
1182 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1187 while( ( base + ofs < trie->uniquecharcount ) ||
1188 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1189 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1192 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1194 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1195 if ( ( base + ofs >= trie->uniquecharcount ) &&
1196 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1197 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1199 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1201 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1203 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1207 PerlIO_printf( Perl_debug_log, "]");
1210 PerlIO_printf( Perl_debug_log, "\n" );
1212 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1213 for (word=1; word <= trie->wordcount; word++) {
1214 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1215 (int)word, (int)(trie->wordinfo[word].prev),
1216 (int)(trie->wordinfo[word].len));
1218 PerlIO_printf(Perl_debug_log, "\n" );
1221 Dumps a fully constructed but uncompressed trie in list form.
1222 List tries normally only are used for construction when the number of
1223 possible chars (trie->uniquecharcount) is very high.
1224 Used for debugging make_trie().
1227 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1228 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1232 SV *sv=sv_newmortal();
1233 int colwidth= widecharmap ? 6 : 4;
1234 GET_RE_DEBUG_FLAGS_DECL;
1236 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1238 /* print out the table precompression. */
1239 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1240 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1241 "------:-----+-----------------\n" );
1243 for( state=1 ; state < next_alloc ; state ++ ) {
1246 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1247 (int)depth * 2 + 2,"", (UV)state );
1248 if ( ! trie->states[ state ].wordnum ) {
1249 PerlIO_printf( Perl_debug_log, "%5s| ","");
1251 PerlIO_printf( Perl_debug_log, "W%4x| ",
1252 trie->states[ state ].wordnum
1255 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1256 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1258 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1260 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1261 PL_colors[0], PL_colors[1],
1262 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1263 PERL_PV_ESCAPE_FIRSTCHAR
1265 TRIE_LIST_ITEM(state,charid).forid,
1266 (UV)TRIE_LIST_ITEM(state,charid).newstate
1269 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1270 (int)((depth * 2) + 14), "");
1273 PerlIO_printf( Perl_debug_log, "\n");
1278 Dumps a fully constructed but uncompressed trie in table form.
1279 This is the normal DFA style state transition table, with a few
1280 twists to facilitate compression later.
1281 Used for debugging make_trie().
1284 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1285 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1290 SV *sv=sv_newmortal();
1291 int colwidth= widecharmap ? 6 : 4;
1292 GET_RE_DEBUG_FLAGS_DECL;
1294 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1297 print out the table precompression so that we can do a visual check
1298 that they are identical.
1301 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1303 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1304 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1306 PerlIO_printf( Perl_debug_log, "%*s",
1308 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1309 PL_colors[0], PL_colors[1],
1310 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1311 PERL_PV_ESCAPE_FIRSTCHAR
1317 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1319 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1320 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1323 PerlIO_printf( Perl_debug_log, "\n" );
1325 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1327 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1328 (int)depth * 2 + 2,"",
1329 (UV)TRIE_NODENUM( state ) );
1331 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1332 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1334 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1336 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1338 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1339 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1341 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1342 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1350 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1351 startbranch: the first branch in the whole branch sequence
1352 first : start branch of sequence of branch-exact nodes.
1353 May be the same as startbranch
1354 last : Thing following the last branch.
1355 May be the same as tail.
1356 tail : item following the branch sequence
1357 count : words in the sequence
1358 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1359 depth : indent depth
1361 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1363 A trie is an N'ary tree where the branches are determined by digital
1364 decomposition of the key. IE, at the root node you look up the 1st character and
1365 follow that branch repeat until you find the end of the branches. Nodes can be
1366 marked as "accepting" meaning they represent a complete word. Eg:
1370 would convert into the following structure. Numbers represent states, letters
1371 following numbers represent valid transitions on the letter from that state, if
1372 the number is in square brackets it represents an accepting state, otherwise it
1373 will be in parenthesis.
1375 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1379 (1) +-i->(6)-+-s->[7]
1381 +-s->(3)-+-h->(4)-+-e->[5]
1383 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1385 This shows that when matching against the string 'hers' we will begin at state 1
1386 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1387 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1388 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1389 single traverse. We store a mapping from accepting to state to which word was
1390 matched, and then when we have multiple possibilities we try to complete the
1391 rest of the regex in the order in which they occured in the alternation.
1393 The only prior NFA like behaviour that would be changed by the TRIE support is
1394 the silent ignoring of duplicate alternations which are of the form:
1396 / (DUPE|DUPE) X? (?{ ... }) Y /x
1398 Thus EVAL blocks following a trie may be called a different number of times with
1399 and without the optimisation. With the optimisations dupes will be silently
1400 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1401 the following demonstrates:
1403 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1405 which prints out 'word' three times, but
1407 'words'=~/(word|word|word)(?{ print $1 })S/
1409 which doesnt print it out at all. This is due to other optimisations kicking in.
1411 Example of what happens on a structural level:
1413 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1415 1: CURLYM[1] {1,32767}(18)
1426 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1427 and should turn into:
1429 1: CURLYM[1] {1,32767}(18)
1431 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1439 Cases where tail != last would be like /(?foo|bar)baz/:
1449 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1450 and would end up looking like:
1453 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1460 d = uvchr_to_utf8_flags(d, uv, 0);
1462 is the recommended Unicode-aware way of saying
1467 #define TRIE_STORE_REVCHAR(val) \
1470 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1471 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1472 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1473 SvCUR_set(zlopp, kapow - flrbbbbb); \
1476 av_push(revcharmap, zlopp); \
1478 char ooooff = (char)val; \
1479 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1483 /* This gets the next character from the input, folding it if not already
1485 #define TRIE_READ_CHAR STMT_START { \
1488 /* if it is UTF then it is either already folded, or does not need \
1490 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1492 else if (folder == PL_fold_latin1) { \
1493 /* This folder implies Unicode rules, which in the range expressible \
1494 * by not UTF is the lower case, with the two exceptions, one of \
1495 * which should have been taken care of before calling this */ \
1496 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1497 uvc = toLOWER_L1(*uc); \
1498 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1501 /* raw data, will be folded later if needed */ \
1509 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1510 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1511 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1512 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1514 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1515 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1516 TRIE_LIST_CUR( state )++; \
1519 #define TRIE_LIST_NEW(state) STMT_START { \
1520 Newxz( trie->states[ state ].trans.list, \
1521 4, reg_trie_trans_le ); \
1522 TRIE_LIST_CUR( state ) = 1; \
1523 TRIE_LIST_LEN( state ) = 4; \
1526 #define TRIE_HANDLE_WORD(state) STMT_START { \
1527 U16 dupe= trie->states[ state ].wordnum; \
1528 regnode * const noper_next = regnext( noper ); \
1531 /* store the word for dumping */ \
1533 if (OP(noper) != NOTHING) \
1534 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1536 tmp = newSVpvn_utf8( "", 0, UTF ); \
1537 av_push( trie_words, tmp ); \
1541 trie->wordinfo[curword].prev = 0; \
1542 trie->wordinfo[curword].len = wordlen; \
1543 trie->wordinfo[curword].accept = state; \
1545 if ( noper_next < tail ) { \
1547 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1548 trie->jump[curword] = (U16)(noper_next - convert); \
1550 jumper = noper_next; \
1552 nextbranch= regnext(cur); \
1556 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1557 /* chain, so that when the bits of chain are later */\
1558 /* linked together, the dups appear in the chain */\
1559 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1560 trie->wordinfo[dupe].prev = curword; \
1562 /* we haven't inserted this word yet. */ \
1563 trie->states[ state ].wordnum = curword; \
1568 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1569 ( ( base + charid >= ucharcount \
1570 && base + charid < ubound \
1571 && state == trie->trans[ base - ucharcount + charid ].check \
1572 && trie->trans[ base - ucharcount + charid ].next ) \
1573 ? trie->trans[ base - ucharcount + charid ].next \
1574 : ( state==1 ? special : 0 ) \
1578 #define MADE_JUMP_TRIE 2
1579 #define MADE_EXACT_TRIE 4
1582 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1585 /* first pass, loop through and scan words */
1586 reg_trie_data *trie;
1587 HV *widecharmap = NULL;
1588 AV *revcharmap = newAV();
1594 regnode *jumper = NULL;
1595 regnode *nextbranch = NULL;
1596 regnode *convert = NULL;
1597 U32 *prev_states; /* temp array mapping each state to previous one */
1598 /* we just use folder as a flag in utf8 */
1599 const U8 * folder = NULL;
1602 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1603 AV *trie_words = NULL;
1604 /* along with revcharmap, this only used during construction but both are
1605 * useful during debugging so we store them in the struct when debugging.
1608 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1609 STRLEN trie_charcount=0;
1611 SV *re_trie_maxbuff;
1612 GET_RE_DEBUG_FLAGS_DECL;
1614 PERL_ARGS_ASSERT_MAKE_TRIE;
1616 PERL_UNUSED_ARG(depth);
1623 case EXACTFU: folder = PL_fold_latin1; break;
1624 case EXACTF: folder = PL_fold; break;
1625 case EXACTFL: folder = PL_fold_locale; break;
1626 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1629 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1631 trie->startstate = 1;
1632 trie->wordcount = word_count;
1633 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1634 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1636 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1637 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1638 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1641 trie_words = newAV();
1644 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1645 if (!SvIOK(re_trie_maxbuff)) {
1646 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1648 DEBUG_TRIE_COMPILE_r({
1649 PerlIO_printf( Perl_debug_log,
1650 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1651 (int)depth * 2 + 2, "",
1652 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1653 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1657 /* Find the node we are going to overwrite */
1658 if ( first == startbranch && OP( last ) != BRANCH ) {
1659 /* whole branch chain */
1662 /* branch sub-chain */
1663 convert = NEXTOPER( first );
1666 /* -- First loop and Setup --
1668 We first traverse the branches and scan each word to determine if it
1669 contains widechars, and how many unique chars there are, this is
1670 important as we have to build a table with at least as many columns as we
1673 We use an array of integers to represent the character codes 0..255
1674 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1675 native representation of the character value as the key and IV's for the
1678 *TODO* If we keep track of how many times each character is used we can
1679 remap the columns so that the table compression later on is more
1680 efficient in terms of memory by ensuring the most common value is in the
1681 middle and the least common are on the outside. IMO this would be better
1682 than a most to least common mapping as theres a decent chance the most
1683 common letter will share a node with the least common, meaning the node
1684 will not be compressible. With a middle is most common approach the worst
1685 case is when we have the least common nodes twice.
1689 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1690 regnode *noper = NEXTOPER( cur );
1691 const U8 *uc = (U8*)STRING( noper );
1692 const U8 *e = uc + STR_LEN( noper );
1694 U32 wordlen = 0; /* required init */
1695 STRLEN minbytes = 0;
1696 STRLEN maxbytes = 0;
1697 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1699 if (OP(noper) == NOTHING) {
1700 regnode *noper_next= regnext(noper);
1701 if (noper_next != tail && OP(noper_next) == flags) {
1703 uc= (U8*)STRING(noper);
1704 e= uc + STR_LEN(noper);
1705 trie->minlen= STR_LEN(noper);
1712 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1713 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1714 regardless of encoding */
1715 if (OP( noper ) == EXACTFU_SS) {
1716 /* false positives are ok, so just set this */
1717 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1720 for ( ; uc < e ; uc += len ) {
1721 TRIE_CHARCOUNT(trie)++;
1724 /* Acummulate to the current values, the range in the number of
1725 * bytes that this character could match. The max is presumed to
1726 * be the same as the folded input (which TRIE_READ_CHAR returns),
1727 * except that when this is not in UTF-8, it could be matched
1728 * against a string which is UTF-8, and the variant characters
1729 * could be 2 bytes instead of the 1 here. Likewise, for the
1730 * minimum number of bytes when not folded. When folding, the min
1731 * is assumed to be 1 byte could fold to match the single character
1732 * here, or in the case of a multi-char fold, 1 byte can fold to
1733 * the whole sequence. 'foldlen' is used to denote whether we are
1734 * in such a sequence, skipping the min setting if so. XXX TODO
1735 * Use the exact list of what folds to each character, from
1736 * PL_utf8_foldclosures */
1738 maxbytes += UTF8SKIP(uc);
1740 /* A non-UTF-8 string could be 1 byte to match our 2 */
1741 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1747 foldlen -= UTF8SKIP(uc);
1750 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1756 maxbytes += (UNI_IS_INVARIANT(*uc))
1767 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1774 U8 folded= folder[ (U8) uvc ];
1775 if ( !trie->charmap[ folded ] ) {
1776 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1777 TRIE_STORE_REVCHAR( folded );
1780 if ( !trie->charmap[ uvc ] ) {
1781 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1782 TRIE_STORE_REVCHAR( uvc );
1785 /* store the codepoint in the bitmap, and its folded
1787 TRIE_BITMAP_SET(trie, uvc);
1789 /* store the folded codepoint */
1790 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1793 /* store first byte of utf8 representation of
1794 variant codepoints */
1795 if (! UVCHR_IS_INVARIANT(uvc)) {
1796 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1799 set_bit = 0; /* We've done our bit :-) */
1804 widecharmap = newHV();
1806 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1809 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1811 if ( !SvTRUE( *svpp ) ) {
1812 sv_setiv( *svpp, ++trie->uniquecharcount );
1813 TRIE_STORE_REVCHAR(uvc);
1817 if( cur == first ) {
1818 trie->minlen = minbytes;
1819 trie->maxlen = maxbytes;
1820 } else if (minbytes < trie->minlen) {
1821 trie->minlen = minbytes;
1822 } else if (maxbytes > trie->maxlen) {
1823 trie->maxlen = maxbytes;
1825 } /* end first pass */
1826 DEBUG_TRIE_COMPILE_r(
1827 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1828 (int)depth * 2 + 2,"",
1829 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1830 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1831 (int)trie->minlen, (int)trie->maxlen )
1835 We now know what we are dealing with in terms of unique chars and
1836 string sizes so we can calculate how much memory a naive
1837 representation using a flat table will take. If it's over a reasonable
1838 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1839 conservative but potentially much slower representation using an array
1842 At the end we convert both representations into the same compressed
1843 form that will be used in regexec.c for matching with. The latter
1844 is a form that cannot be used to construct with but has memory
1845 properties similar to the list form and access properties similar
1846 to the table form making it both suitable for fast searches and
1847 small enough that its feasable to store for the duration of a program.
1849 See the comment in the code where the compressed table is produced
1850 inplace from the flat tabe representation for an explanation of how
1851 the compression works.
1856 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1859 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1861 Second Pass -- Array Of Lists Representation
1863 Each state will be represented by a list of charid:state records
1864 (reg_trie_trans_le) the first such element holds the CUR and LEN
1865 points of the allocated array. (See defines above).
1867 We build the initial structure using the lists, and then convert
1868 it into the compressed table form which allows faster lookups
1869 (but cant be modified once converted).
1872 STRLEN transcount = 1;
1874 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1875 "%*sCompiling trie using list compiler\n",
1876 (int)depth * 2 + 2, ""));
1878 trie->states = (reg_trie_state *)
1879 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1880 sizeof(reg_trie_state) );
1884 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1886 regnode *noper = NEXTOPER( cur );
1887 U8 *uc = (U8*)STRING( noper );
1888 const U8 *e = uc + STR_LEN( noper );
1889 U32 state = 1; /* required init */
1890 U16 charid = 0; /* sanity init */
1891 U32 wordlen = 0; /* required init */
1893 if (OP(noper) == NOTHING) {
1894 regnode *noper_next= regnext(noper);
1895 if (noper_next != tail && OP(noper_next) == flags) {
1897 uc= (U8*)STRING(noper);
1898 e= uc + STR_LEN(noper);
1902 if (OP(noper) != NOTHING) {
1903 for ( ; uc < e ; uc += len ) {
1908 charid = trie->charmap[ uvc ];
1910 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1914 charid=(U16)SvIV( *svpp );
1917 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1924 if ( !trie->states[ state ].trans.list ) {
1925 TRIE_LIST_NEW( state );
1927 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1928 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1929 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1934 newstate = next_alloc++;
1935 prev_states[newstate] = state;
1936 TRIE_LIST_PUSH( state, charid, newstate );
1941 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1945 TRIE_HANDLE_WORD(state);
1947 } /* end second pass */
1949 /* next alloc is the NEXT state to be allocated */
1950 trie->statecount = next_alloc;
1951 trie->states = (reg_trie_state *)
1952 PerlMemShared_realloc( trie->states,
1954 * sizeof(reg_trie_state) );
1956 /* and now dump it out before we compress it */
1957 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1958 revcharmap, next_alloc,
1962 trie->trans = (reg_trie_trans *)
1963 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1970 for( state=1 ; state < next_alloc ; state ++ ) {
1974 DEBUG_TRIE_COMPILE_MORE_r(
1975 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1979 if (trie->states[state].trans.list) {
1980 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1984 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1985 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1986 if ( forid < minid ) {
1988 } else if ( forid > maxid ) {
1992 if ( transcount < tp + maxid - minid + 1) {
1994 trie->trans = (reg_trie_trans *)
1995 PerlMemShared_realloc( trie->trans,
1997 * sizeof(reg_trie_trans) );
1998 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2000 base = trie->uniquecharcount + tp - minid;
2001 if ( maxid == minid ) {
2003 for ( ; zp < tp ; zp++ ) {
2004 if ( ! trie->trans[ zp ].next ) {
2005 base = trie->uniquecharcount + zp - minid;
2006 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2007 trie->trans[ zp ].check = state;
2013 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2014 trie->trans[ tp ].check = state;
2019 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2020 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2021 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2022 trie->trans[ tid ].check = state;
2024 tp += ( maxid - minid + 1 );
2026 Safefree(trie->states[ state ].trans.list);
2029 DEBUG_TRIE_COMPILE_MORE_r(
2030 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2033 trie->states[ state ].trans.base=base;
2035 trie->lasttrans = tp + 1;
2039 Second Pass -- Flat Table Representation.
2041 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2042 We know that we will need Charcount+1 trans at most to store the data
2043 (one row per char at worst case) So we preallocate both structures
2044 assuming worst case.
2046 We then construct the trie using only the .next slots of the entry
2049 We use the .check field of the first entry of the node temporarily to
2050 make compression both faster and easier by keeping track of how many non
2051 zero fields are in the node.
2053 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2056 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2057 number representing the first entry of the node, and state as a
2058 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2059 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2060 are 2 entrys per node. eg:
2068 The table is internally in the right hand, idx form. However as we also
2069 have to deal with the states array which is indexed by nodenum we have to
2070 use TRIE_NODENUM() to convert.
2073 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2074 "%*sCompiling trie using table compiler\n",
2075 (int)depth * 2 + 2, ""));
2077 trie->trans = (reg_trie_trans *)
2078 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2079 * trie->uniquecharcount + 1,
2080 sizeof(reg_trie_trans) );
2081 trie->states = (reg_trie_state *)
2082 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2083 sizeof(reg_trie_state) );
2084 next_alloc = trie->uniquecharcount + 1;
2087 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2089 regnode *noper = NEXTOPER( cur );
2090 const U8 *uc = (U8*)STRING( noper );
2091 const U8 *e = uc + STR_LEN( noper );
2093 U32 state = 1; /* required init */
2095 U16 charid = 0; /* sanity init */
2096 U32 accept_state = 0; /* sanity init */
2098 U32 wordlen = 0; /* required init */
2100 if (OP(noper) == NOTHING) {
2101 regnode *noper_next= regnext(noper);
2102 if (noper_next != tail && OP(noper_next) == flags) {
2104 uc= (U8*)STRING(noper);
2105 e= uc + STR_LEN(noper);
2109 if ( OP(noper) != NOTHING ) {
2110 for ( ; uc < e ; uc += len ) {
2115 charid = trie->charmap[ uvc ];
2117 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2118 charid = svpp ? (U16)SvIV(*svpp) : 0;
2122 if ( !trie->trans[ state + charid ].next ) {
2123 trie->trans[ state + charid ].next = next_alloc;
2124 trie->trans[ state ].check++;
2125 prev_states[TRIE_NODENUM(next_alloc)]
2126 = TRIE_NODENUM(state);
2127 next_alloc += trie->uniquecharcount;
2129 state = trie->trans[ state + charid ].next;
2131 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2133 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2136 accept_state = TRIE_NODENUM( state );
2137 TRIE_HANDLE_WORD(accept_state);
2139 } /* end second pass */
2141 /* and now dump it out before we compress it */
2142 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2144 next_alloc, depth+1));
2148 * Inplace compress the table.*
2150 For sparse data sets the table constructed by the trie algorithm will
2151 be mostly 0/FAIL transitions or to put it another way mostly empty.
2152 (Note that leaf nodes will not contain any transitions.)
2154 This algorithm compresses the tables by eliminating most such
2155 transitions, at the cost of a modest bit of extra work during lookup:
2157 - Each states[] entry contains a .base field which indicates the
2158 index in the state[] array wheres its transition data is stored.
2160 - If .base is 0 there are no valid transitions from that node.
2162 - If .base is nonzero then charid is added to it to find an entry in
2165 -If trans[states[state].base+charid].check!=state then the
2166 transition is taken to be a 0/Fail transition. Thus if there are fail
2167 transitions at the front of the node then the .base offset will point
2168 somewhere inside the previous nodes data (or maybe even into a node
2169 even earlier), but the .check field determines if the transition is
2173 The following process inplace converts the table to the compressed
2174 table: We first do not compress the root node 1,and mark all its
2175 .check pointers as 1 and set its .base pointer as 1 as well. This
2176 allows us to do a DFA construction from the compressed table later,
2177 and ensures that any .base pointers we calculate later are greater
2180 - We set 'pos' to indicate the first entry of the second node.
2182 - We then iterate over the columns of the node, finding the first and
2183 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2184 and set the .check pointers accordingly, and advance pos
2185 appropriately and repreat for the next node. Note that when we copy
2186 the next pointers we have to convert them from the original
2187 NODEIDX form to NODENUM form as the former is not valid post
2190 - If a node has no transitions used we mark its base as 0 and do not
2191 advance the pos pointer.
2193 - If a node only has one transition we use a second pointer into the
2194 structure to fill in allocated fail transitions from other states.
2195 This pointer is independent of the main pointer and scans forward
2196 looking for null transitions that are allocated to a state. When it
2197 finds one it writes the single transition into the "hole". If the
2198 pointer doesnt find one the single transition is appended as normal.
2200 - Once compressed we can Renew/realloc the structures to release the
2203 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2204 specifically Fig 3.47 and the associated pseudocode.
2208 const U32 laststate = TRIE_NODENUM( next_alloc );
2211 trie->statecount = laststate;
2213 for ( state = 1 ; state < laststate ; state++ ) {
2215 const U32 stateidx = TRIE_NODEIDX( state );
2216 const U32 o_used = trie->trans[ stateidx ].check;
2217 U32 used = trie->trans[ stateidx ].check;
2218 trie->trans[ stateidx ].check = 0;
2220 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2221 if ( flag || trie->trans[ stateidx + charid ].next ) {
2222 if ( trie->trans[ stateidx + charid ].next ) {
2224 for ( ; zp < pos ; zp++ ) {
2225 if ( ! trie->trans[ zp ].next ) {
2229 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2230 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2231 trie->trans[ zp ].check = state;
2232 if ( ++zp > pos ) pos = zp;
2239 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2241 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2242 trie->trans[ pos ].check = state;
2247 trie->lasttrans = pos + 1;
2248 trie->states = (reg_trie_state *)
2249 PerlMemShared_realloc( trie->states, laststate
2250 * sizeof(reg_trie_state) );
2251 DEBUG_TRIE_COMPILE_MORE_r(
2252 PerlIO_printf( Perl_debug_log,
2253 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2254 (int)depth * 2 + 2,"",
2255 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2258 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2261 } /* end table compress */
2263 DEBUG_TRIE_COMPILE_MORE_r(
2264 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2265 (int)depth * 2 + 2, "",
2266 (UV)trie->statecount,
2267 (UV)trie->lasttrans)
2269 /* resize the trans array to remove unused space */
2270 trie->trans = (reg_trie_trans *)
2271 PerlMemShared_realloc( trie->trans, trie->lasttrans
2272 * sizeof(reg_trie_trans) );
2274 { /* Modify the program and insert the new TRIE node */
2275 U8 nodetype =(U8)(flags & 0xFF);
2279 regnode *optimize = NULL;
2280 #ifdef RE_TRACK_PATTERN_OFFSETS
2283 U32 mjd_nodelen = 0;
2284 #endif /* RE_TRACK_PATTERN_OFFSETS */
2285 #endif /* DEBUGGING */
2287 This means we convert either the first branch or the first Exact,
2288 depending on whether the thing following (in 'last') is a branch
2289 or not and whther first is the startbranch (ie is it a sub part of
2290 the alternation or is it the whole thing.)
2291 Assuming its a sub part we convert the EXACT otherwise we convert
2292 the whole branch sequence, including the first.
2294 /* Find the node we are going to overwrite */
2295 if ( first != startbranch || OP( last ) == BRANCH ) {
2296 /* branch sub-chain */
2297 NEXT_OFF( first ) = (U16)(last - first);
2298 #ifdef RE_TRACK_PATTERN_OFFSETS
2300 mjd_offset= Node_Offset((convert));
2301 mjd_nodelen= Node_Length((convert));
2304 /* whole branch chain */
2306 #ifdef RE_TRACK_PATTERN_OFFSETS
2309 const regnode *nop = NEXTOPER( convert );
2310 mjd_offset= Node_Offset((nop));
2311 mjd_nodelen= Node_Length((nop));
2315 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2316 (int)depth * 2 + 2, "",
2317 (UV)mjd_offset, (UV)mjd_nodelen)
2320 /* But first we check to see if there is a common prefix we can
2321 split out as an EXACT and put in front of the TRIE node. */
2322 trie->startstate= 1;
2323 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2325 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2329 const U32 base = trie->states[ state ].trans.base;
2331 if ( trie->states[state].wordnum )
2334 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2335 if ( ( base + ofs >= trie->uniquecharcount ) &&
2336 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2337 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2339 if ( ++count > 1 ) {
2340 SV **tmp = av_fetch( revcharmap, ofs, 0);
2341 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2342 if ( state == 1 ) break;
2344 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2346 PerlIO_printf(Perl_debug_log,
2347 "%*sNew Start State=%"UVuf" Class: [",
2348 (int)depth * 2 + 2, "",
2351 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2352 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2354 TRIE_BITMAP_SET(trie,*ch);
2356 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2358 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2362 TRIE_BITMAP_SET(trie,*ch);
2364 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2365 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2371 SV **tmp = av_fetch( revcharmap, idx, 0);
2373 char *ch = SvPV( *tmp, len );
2375 SV *sv=sv_newmortal();
2376 PerlIO_printf( Perl_debug_log,
2377 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2378 (int)depth * 2 + 2, "",
2380 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2381 PL_colors[0], PL_colors[1],
2382 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2383 PERL_PV_ESCAPE_FIRSTCHAR
2388 OP( convert ) = nodetype;
2389 str=STRING(convert);
2392 STR_LEN(convert) += len;
2398 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2403 trie->prefixlen = (state-1);
2405 regnode *n = convert+NODE_SZ_STR(convert);
2406 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2407 trie->startstate = state;
2408 trie->minlen -= (state - 1);
2409 trie->maxlen -= (state - 1);
2411 /* At least the UNICOS C compiler choked on this
2412 * being argument to DEBUG_r(), so let's just have
2415 #ifdef PERL_EXT_RE_BUILD
2421 regnode *fix = convert;
2422 U32 word = trie->wordcount;
2424 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2425 while( ++fix < n ) {
2426 Set_Node_Offset_Length(fix, 0, 0);
2429 SV ** const tmp = av_fetch( trie_words, word, 0 );
2431 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2432 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2434 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2442 NEXT_OFF(convert) = (U16)(tail - convert);
2443 DEBUG_r(optimize= n);
2449 if ( trie->maxlen ) {
2450 NEXT_OFF( convert ) = (U16)(tail - convert);
2451 ARG_SET( convert, data_slot );
2452 /* Store the offset to the first unabsorbed branch in
2453 jump[0], which is otherwise unused by the jump logic.
2454 We use this when dumping a trie and during optimisation. */
2456 trie->jump[0] = (U16)(nextbranch - convert);
2458 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2459 * and there is a bitmap
2460 * and the first "jump target" node we found leaves enough room
2461 * then convert the TRIE node into a TRIEC node, with the bitmap
2462 * embedded inline in the opcode - this is hypothetically faster.
2464 if ( !trie->states[trie->startstate].wordnum
2466 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2468 OP( convert ) = TRIEC;
2469 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2470 PerlMemShared_free(trie->bitmap);
2473 OP( convert ) = TRIE;
2475 /* store the type in the flags */
2476 convert->flags = nodetype;
2480 + regarglen[ OP( convert ) ];
2482 /* XXX We really should free up the resource in trie now,
2483 as we won't use them - (which resources?) dmq */
2485 /* needed for dumping*/
2486 DEBUG_r(if (optimize) {
2487 regnode *opt = convert;
2489 while ( ++opt < optimize) {
2490 Set_Node_Offset_Length(opt,0,0);
2493 Try to clean up some of the debris left after the
2496 while( optimize < jumper ) {
2497 mjd_nodelen += Node_Length((optimize));
2498 OP( optimize ) = OPTIMIZED;
2499 Set_Node_Offset_Length(optimize,0,0);
2502 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2504 } /* end node insert */
2506 /* Finish populating the prev field of the wordinfo array. Walk back
2507 * from each accept state until we find another accept state, and if
2508 * so, point the first word's .prev field at the second word. If the
2509 * second already has a .prev field set, stop now. This will be the
2510 * case either if we've already processed that word's accept state,
2511 * or that state had multiple words, and the overspill words were
2512 * already linked up earlier.
2519 for (word=1; word <= trie->wordcount; word++) {
2521 if (trie->wordinfo[word].prev)
2523 state = trie->wordinfo[word].accept;
2525 state = prev_states[state];
2528 prev = trie->states[state].wordnum;
2532 trie->wordinfo[word].prev = prev;
2534 Safefree(prev_states);
2538 /* and now dump out the compressed format */
2539 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2541 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2543 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2544 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2546 SvREFCNT_dec_NN(revcharmap);
2550 : trie->startstate>1
2556 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2558 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2560 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2561 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2564 We find the fail state for each state in the trie, this state is the longest proper
2565 suffix of the current state's 'word' that is also a proper prefix of another word in our
2566 trie. State 1 represents the word '' and is thus the default fail state. This allows
2567 the DFA not to have to restart after its tried and failed a word at a given point, it
2568 simply continues as though it had been matching the other word in the first place.
2570 'abcdgu'=~/abcdefg|cdgu/
2571 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2572 fail, which would bring us to the state representing 'd' in the second word where we would
2573 try 'g' and succeed, proceeding to match 'cdgu'.
2575 /* add a fail transition */
2576 const U32 trie_offset = ARG(source);
2577 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2579 const U32 ucharcount = trie->uniquecharcount;
2580 const U32 numstates = trie->statecount;
2581 const U32 ubound = trie->lasttrans + ucharcount;
2585 U32 base = trie->states[ 1 ].trans.base;
2588 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2589 GET_RE_DEBUG_FLAGS_DECL;
2591 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2593 PERL_UNUSED_ARG(depth);
2597 ARG_SET( stclass, data_slot );
2598 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2599 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2600 aho->trie=trie_offset;
2601 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2602 Copy( trie->states, aho->states, numstates, reg_trie_state );
2603 Newxz( q, numstates, U32);
2604 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2607 /* initialize fail[0..1] to be 1 so that we always have
2608 a valid final fail state */
2609 fail[ 0 ] = fail[ 1 ] = 1;
2611 for ( charid = 0; charid < ucharcount ; charid++ ) {
2612 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2614 q[ q_write ] = newstate;
2615 /* set to point at the root */
2616 fail[ q[ q_write++ ] ]=1;
2619 while ( q_read < q_write) {
2620 const U32 cur = q[ q_read++ % numstates ];
2621 base = trie->states[ cur ].trans.base;
2623 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2624 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2626 U32 fail_state = cur;
2629 fail_state = fail[ fail_state ];
2630 fail_base = aho->states[ fail_state ].trans.base;
2631 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2633 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2634 fail[ ch_state ] = fail_state;
2635 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2637 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2639 q[ q_write++ % numstates] = ch_state;
2643 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2644 when we fail in state 1, this allows us to use the
2645 charclass scan to find a valid start char. This is based on the principle
2646 that theres a good chance the string being searched contains lots of stuff
2647 that cant be a start char.
2649 fail[ 0 ] = fail[ 1 ] = 0;
2650 DEBUG_TRIE_COMPILE_r({
2651 PerlIO_printf(Perl_debug_log,
2652 "%*sStclass Failtable (%"UVuf" states): 0",
2653 (int)(depth * 2), "", (UV)numstates
2655 for( q_read=1; q_read<numstates; q_read++ ) {
2656 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2658 PerlIO_printf(Perl_debug_log, "\n");
2661 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2666 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2667 * These need to be revisited when a newer toolchain becomes available.
2669 #if defined(__sparc64__) && defined(__GNUC__)
2670 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2671 # undef SPARC64_GCC_WORKAROUND
2672 # define SPARC64_GCC_WORKAROUND 1
2676 #define DEBUG_PEEP(str,scan,depth) \
2677 DEBUG_OPTIMISE_r({if (scan){ \
2678 SV * const mysv=sv_newmortal(); \
2679 regnode *Next = regnext(scan); \
2680 regprop(RExC_rx, mysv, scan); \
2681 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2682 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2683 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2687 /* The below joins as many adjacent EXACTish nodes as possible into a single
2688 * one. The regop may be changed if the node(s) contain certain sequences that
2689 * require special handling. The joining is only done if:
2690 * 1) there is room in the current conglomerated node to entirely contain the
2692 * 2) they are the exact same node type
2694 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2695 * these get optimized out
2697 * If a node is to match under /i (folded), the number of characters it matches
2698 * can be different than its character length if it contains a multi-character
2699 * fold. *min_subtract is set to the total delta of the input nodes.
2701 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2702 * and contains LATIN SMALL LETTER SHARP S
2704 * This is as good a place as any to discuss the design of handling these
2705 * multi-character fold sequences. It's been wrong in Perl for a very long
2706 * time. There are three code points in Unicode whose multi-character folds
2707 * were long ago discovered to mess things up. The previous designs for
2708 * dealing with these involved assigning a special node for them. This
2709 * approach doesn't work, as evidenced by this example:
2710 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2711 * Both these fold to "sss", but if the pattern is parsed to create a node that
2712 * would match just the \xDF, it won't be able to handle the case where a
2713 * successful match would have to cross the node's boundary. The new approach
2714 * that hopefully generally solves the problem generates an EXACTFU_SS node
2717 * It turns out that there are problems with all multi-character folds, and not
2718 * just these three. Now the code is general, for all such cases. The
2719 * approach taken is:
2720 * 1) This routine examines each EXACTFish node that could contain multi-
2721 * character fold sequences. It returns in *min_subtract how much to
2722 * subtract from the the actual length of the string to get a real minimum
2723 * match length; it is 0 if there are no multi-char folds. This delta is
2724 * used by the caller to adjust the min length of the match, and the delta
2725 * between min and max, so that the optimizer doesn't reject these
2726 * possibilities based on size constraints.
2727 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2728 * is used for an EXACTFU node that contains at least one "ss" sequence in
2729 * it. For non-UTF-8 patterns and strings, this is the only case where
2730 * there is a possible fold length change. That means that a regular
2731 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2732 * with length changes, and so can be processed faster. regexec.c takes
2733 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2734 * pre-folded by regcomp.c. This saves effort in regex matching.
2735 * However, the pre-folding isn't done for non-UTF8 patterns because the
2736 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2737 * down by forcing the pattern into UTF8 unless necessary. Also what
2738 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2739 * possibilities for the non-UTF8 patterns are quite simple, except for
2740 * the sharp s. All the ones that don't involve a UTF-8 target string are
2741 * members of a fold-pair, and arrays are set up for all of them so that
2742 * the other member of the pair can be found quickly. Code elsewhere in
2743 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2744 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2745 * described in the next item.
2746 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2747 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2748 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2749 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2750 * character in the pattern corresponds to at most a single character in
2751 * the target string. (And I do mean character, and not byte here, unlike
2752 * other parts of the documentation that have never been updated to
2753 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2754 * two character string 'ss'; in EXACTFA nodes it can match
2755 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2756 * instances where it is violated. I'm reluctant to try to change the
2757 * assumption, as the code involved is impenetrable to me (khw), so
2758 * instead the code here punts. This routine examines (when the pattern
2759 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2760 * boolean indicating whether or not the node contains a sharp s. When it
2761 * is true, the caller sets a flag that later causes the optimizer in this
2762 * file to not set values for the floating and fixed string lengths, and
2763 * thus avoids the optimizer code in regexec.c that makes the invalid
2764 * assumption. Thus, there is no optimization based on string lengths for
2765 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2766 * (The reason the assumption is wrong only in these two cases is that all
2767 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2768 * other folds to their expanded versions. We can't prefold sharp s to
2769 * 'ss' in EXACTF nodes because we don't know at compile time if it
2770 * actually matches 'ss' or not. It will match iff the target string is
2771 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2772 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2773 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2774 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2775 * require the pattern to be forced into UTF-8, the overhead of which we
2778 * Similarly, the code that generates tries doesn't currently handle
2779 * not-already-folded multi-char folds, and it looks like a pain to change
2780 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2781 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2782 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2783 * using /iaa matching will be doing so almost entirely with ASCII
2784 * strings, so this should rarely be encountered in practice */
2786 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2787 if (PL_regkind[OP(scan)] == EXACT) \
2788 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2791 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2792 /* Merge several consecutive EXACTish nodes into one. */
2793 regnode *n = regnext(scan);
2795 regnode *next = scan + NODE_SZ_STR(scan);
2799 regnode *stop = scan;
2800 GET_RE_DEBUG_FLAGS_DECL;
2802 PERL_UNUSED_ARG(depth);
2805 PERL_ARGS_ASSERT_JOIN_EXACT;
2806 #ifndef EXPERIMENTAL_INPLACESCAN
2807 PERL_UNUSED_ARG(flags);
2808 PERL_UNUSED_ARG(val);
2810 DEBUG_PEEP("join",scan,depth);
2812 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2813 * EXACT ones that are mergeable to the current one. */
2815 && (PL_regkind[OP(n)] == NOTHING
2816 || (stringok && OP(n) == OP(scan)))
2818 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2821 if (OP(n) == TAIL || n > next)
2823 if (PL_regkind[OP(n)] == NOTHING) {
2824 DEBUG_PEEP("skip:",n,depth);
2825 NEXT_OFF(scan) += NEXT_OFF(n);
2826 next = n + NODE_STEP_REGNODE;
2833 else if (stringok) {
2834 const unsigned int oldl = STR_LEN(scan);
2835 regnode * const nnext = regnext(n);
2837 /* XXX I (khw) kind of doubt that this works on platforms where
2838 * U8_MAX is above 255 because of lots of other assumptions */
2839 /* Don't join if the sum can't fit into a single node */
2840 if (oldl + STR_LEN(n) > U8_MAX)
2843 DEBUG_PEEP("merg",n,depth);
2846 NEXT_OFF(scan) += NEXT_OFF(n);
2847 STR_LEN(scan) += STR_LEN(n);
2848 next = n + NODE_SZ_STR(n);
2849 /* Now we can overwrite *n : */
2850 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2858 #ifdef EXPERIMENTAL_INPLACESCAN
2859 if (flags && !NEXT_OFF(n)) {
2860 DEBUG_PEEP("atch", val, depth);
2861 if (reg_off_by_arg[OP(n)]) {
2862 ARG_SET(n, val - n);
2865 NEXT_OFF(n) = val - n;
2873 *has_exactf_sharp_s = FALSE;
2875 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2876 * can now analyze for sequences of problematic code points. (Prior to
2877 * this final joining, sequences could have been split over boundaries, and
2878 * hence missed). The sequences only happen in folding, hence for any
2879 * non-EXACT EXACTish node */
2880 if (OP(scan) != EXACT) {
2881 const U8 * const s0 = (U8*) STRING(scan);
2883 const U8 * const s_end = s0 + STR_LEN(scan);
2885 /* One pass is made over the node's string looking for all the
2886 * possibilities. to avoid some tests in the loop, there are two main
2887 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2891 /* Examine the string for a multi-character fold sequence. UTF-8
2892 * patterns have all characters pre-folded by the time this code is
2894 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2895 length sequence we are looking for is 2 */
2898 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2899 if (! len) { /* Not a multi-char fold: get next char */
2904 /* Nodes with 'ss' require special handling, except for EXACTFL
2905 * and EXACTFA-ish for which there is no multi-char fold to
2907 if (len == 2 && *s == 's' && *(s+1) == 's'
2908 && OP(scan) != EXACTFL
2909 && OP(scan) != EXACTFA
2910 && OP(scan) != EXACTFA_NO_TRIE)
2913 OP(scan) = EXACTFU_SS;
2916 else { /* Here is a generic multi-char fold. */
2917 const U8* multi_end = s + len;
2919 /* Count how many characters in it. In the case of /l and
2920 * /aa, no folds which contain ASCII code points are
2921 * allowed, so check for those, and skip if found. (In
2922 * EXACTFL, no folds are allowed to any Latin1 code point,
2923 * not just ASCII. But there aren't any of these
2924 * currently, nor ever likely, so don't take the time to
2925 * test for them. The code that generates the
2926 * is_MULTI_foo() macros croaks should one actually get put
2927 * into Unicode .) */
2928 if (OP(scan) != EXACTFL
2929 && OP(scan) != EXACTFA
2930 && OP(scan) != EXACTFA_NO_TRIE)
2932 count = utf8_length(s, multi_end);
2936 while (s < multi_end) {
2939 goto next_iteration;
2949 /* The delta is how long the sequence is minus 1 (1 is how long
2950 * the character that folds to the sequence is) */
2951 *min_subtract += count - 1;
2955 else if (OP(scan) == EXACTFA) {
2957 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2958 * fold to the ASCII range (and there are no existing ones in the
2959 * upper latin1 range). But, as outlined in the comments preceding
2960 * this function, we need to flag any occurrences of the sharp s.
2961 * This character forbids trie formation (because of added
2964 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2965 OP(scan) = EXACTFA_NO_TRIE;
2966 *has_exactf_sharp_s = TRUE;
2973 else if (OP(scan) != EXACTFL) {
2975 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2976 * multi-char folds that are all Latin1. (This code knows that
2977 * there are no current multi-char folds possible with EXACTFL,
2978 * relying on fold_grind.t to catch any errors if the very unlikely
2979 * event happens that some get added in future Unicode versions.)
2980 * As explained in the comments preceding this function, we look
2981 * also for the sharp s in EXACTF nodes; it can be in the final
2982 * position. Otherwise we can stop looking 1 byte earlier because
2983 * have to find at least two characters for a multi-fold */
2984 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2987 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2988 if (! len) { /* Not a multi-char fold. */
2989 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2991 *has_exactf_sharp_s = TRUE;
2998 && isARG2_lower_or_UPPER_ARG1('s', *s)
2999 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3002 /* EXACTF nodes need to know that the minimum length
3003 * changed so that a sharp s in the string can match this
3004 * ss in the pattern, but they remain EXACTF nodes, as they
3005 * won't match this unless the target string is is UTF-8,
3006 * which we don't know until runtime */
3007 if (OP(scan) != EXACTF) {
3008 OP(scan) = EXACTFU_SS;
3012 *min_subtract += len - 1;
3019 /* Allow dumping but overwriting the collection of skipped
3020 * ops and/or strings with fake optimized ops */
3021 n = scan + NODE_SZ_STR(scan);
3029 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3033 /* REx optimizer. Converts nodes into quicker variants "in place".
3034 Finds fixed substrings. */
3036 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3037 to the position after last scanned or to NULL. */
3039 #define INIT_AND_WITHP \
3040 assert(!and_withp); \
3041 Newx(and_withp,1,struct regnode_charclass_class); \
3042 SAVEFREEPV(and_withp)
3044 /* this is a chain of data about sub patterns we are processing that
3045 need to be handled separately/specially in study_chunk. Its so
3046 we can simulate recursion without losing state. */
3048 typedef struct scan_frame {
3049 regnode *last; /* last node to process in this frame */
3050 regnode *next; /* next node to process when last is reached */
3051 struct scan_frame *prev; /*previous frame*/
3052 I32 stop; /* what stopparen do we use */
3056 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3059 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3060 SSize_t *minlenp, SSize_t *deltap,
3065 struct regnode_charclass_class *and_withp,
3066 U32 flags, U32 depth)
3067 /* scanp: Start here (read-write). */
3068 /* deltap: Write maxlen-minlen here. */
3069 /* last: Stop before this one. */
3070 /* data: string data about the pattern */
3071 /* stopparen: treat close N as END */
3072 /* recursed: which subroutines have we recursed into */
3073 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3076 /* There must be at least this number of characters to match */
3079 regnode *scan = *scanp, *next;
3081 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3082 int is_inf_internal = 0; /* The studied chunk is infinite */
3083 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3084 scan_data_t data_fake;
3085 SV *re_trie_maxbuff = NULL;
3086 regnode *first_non_open = scan;
3087 SSize_t stopmin = SSize_t_MAX;
3088 scan_frame *frame = NULL;
3089 GET_RE_DEBUG_FLAGS_DECL;
3091 PERL_ARGS_ASSERT_STUDY_CHUNK;
3094 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3098 while (first_non_open && OP(first_non_open) == OPEN)
3099 first_non_open=regnext(first_non_open);
3104 while ( scan && OP(scan) != END && scan < last ){
3105 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3106 node length to get a real minimum (because
3107 the folded version may be shorter) */
3108 bool has_exactf_sharp_s = FALSE;
3109 /* Peephole optimizer: */
3110 DEBUG_STUDYDATA("Peep:", data,depth);
3111 DEBUG_PEEP("Peep",scan,depth);
3113 /* Its not clear to khw or hv why this is done here, and not in the
3114 * clauses that deal with EXACT nodes. khw's guess is that it's
3115 * because of a previous design */
3116 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3118 /* Follow the next-chain of the current node and optimize
3119 away all the NOTHINGs from it. */
3120 if (OP(scan) != CURLYX) {
3121 const int max = (reg_off_by_arg[OP(scan)]
3123 /* I32 may be smaller than U16 on CRAYs! */
3124 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3125 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3129 /* Skip NOTHING and LONGJMP. */
3130 while ((n = regnext(n))
3131 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3132 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3133 && off + noff < max)
3135 if (reg_off_by_arg[OP(scan)])
3138 NEXT_OFF(scan) = off;
3143 /* The principal pseudo-switch. Cannot be a switch, since we
3144 look into several different things. */
3145 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3146 || OP(scan) == IFTHEN) {
3147 next = regnext(scan);
3149 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3151 if (OP(next) == code || code == IFTHEN) {
3152 /* NOTE - There is similar code to this block below for handling
3153 TRIE nodes on a re-study. If you change stuff here check there
3155 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3156 struct regnode_charclass_class accum;
3157 regnode * const startbranch=scan;
3159 if (flags & SCF_DO_SUBSTR)
3160 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3161 if (flags & SCF_DO_STCLASS)
3162 cl_init_zero(pRExC_state, &accum);
3164 while (OP(scan) == code) {
3165 SSize_t deltanext, minnext, fake;
3167 struct regnode_charclass_class this_class;
3170 data_fake.flags = 0;
3172 data_fake.whilem_c = data->whilem_c;
3173 data_fake.last_closep = data->last_closep;
3176 data_fake.last_closep = &fake;
3178 data_fake.pos_delta = delta;
3179 next = regnext(scan);
3180 scan = NEXTOPER(scan);
3182 scan = NEXTOPER(scan);
3183 if (flags & SCF_DO_STCLASS) {
3184 cl_init(pRExC_state, &this_class);
3185 data_fake.start_class = &this_class;
3186 f = SCF_DO_STCLASS_AND;
3188 if (flags & SCF_WHILEM_VISITED_POS)
3189 f |= SCF_WHILEM_VISITED_POS;
3191 /* we suppose the run is continuous, last=next...*/
3192 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3194 stopparen, recursed, NULL, f,depth+1);
3197 if (deltanext == SSize_t_MAX) {
3198 is_inf = is_inf_internal = 1;
3200 } else if (max1 < minnext + deltanext)
3201 max1 = minnext + deltanext;
3203 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3205 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3206 if ( stopmin > minnext)
3207 stopmin = min + min1;
3208 flags &= ~SCF_DO_SUBSTR;
3210 data->flags |= SCF_SEEN_ACCEPT;
3213 if (data_fake.flags & SF_HAS_EVAL)
3214 data->flags |= SF_HAS_EVAL;
3215 data->whilem_c = data_fake.whilem_c;
3217 if (flags & SCF_DO_STCLASS)
3218 cl_or(pRExC_state, &accum, &this_class);
3220 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3222 if (flags & SCF_DO_SUBSTR) {
3223 data->pos_min += min1;
3224 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3225 data->pos_delta = SSize_t_MAX;
3227 data->pos_delta += max1 - min1;
3228 if (max1 != min1 || is_inf)
3229 data->longest = &(data->longest_float);
3232 if (delta == SSize_t_MAX
3233 || SSize_t_MAX - delta - (max1 - min1) < 0)
3234 delta = SSize_t_MAX;
3236 delta += max1 - min1;
3237 if (flags & SCF_DO_STCLASS_OR) {
3238 cl_or(pRExC_state, data->start_class, &accum);
3240 cl_and(data->start_class, and_withp);
3241 flags &= ~SCF_DO_STCLASS;
3244 else if (flags & SCF_DO_STCLASS_AND) {
3246 cl_and(data->start_class, &accum);
3247 flags &= ~SCF_DO_STCLASS;
3250 /* Switch to OR mode: cache the old value of
3251 * data->start_class */
3253 StructCopy(data->start_class, and_withp,
3254 struct regnode_charclass_class);
3255 flags &= ~SCF_DO_STCLASS_AND;
3256 StructCopy(&accum, data->start_class,
3257 struct regnode_charclass_class);
3258 flags |= SCF_DO_STCLASS_OR;
3259 SET_SSC_EOS(data->start_class);
3263 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3266 Assuming this was/is a branch we are dealing with: 'scan' now
3267 points at the item that follows the branch sequence, whatever
3268 it is. We now start at the beginning of the sequence and look
3275 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3277 If we can find such a subsequence we need to turn the first
3278 element into a trie and then add the subsequent branch exact
3279 strings to the trie.
3283 1. patterns where the whole set of branches can be converted.
3285 2. patterns where only a subset can be converted.
3287 In case 1 we can replace the whole set with a single regop
3288 for the trie. In case 2 we need to keep the start and end
3291 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3292 becomes BRANCH TRIE; BRANCH X;
3294 There is an additional case, that being where there is a
3295 common prefix, which gets split out into an EXACT like node
3296 preceding the TRIE node.
3298 If x(1..n)==tail then we can do a simple trie, if not we make
3299 a "jump" trie, such that when we match the appropriate word
3300 we "jump" to the appropriate tail node. Essentially we turn
3301 a nested if into a case structure of sorts.
3306 if (!re_trie_maxbuff) {
3307 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3308 if (!SvIOK(re_trie_maxbuff))
3309 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3311 if ( SvIV(re_trie_maxbuff)>=0 ) {
3313 regnode *first = (regnode *)NULL;
3314 regnode *last = (regnode *)NULL;
3315 regnode *tail = scan;
3320 SV * const mysv = sv_newmortal(); /* for dumping */
3322 /* var tail is used because there may be a TAIL
3323 regop in the way. Ie, the exacts will point to the
3324 thing following the TAIL, but the last branch will
3325 point at the TAIL. So we advance tail. If we
3326 have nested (?:) we may have to move through several
3330 while ( OP( tail ) == TAIL ) {
3331 /* this is the TAIL generated by (?:) */
3332 tail = regnext( tail );
3336 DEBUG_TRIE_COMPILE_r({
3337 regprop(RExC_rx, mysv, tail );
3338 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3339 (int)depth * 2 + 2, "",
3340 "Looking for TRIE'able sequences. Tail node is: ",
3341 SvPV_nolen_const( mysv )
3347 Step through the branches
3348 cur represents each branch,
3349 noper is the first thing to be matched as part of that branch
3350 noper_next is the regnext() of that node.
3352 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3353 via a "jump trie" but we also support building with NOJUMPTRIE,
3354 which restricts the trie logic to structures like /FOO|BAR/.
3356 If noper is a trieable nodetype then the branch is a possible optimization
3357 target. If we are building under NOJUMPTRIE then we require that noper_next
3358 is the same as scan (our current position in the regex program).
3360 Once we have two or more consecutive such branches we can create a
3361 trie of the EXACT's contents and stitch it in place into the program.
3363 If the sequence represents all of the branches in the alternation we
3364 replace the entire thing with a single TRIE node.
3366 Otherwise when it is a subsequence we need to stitch it in place and
3367 replace only the relevant branches. This means the first branch has
3368 to remain as it is used by the alternation logic, and its next pointer,
3369 and needs to be repointed at the item on the branch chain following
3370 the last branch we have optimized away.
3372 This could be either a BRANCH, in which case the subsequence is internal,
3373 or it could be the item following the branch sequence in which case the
3374 subsequence is at the end (which does not necessarily mean the first node
3375 is the start of the alternation).
3377 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3380 ----------------+-----------
3384 EXACTFU_SS | EXACTFU
3389 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3390 ( EXACT == (X) ) ? EXACT : \
3391 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3392 ( EXACTFA == (X) ) ? EXACTFA : \
3395 /* dont use tail as the end marker for this traverse */
3396 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3397 regnode * const noper = NEXTOPER( cur );
3398 U8 noper_type = OP( noper );
3399 U8 noper_trietype = TRIE_TYPE( noper_type );
3400 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3401 regnode * const noper_next = regnext( noper );
3402 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3403 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3406 DEBUG_TRIE_COMPILE_r({
3407 regprop(RExC_rx, mysv, cur);
3408 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3409 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3411 regprop(RExC_rx, mysv, noper);
3412 PerlIO_printf( Perl_debug_log, " -> %s",
3413 SvPV_nolen_const(mysv));
3416 regprop(RExC_rx, mysv, noper_next );
3417 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3418 SvPV_nolen_const(mysv));
3420 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3421 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3422 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3426 /* Is noper a trieable nodetype that can be merged with the
3427 * current trie (if there is one)? */
3431 ( noper_trietype == NOTHING)
3432 || ( trietype == NOTHING )
3433 || ( trietype == noper_trietype )
3436 && noper_next == tail
3440 /* Handle mergable triable node
3441 * Either we are the first node in a new trieable sequence,
3442 * in which case we do some bookkeeping, otherwise we update
3443 * the end pointer. */
3446 if ( noper_trietype == NOTHING ) {
3447 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3448 regnode * const noper_next = regnext( noper );
3449 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3450 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3453 if ( noper_next_trietype ) {
3454 trietype = noper_next_trietype;
3455 } else if (noper_next_type) {
3456 /* a NOTHING regop is 1 regop wide. We need at least two
3457 * for a trie so we can't merge this in */
3461 trietype = noper_trietype;
3464 if ( trietype == NOTHING )
3465 trietype = noper_trietype;
3470 } /* end handle mergable triable node */
3472 /* handle unmergable node -
3473 * noper may either be a triable node which can not be tried
3474 * together with the current trie, or a non triable node */
3476 /* If last is set and trietype is not NOTHING then we have found
3477 * at least two triable branch sequences in a row of a similar
3478 * trietype so we can turn them into a trie. If/when we
3479 * allow NOTHING to start a trie sequence this condition will be
3480 * required, and it isn't expensive so we leave it in for now. */
3481 if ( trietype && trietype != NOTHING )
3482 make_trie( pRExC_state,
3483 startbranch, first, cur, tail, count,
3484 trietype, depth+1 );
3485 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3489 && noper_next == tail
3492 /* noper is triable, so we can start a new trie sequence */
3495 trietype = noper_trietype;
3497 /* if we already saw a first but the current node is not triable then we have
3498 * to reset the first information. */
3503 } /* end handle unmergable node */
3504 } /* loop over branches */
3505 DEBUG_TRIE_COMPILE_r({
3506 regprop(RExC_rx, mysv, cur);
3507 PerlIO_printf( Perl_debug_log,
3508 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3509 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3512 if ( last && trietype ) {
3513 if ( trietype != NOTHING ) {
3514 /* the last branch of the sequence was part of a trie,
3515 * so we have to construct it here outside of the loop
3517 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3518 #ifdef TRIE_STUDY_OPT
3519 if ( ((made == MADE_EXACT_TRIE &&
3520 startbranch == first)
3521 || ( first_non_open == first )) &&
3523 flags |= SCF_TRIE_RESTUDY;
3524 if ( startbranch == first
3527 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3532 /* at this point we know whatever we have is a NOTHING sequence/branch
3533 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3535 if ( startbranch == first ) {
3537 /* the entire thing is a NOTHING sequence, something like this:
3538 * (?:|) So we can turn it into a plain NOTHING op. */
3539 DEBUG_TRIE_COMPILE_r({
3540 regprop(RExC_rx, mysv, cur);
3541 PerlIO_printf( Perl_debug_log,
3542 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3543 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3546 OP(startbranch)= NOTHING;
3547 NEXT_OFF(startbranch)= tail - startbranch;
3548 for ( opt= startbranch + 1; opt < tail ; opt++ )
3552 } /* end if ( last) */
3553 } /* TRIE_MAXBUF is non zero */
3558 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3559 scan = NEXTOPER(NEXTOPER(scan));
3560 } else /* single branch is optimized. */
3561 scan = NEXTOPER(scan);
3563 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3564 scan_frame *newframe = NULL;
3569 if (OP(scan) != SUSPEND) {
3570 /* set the pointer */
3571 if (OP(scan) == GOSUB) {
3573 RExC_recurse[ARG2L(scan)] = scan;
3574 start = RExC_open_parens[paren-1];
3575 end = RExC_close_parens[paren-1];
3578 start = RExC_rxi->program + 1;
3582 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3583 SAVEFREEPV(recursed);
3585 if (!PAREN_TEST(recursed,paren+1)) {
3586 PAREN_SET(recursed,paren+1);
3587 Newx(newframe,1,scan_frame);
3589 if (flags & SCF_DO_SUBSTR) {
3590 SCAN_COMMIT(pRExC_state,data,minlenp);
3591 data->longest = &(data->longest_float);
3593 is_inf = is_inf_internal = 1;
3594 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3595 cl_anything(pRExC_state, data->start_class);
3596 flags &= ~SCF_DO_STCLASS;
3599 Newx(newframe,1,scan_frame);
3602 end = regnext(scan);
3607 SAVEFREEPV(newframe);
3608 newframe->next = regnext(scan);
3609 newframe->last = last;
3610 newframe->stop = stopparen;
3611 newframe->prev = frame;
3621 else if (OP(scan) == EXACT) {
3622 SSize_t l = STR_LEN(scan);
3625 const U8 * const s = (U8*)STRING(scan);
3626 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3627 l = utf8_length(s, s + l);
3629 uc = *((U8*)STRING(scan));
3632 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3633 /* The code below prefers earlier match for fixed
3634 offset, later match for variable offset. */
3635 if (data->last_end == -1) { /* Update the start info. */
3636 data->last_start_min = data->pos_min;
3637 data->last_start_max = is_inf
3638 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3640 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3642 SvUTF8_on(data->last_found);
3644 SV * const sv = data->last_found;
3645 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3646 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3647 if (mg && mg->mg_len >= 0)
3648 mg->mg_len += utf8_length((U8*)STRING(scan),
3649 (U8*)STRING(scan)+STR_LEN(scan));
3651 data->last_end = data->pos_min + l;
3652 data->pos_min += l; /* As in the first entry. */
3653 data->flags &= ~SF_BEFORE_EOL;
3655 if (flags & SCF_DO_STCLASS_AND) {
3656 /* Check whether it is compatible with what we know already! */
3660 /* If compatible, we or it in below. It is compatible if is
3661 * in the bitmp and either 1) its bit or its fold is set, or 2)
3662 * it's for a locale. Even if there isn't unicode semantics
3663 * here, at runtime there may be because of matching against a
3664 * utf8 string, so accept a possible false positive for
3665 * latin1-range folds */
3667 (!(data->start_class->flags & ANYOF_LOCALE)
3668 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3669 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3670 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3675 ANYOF_CLASS_ZERO(data->start_class);
3676 ANYOF_BITMAP_ZERO(data->start_class);
3678 ANYOF_BITMAP_SET(data->start_class, uc);
3679 else if (uc >= 0x100) {
3682 /* Some Unicode code points fold to the Latin1 range; as
3683 * XXX temporary code, instead of figuring out if this is
3684 * one, just assume it is and set all the start class bits
3685 * that could be some such above 255 code point's fold
3686 * which will generate fals positives. As the code
3687 * elsewhere that does compute the fold settles down, it
3688 * can be extracted out and re-used here */
3689 for (i = 0; i < 256; i++){
3690 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3691 ANYOF_BITMAP_SET(data->start_class, i);
3695 CLEAR_SSC_EOS(data->start_class);
3697 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3699 else if (flags & SCF_DO_STCLASS_OR) {
3700 /* false positive possible if the class is case-folded */
3702 ANYOF_BITMAP_SET(data->start_class, uc);
3704 data->start_class->flags |= ANYOF_UNICODE_ALL;
3705 CLEAR_SSC_EOS(data->start_class);
3706 cl_and(data->start_class, and_withp);
3708 flags &= ~SCF_DO_STCLASS;
3710 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3711 SSize_t l = STR_LEN(scan);
3712 UV uc = *((U8*)STRING(scan));
3714 /* Search for fixed substrings supports EXACT only. */
3715 if (flags & SCF_DO_SUBSTR) {
3717 SCAN_COMMIT(pRExC_state, data, minlenp);
3720 const U8 * const s = (U8 *)STRING(scan);
3721 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3722 l = utf8_length(s, s + l);
3724 if (has_exactf_sharp_s) {
3725 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3727 min += l - min_subtract;
3729 delta += min_subtract;
3730 if (flags & SCF_DO_SUBSTR) {
3731 data->pos_min += l - min_subtract;
3732 if (data->pos_min < 0) {
3735 data->pos_delta += min_subtract;
3737 data->longest = &(data->longest_float);
3740 if (flags & SCF_DO_STCLASS_AND) {
3741 /* Check whether it is compatible with what we know already! */
3744 (!(data->start_class->flags & ANYOF_LOCALE)
3745 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3746 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3750 ANYOF_CLASS_ZERO(data->start_class);
3751 ANYOF_BITMAP_ZERO(data->start_class);
3753 ANYOF_BITMAP_SET(data->start_class, uc);
3754 CLEAR_SSC_EOS(data->start_class);
3755 if (OP(scan) == EXACTFL) {
3756 /* XXX This set is probably no longer necessary, and
3757 * probably wrong as LOCALE now is on in the initial
3759 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3763 /* Also set the other member of the fold pair. In case
3764 * that unicode semantics is called for at runtime, use
3765 * the full latin1 fold. (Can't do this for locale,
3766 * because not known until runtime) */
3767 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3769 /* All other (EXACTFL handled above) folds except under
3770 * /iaa that include s, S, and sharp_s also may include
3772 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3774 if (uc == 's' || uc == 'S') {
3775 ANYOF_BITMAP_SET(data->start_class,
3776 LATIN_SMALL_LETTER_SHARP_S);
3778 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3779 ANYOF_BITMAP_SET(data->start_class, 's');
3780 ANYOF_BITMAP_SET(data->start_class, 'S');
3785 else if (uc >= 0x100) {
3787 for (i = 0; i < 256; i++){
3788 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3789 ANYOF_BITMAP_SET(data->start_class, i);
3794 else if (flags & SCF_DO_STCLASS_OR) {
3795 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3796 /* false positive possible if the class is case-folded.
3797 Assume that the locale settings are the same... */
3799 ANYOF_BITMAP_SET(data->start_class, uc);
3800 if (OP(scan) != EXACTFL) {
3802 /* And set the other member of the fold pair, but
3803 * can't do that in locale because not known until
3805 ANYOF_BITMAP_SET(data->start_class,
3806 PL_fold_latin1[uc]);
3808 /* All folds except under /iaa that include s, S,
3809 * and sharp_s also may include the others */
3810 if (OP(scan) != EXACTFA
3811 && OP(scan) != EXACTFA_NO_TRIE)
3813 if (uc == 's' || uc == 'S') {
3814 ANYOF_BITMAP_SET(data->start_class,
3815 LATIN_SMALL_LETTER_SHARP_S);
3817 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3818 ANYOF_BITMAP_SET(data->start_class, 's');
3819 ANYOF_BITMAP_SET(data->start_class, 'S');
3824 CLEAR_SSC_EOS(data->start_class);
3826 cl_and(data->start_class, and_withp);
3828 flags &= ~SCF_DO_STCLASS;
3830 else if (REGNODE_VARIES(OP(scan))) {
3831 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3832 I32 fl = 0, f = flags;
3833 regnode * const oscan = scan;
3834 struct regnode_charclass_class this_class;
3835 struct regnode_charclass_class *oclass = NULL;
3836 I32 next_is_eval = 0;
3838 switch (PL_regkind[OP(scan)]) {
3839 case WHILEM: /* End of (?:...)* . */
3840 scan = NEXTOPER(scan);
3843 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3844 next = NEXTOPER(scan);
3845 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3847 maxcount = REG_INFTY;
3848 next = regnext(scan);
3849 scan = NEXTOPER(scan);
3853 if (flags & SCF_DO_SUBSTR)
3858 if (flags & SCF_DO_STCLASS) {
3860 maxcount = REG_INFTY;
3861 next = regnext(scan);
3862 scan = NEXTOPER(scan);
3865 is_inf = is_inf_internal = 1;
3866 scan = regnext(scan);
3867 if (flags & SCF_DO_SUBSTR) {
3868 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3869 data->longest = &(data->longest_float);
3871 goto optimize_curly_tail;
3873 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3874 && (scan->flags == stopparen))
3879 mincount = ARG1(scan);
3880 maxcount = ARG2(scan);
3882 next = regnext(scan);
3883 if (OP(scan) == CURLYX) {
3884 I32 lp = (data ? *(data->last_closep) : 0);
3885 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3887 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3888 next_is_eval = (OP(scan) == EVAL);
3890 if (flags & SCF_DO_SUBSTR) {
3891 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3892 pos_before = data->pos_min;
3896 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3898 data->flags |= SF_IS_INF;
3900 if (flags & SCF_DO_STCLASS) {
3901 cl_init(pRExC_state, &this_class);
3902 oclass = data->start_class;
3903 data->start_class = &this_class;
3904 f |= SCF_DO_STCLASS_AND;
3905 f &= ~SCF_DO_STCLASS_OR;
3907 /* Exclude from super-linear cache processing any {n,m}
3908 regops for which the combination of input pos and regex
3909 pos is not enough information to determine if a match
3912 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3913 regex pos at the \s*, the prospects for a match depend not
3914 only on the input position but also on how many (bar\s*)
3915 repeats into the {4,8} we are. */
3916 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3917 f &= ~SCF_WHILEM_VISITED_POS;
3919 /* This will finish on WHILEM, setting scan, or on NULL: */
3920 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3921 last, data, stopparen, recursed, NULL,
3923 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3925 if (flags & SCF_DO_STCLASS)
3926 data->start_class = oclass;
3927 if (mincount == 0 || minnext == 0) {
3928 if (flags & SCF_DO_STCLASS_OR) {
3929 cl_or(pRExC_state, data->start_class, &this_class);
3931 else if (flags & SCF_DO_STCLASS_AND) {
3932 /* Switch to OR mode: cache the old value of
3933 * data->start_class */
3935 StructCopy(data->start_class, and_withp,
3936 struct regnode_charclass_class);
3937 flags &= ~SCF_DO_STCLASS_AND;
3938 StructCopy(&this_class, data->start_class,
3939 struct regnode_charclass_class);
3940 flags |= SCF_DO_STCLASS_OR;
3941 SET_SSC_EOS(data->start_class);
3943 } else { /* Non-zero len */
3944 if (flags & SCF_DO_STCLASS_OR) {
3945 cl_or(pRExC_state, data->start_class, &this_class);
3946 cl_and(data->start_class, and_withp);
3948 else if (flags & SCF_DO_STCLASS_AND)
3949 cl_and(data->start_class, &this_class);
3950 flags &= ~SCF_DO_STCLASS;
3952 if (!scan) /* It was not CURLYX, but CURLY. */
3954 if (!(flags & SCF_TRIE_DOING_RESTUDY)
3955 /* ? quantifier ok, except for (?{ ... }) */
3956 && (next_is_eval || !(mincount == 0 && maxcount == 1))
3957 && (minnext == 0) && (deltanext == 0)
3958 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3959 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3961 /* Fatal warnings may leak the regexp without this: */
3962 SAVEFREESV(RExC_rx_sv);
3963 ckWARNreg(RExC_parse,
3964 "Quantifier unexpected on zero-length expression");
3965 (void)ReREFCNT_inc(RExC_rx_sv);
3968 min += minnext * mincount;
3969 is_inf_internal |= deltanext == SSize_t_MAX
3970 || (maxcount == REG_INFTY && minnext + deltanext > 0);
3971 is_inf |= is_inf_internal;
3973 delta = SSize_t_MAX;
3975 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3977 /* Try powerful optimization CURLYX => CURLYN. */
3978 if ( OP(oscan) == CURLYX && data
3979 && data->flags & SF_IN_PAR
3980 && !(data->flags & SF_HAS_EVAL)
3981 && !deltanext && minnext == 1 ) {
3982 /* Try to optimize to CURLYN. */
3983 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3984 regnode * const nxt1 = nxt;
3991 if (!REGNODE_SIMPLE(OP(nxt))
3992 && !(PL_regkind[OP(nxt)] == EXACT
3993 && STR_LEN(nxt) == 1))
3999 if (OP(nxt) != CLOSE)
4001 if (RExC_open_parens) {
4002 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4003 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4005 /* Now we know that nxt2 is the only contents: */
4006 oscan->flags = (U8)ARG(nxt);
4008 OP(nxt1) = NOTHING; /* was OPEN. */
4011 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4012 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4013 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4014 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4015 OP(nxt + 1) = OPTIMIZED; /* was count. */
4016 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4021 /* Try optimization CURLYX => CURLYM. */
4022 if ( OP(oscan) == CURLYX && data
4023 && !(data->flags & SF_HAS_PAR)
4024 && !(data->flags & SF_HAS_EVAL)
4025 && !deltanext /* atom is fixed width */
4026 && minnext != 0 /* CURLYM can't handle zero width */
4027 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4029 /* XXXX How to optimize if data == 0? */
4030 /* Optimize to a simpler form. */
4031 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4035 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4036 && (OP(nxt2) != WHILEM))
4038 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4039 /* Need to optimize away parenths. */
4040 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4041 /* Set the parenth number. */
4042 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4044 oscan->flags = (U8)ARG(nxt);
4045 if (RExC_open_parens) {
4046 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4047 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4049 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4050 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4053 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4054 OP(nxt + 1) = OPTIMIZED; /* was count. */
4055 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4056 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4059 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4060 regnode *nnxt = regnext(nxt1);
4062 if (reg_off_by_arg[OP(nxt1)])
4063 ARG_SET(nxt1, nxt2 - nxt1);
4064 else if (nxt2 - nxt1 < U16_MAX)
4065 NEXT_OFF(nxt1) = nxt2 - nxt1;
4067 OP(nxt) = NOTHING; /* Cannot beautify */
4072 /* Optimize again: */
4073 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4074 NULL, stopparen, recursed, NULL, 0,depth+1);
4079 else if ((OP(oscan) == CURLYX)
4080 && (flags & SCF_WHILEM_VISITED_POS)
4081 /* See the comment on a similar expression above.
4082 However, this time it's not a subexpression
4083 we care about, but the expression itself. */
4084 && (maxcount == REG_INFTY)
4085 && data && ++data->whilem_c < 16) {
4086 /* This stays as CURLYX, we can put the count/of pair. */
4087 /* Find WHILEM (as in regexec.c) */
4088 regnode *nxt = oscan + NEXT_OFF(oscan);
4090 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4092 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4093 | (RExC_whilem_seen << 4)); /* On WHILEM */
4095 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4097 if (flags & SCF_DO_SUBSTR) {
4098 SV *last_str = NULL;
4099 int counted = mincount != 0;
4101 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4102 #if defined(SPARC64_GCC_WORKAROUND)
4105 const char *s = NULL;
4108 if (pos_before >= data->last_start_min)
4111 b = data->last_start_min;
4114 s = SvPV_const(data->last_found, l);
4115 old = b - data->last_start_min;
4118 SSize_t b = pos_before >= data->last_start_min
4119 ? pos_before : data->last_start_min;
4121 const char * const s = SvPV_const(data->last_found, l);
4122 SSize_t old = b - data->last_start_min;
4126 old = utf8_hop((U8*)s, old) - (U8*)s;
4128 /* Get the added string: */
4129 last_str = newSVpvn_utf8(s + old, l, UTF);
4130 if (deltanext == 0 && pos_before == b) {
4131 /* What was added is a constant string */
4133 SvGROW(last_str, (mincount * l) + 1);
4134 repeatcpy(SvPVX(last_str) + l,
4135 SvPVX_const(last_str), l, mincount - 1);
4136 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4137 /* Add additional parts. */
4138 SvCUR_set(data->last_found,
4139 SvCUR(data->last_found) - l);
4140 sv_catsv(data->last_found, last_str);
4142 SV * sv = data->last_found;
4144 SvUTF8(sv) && SvMAGICAL(sv) ?
4145 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4146 if (mg && mg->mg_len >= 0)
4147 mg->mg_len += CHR_SVLEN(last_str) - l;
4149 data->last_end += l * (mincount - 1);
4152 /* start offset must point into the last copy */
4153 data->last_start_min += minnext * (mincount - 1);
4154 data->last_start_max += is_inf ? SSize_t_MAX
4155 : (maxcount - 1) * (minnext + data->pos_delta);
4158 /* It is counted once already... */
4159 data->pos_min += minnext * (mincount - counted);
4161 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4162 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4163 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4164 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4166 if (deltanext != SSize_t_MAX)
4167 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4168 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4169 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4171 if (deltanext == SSize_t_MAX ||
4172 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4173 data->pos_delta = SSize_t_MAX;
4175 data->pos_delta += - counted * deltanext +
4176 (minnext + deltanext) * maxcount - minnext * mincount;
4177 if (mincount != maxcount) {
4178 /* Cannot extend fixed substrings found inside
4180 SCAN_COMMIT(pRExC_state,data,minlenp);
4181 if (mincount && last_str) {
4182 SV * const sv = data->last_found;
4183 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4184 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4188 sv_setsv(sv, last_str);
4189 data->last_end = data->pos_min;
4190 data->last_start_min =
4191 data->pos_min - CHR_SVLEN(last_str);
4192 data->last_start_max = is_inf
4194 : data->pos_min + data->pos_delta
4195 - CHR_SVLEN(last_str);
4197 data->longest = &(data->longest_float);
4199 SvREFCNT_dec(last_str);
4201 if (data && (fl & SF_HAS_EVAL))
4202 data->flags |= SF_HAS_EVAL;
4203 optimize_curly_tail:
4204 if (OP(oscan) != CURLYX) {
4205 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4207 NEXT_OFF(oscan) += NEXT_OFF(next);
4210 default: /* REF, and CLUMP only? */
4211 if (flags & SCF_DO_SUBSTR) {
4212 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4213 data->longest = &(data->longest_float);
4215 is_inf = is_inf_internal = 1;
4216 if (flags & SCF_DO_STCLASS_OR)
4217 cl_anything(pRExC_state, data->start_class);
4218 flags &= ~SCF_DO_STCLASS;
4222 else if (OP(scan) == LNBREAK) {
4223 if (flags & SCF_DO_STCLASS) {
4225 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4226 if (flags & SCF_DO_STCLASS_AND) {
4227 for (value = 0; value < 256; value++)
4228 if (!is_VERTWS_cp(value))
4229 ANYOF_BITMAP_CLEAR(data->start_class, value);
4232 for (value = 0; value < 256; value++)
4233 if (is_VERTWS_cp(value))
4234 ANYOF_BITMAP_SET(data->start_class, value);
4236 if (flags & SCF_DO_STCLASS_OR)
4237 cl_and(data->start_class, and_withp);
4238 flags &= ~SCF_DO_STCLASS;
4241 delta++; /* Because of the 2 char string cr-lf */
4242 if (flags & SCF_DO_SUBSTR) {
4243 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4245 data->pos_delta += 1;
4246 data->longest = &(data->longest_float);
4249 else if (REGNODE_SIMPLE(OP(scan))) {
4252 if (flags & SCF_DO_SUBSTR) {
4253 SCAN_COMMIT(pRExC_state,data,minlenp);
4257 if (flags & SCF_DO_STCLASS) {
4259 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4261 /* Some of the logic below assumes that switching
4262 locale on will only add false positives. */
4263 switch (PL_regkind[OP(scan)]) {
4269 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4272 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4273 cl_anything(pRExC_state, data->start_class);
4276 if (OP(scan) == SANY)
4278 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4279 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4280 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4281 cl_anything(pRExC_state, data->start_class);
4283 if (flags & SCF_DO_STCLASS_AND || !value)
4284 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4287 if (flags & SCF_DO_STCLASS_AND)
4288 cl_and(data->start_class,
4289 (struct regnode_charclass_class*)scan);
4291 cl_or(pRExC_state, data->start_class,
4292 (struct regnode_charclass_class*)scan);
4300 classnum = FLAGS(scan);
4301 if (flags & SCF_DO_STCLASS_AND) {
4302 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4303 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4304 for (value = 0; value < loop_max; value++) {
4305 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4306 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4312 if (data->start_class->flags & ANYOF_LOCALE) {
4313 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4317 /* Even if under locale, set the bits for non-locale
4318 * in case it isn't a true locale-node. This will
4319 * create false positives if it truly is locale */
4320 for (value = 0; value < loop_max; value++) {
4321 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4322 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4334 classnum = FLAGS(scan);
4335 if (flags & SCF_DO_STCLASS_AND) {
4336 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4337 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4338 for (value = 0; value < loop_max; value++) {
4339 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4340 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4346 if (data->start_class->flags & ANYOF_LOCALE) {
4347 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4351 /* Even if under locale, set the bits for non-locale in
4352 * case it isn't a true locale-node. This will create
4353 * false positives if it truly is locale */
4354 for (value = 0; value < loop_max; value++) {
4355 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4356 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4359 if (PL_regkind[OP(scan)] == NPOSIXD) {
4360 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4366 if (flags & SCF_DO_STCLASS_OR)
4367 cl_and(data->start_class, and_withp);
4368 flags &= ~SCF_DO_STCLASS;
4371 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4372 data->flags |= (OP(scan) == MEOL
4375 SCAN_COMMIT(pRExC_state, data, minlenp);
4378 else if ( PL_regkind[OP(scan)] == BRANCHJ
4379 /* Lookbehind, or need to calculate parens/evals/stclass: */
4380 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4381 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4382 if ( OP(scan) == UNLESSM &&
4384 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4385 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4388 regnode *upto= regnext(scan);
4390 SV * const mysv_val=sv_newmortal();
4391 DEBUG_STUDYDATA("OPFAIL",data,depth);
4393 /*DEBUG_PARSE_MSG("opfail");*/
4394 regprop(RExC_rx, mysv_val, upto);
4395 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4396 SvPV_nolen_const(mysv_val),
4397 (IV)REG_NODE_NUM(upto),
4402 NEXT_OFF(scan) = upto - scan;
4403 for (opt= scan + 1; opt < upto ; opt++)
4404 OP(opt) = OPTIMIZED;
4408 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4409 || OP(scan) == UNLESSM )
4411 /* Negative Lookahead/lookbehind
4412 In this case we can't do fixed string optimisation.
4415 SSize_t deltanext, minnext, fake = 0;
4417 struct regnode_charclass_class intrnl;
4420 data_fake.flags = 0;
4422 data_fake.whilem_c = data->whilem_c;
4423 data_fake.last_closep = data->last_closep;
4426 data_fake.last_closep = &fake;
4427 data_fake.pos_delta = delta;
4428 if ( flags & SCF_DO_STCLASS && !scan->flags
4429 && OP(scan) == IFMATCH ) { /* Lookahead */
4430 cl_init(pRExC_state, &intrnl);
4431 data_fake.start_class = &intrnl;
4432 f |= SCF_DO_STCLASS_AND;
4434 if (flags & SCF_WHILEM_VISITED_POS)
4435 f |= SCF_WHILEM_VISITED_POS;
4436 next = regnext(scan);
4437 nscan = NEXTOPER(NEXTOPER(scan));
4438 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4439 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4442 FAIL("Variable length lookbehind not implemented");
4444 else if (minnext > (I32)U8_MAX) {
4445 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4447 scan->flags = (U8)minnext;
4450 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4452 if (data_fake.flags & SF_HAS_EVAL)
4453 data->flags |= SF_HAS_EVAL;
4454 data->whilem_c = data_fake.whilem_c;
4456 if (f & SCF_DO_STCLASS_AND) {
4457 if (flags & SCF_DO_STCLASS_OR) {
4458 /* OR before, AND after: ideally we would recurse with
4459 * data_fake to get the AND applied by study of the
4460 * remainder of the pattern, and then derecurse;
4461 * *** HACK *** for now just treat as "no information".
4462 * See [perl #56690].
4464 cl_init(pRExC_state, data->start_class);
4466 /* AND before and after: combine and continue */
4467 const int was = TEST_SSC_EOS(data->start_class);
4469 cl_and(data->start_class, &intrnl);
4471 SET_SSC_EOS(data->start_class);
4475 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4477 /* Positive Lookahead/lookbehind
4478 In this case we can do fixed string optimisation,
4479 but we must be careful about it. Note in the case of
4480 lookbehind the positions will be offset by the minimum
4481 length of the pattern, something we won't know about
4482 until after the recurse.
4487 struct regnode_charclass_class intrnl;
4489 /* We use SAVEFREEPV so that when the full compile
4490 is finished perl will clean up the allocated
4491 minlens when it's all done. This way we don't
4492 have to worry about freeing them when we know
4493 they wont be used, which would be a pain.
4496 Newx( minnextp, 1, SSize_t );
4497 SAVEFREEPV(minnextp);
4500 StructCopy(data, &data_fake, scan_data_t);
4501 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4504 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4505 data_fake.last_found=newSVsv(data->last_found);
4509 data_fake.last_closep = &fake;
4510 data_fake.flags = 0;
4511 data_fake.pos_delta = delta;
4513 data_fake.flags |= SF_IS_INF;
4514 if ( flags & SCF_DO_STCLASS && !scan->flags
4515 && OP(scan) == IFMATCH ) { /* Lookahead */
4516 cl_init(pRExC_state, &intrnl);
4517 data_fake.start_class = &intrnl;
4518 f |= SCF_DO_STCLASS_AND;
4520 if (flags & SCF_WHILEM_VISITED_POS)
4521 f |= SCF_WHILEM_VISITED_POS;
4522 next = regnext(scan);
4523 nscan = NEXTOPER(NEXTOPER(scan));
4525 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4526 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4529 FAIL("Variable length lookbehind not implemented");
4531 else if (*minnextp > (I32)U8_MAX) {
4532 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4534 scan->flags = (U8)*minnextp;
4539 if (f & SCF_DO_STCLASS_AND) {
4540 const int was = TEST_SSC_EOS(data.start_class);
4542 cl_and(data->start_class, &intrnl);
4544 SET_SSC_EOS(data->start_class);
4547 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4549 if (data_fake.flags & SF_HAS_EVAL)
4550 data->flags |= SF_HAS_EVAL;
4551 data->whilem_c = data_fake.whilem_c;
4552 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4553 if (RExC_rx->minlen<*minnextp)
4554 RExC_rx->minlen=*minnextp;
4555 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4556 SvREFCNT_dec_NN(data_fake.last_found);
4558 if ( data_fake.minlen_fixed != minlenp )
4560 data->offset_fixed= data_fake.offset_fixed;
4561 data->minlen_fixed= data_fake.minlen_fixed;
4562 data->lookbehind_fixed+= scan->flags;
4564 if ( data_fake.minlen_float != minlenp )
4566 data->minlen_float= data_fake.minlen_float;
4567 data->offset_float_min=data_fake.offset_float_min;
4568 data->offset_float_max=data_fake.offset_float_max;
4569 data->lookbehind_float+= scan->flags;
4576 else if (OP(scan) == OPEN) {
4577 if (stopparen != (I32)ARG(scan))
4580 else if (OP(scan) == CLOSE) {
4581 if (stopparen == (I32)ARG(scan)) {
4584 if ((I32)ARG(scan) == is_par) {
4585 next = regnext(scan);
4587 if ( next && (OP(next) != WHILEM) && next < last)
4588 is_par = 0; /* Disable optimization */
4591 *(data->last_closep) = ARG(scan);
4593 else if (OP(scan) == EVAL) {
4595 data->flags |= SF_HAS_EVAL;
4597 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4598 if (flags & SCF_DO_SUBSTR) {
4599 SCAN_COMMIT(pRExC_state,data,minlenp);
4600 flags &= ~SCF_DO_SUBSTR;
4602 if (data && OP(scan)==ACCEPT) {
4603 data->flags |= SCF_SEEN_ACCEPT;
4608 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4610 if (flags & SCF_DO_SUBSTR) {
4611 SCAN_COMMIT(pRExC_state,data,minlenp);
4612 data->longest = &(data->longest_float);
4614 is_inf = is_inf_internal = 1;
4615 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4616 cl_anything(pRExC_state, data->start_class);
4617 flags &= ~SCF_DO_STCLASS;
4619 else if (OP(scan) == GPOS) {
4620 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4621 !(delta || is_inf || (data && data->pos_delta)))
4623 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4624 RExC_rx->extflags |= RXf_ANCH_GPOS;
4625 if (RExC_rx->gofs < (STRLEN)min)
4626 RExC_rx->gofs = min;
4628 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4632 #ifdef TRIE_STUDY_OPT
4633 #ifdef FULL_TRIE_STUDY
4634 else if (PL_regkind[OP(scan)] == TRIE) {
4635 /* NOTE - There is similar code to this block above for handling
4636 BRANCH nodes on the initial study. If you change stuff here
4638 regnode *trie_node= scan;
4639 regnode *tail= regnext(scan);
4640 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4641 SSize_t max1 = 0, min1 = SSize_t_MAX;
4642 struct regnode_charclass_class accum;
4644 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4645 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4646 if (flags & SCF_DO_STCLASS)
4647 cl_init_zero(pRExC_state, &accum);
4653 const regnode *nextbranch= NULL;
4656 for ( word=1 ; word <= trie->wordcount ; word++)
4658 SSize_t deltanext=0, minnext=0, f = 0, fake;
4659 struct regnode_charclass_class this_class;
4661 data_fake.flags = 0;
4663 data_fake.whilem_c = data->whilem_c;
4664 data_fake.last_closep = data->last_closep;
4667 data_fake.last_closep = &fake;
4668 data_fake.pos_delta = delta;
4669 if (flags & SCF_DO_STCLASS) {
4670 cl_init(pRExC_state, &this_class);
4671 data_fake.start_class = &this_class;
4672 f = SCF_DO_STCLASS_AND;
4674 if (flags & SCF_WHILEM_VISITED_POS)
4675 f |= SCF_WHILEM_VISITED_POS;
4677 if (trie->jump[word]) {
4679 nextbranch = trie_node + trie->jump[0];
4680 scan= trie_node + trie->jump[word];
4681 /* We go from the jump point to the branch that follows
4682 it. Note this means we need the vestigal unused branches
4683 even though they arent otherwise used.
4685 minnext = study_chunk(pRExC_state, &scan, minlenp,
4686 &deltanext, (regnode *)nextbranch, &data_fake,
4687 stopparen, recursed, NULL, f,depth+1);
4689 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4690 nextbranch= regnext((regnode*)nextbranch);
4692 if (min1 > (SSize_t)(minnext + trie->minlen))
4693 min1 = minnext + trie->minlen;
4694 if (deltanext == SSize_t_MAX) {
4695 is_inf = is_inf_internal = 1;
4697 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4698 max1 = minnext + deltanext + trie->maxlen;
4700 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4702 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4703 if ( stopmin > min + min1)
4704 stopmin = min + min1;
4705 flags &= ~SCF_DO_SUBSTR;
4707 data->flags |= SCF_SEEN_ACCEPT;
4710 if (data_fake.flags & SF_HAS_EVAL)
4711 data->flags |= SF_HAS_EVAL;
4712 data->whilem_c = data_fake.whilem_c;
4714 if (flags & SCF_DO_STCLASS)
4715 cl_or(pRExC_state, &accum, &this_class);
4718 if (flags & SCF_DO_SUBSTR) {
4719 data->pos_min += min1;
4720 data->pos_delta += max1 - min1;
4721 if (max1 != min1 || is_inf)
4722 data->longest = &(data->longest_float);
4725 delta += max1 - min1;
4726 if (flags & SCF_DO_STCLASS_OR) {
4727 cl_or(pRExC_state, data->start_class, &accum);
4729 cl_and(data->start_class, and_withp);
4730 flags &= ~SCF_DO_STCLASS;
4733 else if (flags & SCF_DO_STCLASS_AND) {
4735 cl_and(data->start_class, &accum);
4736 flags &= ~SCF_DO_STCLASS;
4739 /* Switch to OR mode: cache the old value of
4740 * data->start_class */
4742 StructCopy(data->start_class, and_withp,
4743 struct regnode_charclass_class);
4744 flags &= ~SCF_DO_STCLASS_AND;
4745 StructCopy(&accum, data->start_class,
4746 struct regnode_charclass_class);
4747 flags |= SCF_DO_STCLASS_OR;
4748 SET_SSC_EOS(data->start_class);
4755 else if (PL_regkind[OP(scan)] == TRIE) {
4756 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4759 min += trie->minlen;
4760 delta += (trie->maxlen - trie->minlen);
4761 flags &= ~SCF_DO_STCLASS; /* xxx */
4762 if (flags & SCF_DO_SUBSTR) {
4763 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4764 data->pos_min += trie->minlen;
4765 data->pos_delta += (trie->maxlen - trie->minlen);
4766 if (trie->maxlen != trie->minlen)
4767 data->longest = &(data->longest_float);
4769 if (trie->jump) /* no more substrings -- for now /grr*/
4770 flags &= ~SCF_DO_SUBSTR;
4772 #endif /* old or new */
4773 #endif /* TRIE_STUDY_OPT */
4775 /* Else: zero-length, ignore. */
4776 scan = regnext(scan);
4781 stopparen = frame->stop;
4782 frame = frame->prev;
4783 goto fake_study_recurse;
4788 DEBUG_STUDYDATA("pre-fin:",data,depth);
4791 *deltap = is_inf_internal ? SSize_t_MAX : delta;
4792 if (flags & SCF_DO_SUBSTR && is_inf)
4793 data->pos_delta = SSize_t_MAX - data->pos_min;
4794 if (is_par > (I32)U8_MAX)
4796 if (is_par && pars==1 && data) {
4797 data->flags |= SF_IN_PAR;
4798 data->flags &= ~SF_HAS_PAR;
4800 else if (pars && data) {
4801 data->flags |= SF_HAS_PAR;
4802 data->flags &= ~SF_IN_PAR;
4804 if (flags & SCF_DO_STCLASS_OR)
4805 cl_and(data->start_class, and_withp);
4806 if (flags & SCF_TRIE_RESTUDY)
4807 data->flags |= SCF_TRIE_RESTUDY;
4809 DEBUG_STUDYDATA("post-fin:",data,depth);
4811 return min < stopmin ? min : stopmin;
4815 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4817 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4819 PERL_ARGS_ASSERT_ADD_DATA;
4821 Renewc(RExC_rxi->data,
4822 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4823 char, struct reg_data);
4825 Renew(RExC_rxi->data->what, count + n, U8);
4827 Newx(RExC_rxi->data->what, n, U8);
4828 RExC_rxi->data->count = count + n;
4829 Copy(s, RExC_rxi->data->what + count, n, U8);
4833 /*XXX: todo make this not included in a non debugging perl */
4834 #ifndef PERL_IN_XSUB_RE
4836 Perl_reginitcolors(pTHX)
4839 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4841 char *t = savepv(s);
4845 t = strchr(t, '\t');
4851 PL_colors[i] = t = (char *)"";
4856 PL_colors[i++] = (char *)"";
4863 #ifdef TRIE_STUDY_OPT
4864 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4867 (data.flags & SCF_TRIE_RESTUDY) \
4875 #define CHECK_RESTUDY_GOTO_butfirst
4879 * pregcomp - compile a regular expression into internal code
4881 * Decides which engine's compiler to call based on the hint currently in
4885 #ifndef PERL_IN_XSUB_RE
4887 /* return the currently in-scope regex engine (or the default if none) */
4889 regexp_engine const *
4890 Perl_current_re_engine(pTHX)
4894 if (IN_PERL_COMPILETIME) {
4895 HV * const table = GvHV(PL_hintgv);
4898 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
4899 return &PL_core_reg_engine;
4900 ptr = hv_fetchs(table, "regcomp", FALSE);
4901 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4902 return &PL_core_reg_engine;
4903 return INT2PTR(regexp_engine*,SvIV(*ptr));
4907 if (!PL_curcop->cop_hints_hash)
4908 return &PL_core_reg_engine;
4909 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4910 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4911 return &PL_core_reg_engine;
4912 return INT2PTR(regexp_engine*,SvIV(ptr));
4918 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4921 regexp_engine const *eng = current_re_engine();
4922 GET_RE_DEBUG_FLAGS_DECL;
4924 PERL_ARGS_ASSERT_PREGCOMP;
4926 /* Dispatch a request to compile a regexp to correct regexp engine. */
4928 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4931 return CALLREGCOMP_ENG(eng, pattern, flags);
4935 /* public(ish) entry point for the perl core's own regex compiling code.
4936 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4937 * pattern rather than a list of OPs, and uses the internal engine rather
4938 * than the current one */
4941 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4943 SV *pat = pattern; /* defeat constness! */
4944 PERL_ARGS_ASSERT_RE_COMPILE;
4945 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4946 #ifdef PERL_IN_XSUB_RE
4949 &PL_core_reg_engine,
4951 NULL, NULL, rx_flags, 0);
4955 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4956 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4957 * point to the realloced string and length.
4959 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4963 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4964 char **pat_p, STRLEN *plen_p, int num_code_blocks)
4966 U8 *const src = (U8*)*pat_p;
4969 STRLEN s = 0, d = 0;
4971 GET_RE_DEBUG_FLAGS_DECL;
4973 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4974 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4976 Newx(dst, *plen_p * 2 + 1, U8);
4978 while (s < *plen_p) {
4979 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
4982 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
4983 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
4985 if (n < num_code_blocks) {
4986 if (!do_end && pRExC_state->code_blocks[n].start == s) {
4987 pRExC_state->code_blocks[n].start = d;
4988 assert(dst[d] == '(');
4991 else if (do_end && pRExC_state->code_blocks[n].end == s) {
4992 pRExC_state->code_blocks[n].end = d;
4993 assert(dst[d] == ')');
5003 *pat_p = (char*) dst;
5005 RExC_orig_utf8 = RExC_utf8 = 1;
5010 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5011 * while recording any code block indices, and handling overloading,
5012 * nested qr// objects etc. If pat is null, it will allocate a new
5013 * string, or just return the first arg, if there's only one.
5015 * Returns the malloced/updated pat.
5016 * patternp and pat_count is the array of SVs to be concatted;
5017 * oplist is the optional list of ops that generated the SVs;
5018 * recompile_p is a pointer to a boolean that will be set if
5019 * the regex will need to be recompiled.
5020 * delim, if non-null is an SV that will be inserted between each element
5024 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5025 SV *pat, SV ** const patternp, int pat_count,
5026 OP *oplist, bool *recompile_p, SV *delim)
5030 bool use_delim = FALSE;
5031 bool alloced = FALSE;
5033 /* if we know we have at least two args, create an empty string,
5034 * then concatenate args to that. For no args, return an empty string */
5035 if (!pat && pat_count != 1) {
5036 pat = newSVpvn("", 0);
5041 for (svp = patternp; svp < patternp + pat_count; svp++) {
5044 STRLEN orig_patlen = 0;
5046 SV *msv = use_delim ? delim : *svp;
5047 if (!msv) msv = &PL_sv_undef;
5049 /* if we've got a delimiter, we go round the loop twice for each
5050 * svp slot (except the last), using the delimiter the second
5059 if (SvTYPE(msv) == SVt_PVAV) {
5060 /* we've encountered an interpolated array within
5061 * the pattern, e.g. /...@a..../. Expand the list of elements,
5062 * then recursively append elements.
5063 * The code in this block is based on S_pushav() */
5065 AV *const av = (AV*)msv;
5066 const SSize_t maxarg = AvFILL(av) + 1;
5070 assert(oplist->op_type == OP_PADAV
5071 || oplist->op_type == OP_RV2AV);
5072 oplist = oplist->op_sibling;;
5075 if (SvRMAGICAL(av)) {
5078 Newx(array, maxarg, SV*);
5080 for (i=0; i < maxarg; i++) {
5081 SV ** const svp = av_fetch(av, i, FALSE);
5082 array[i] = svp ? *svp : &PL_sv_undef;
5086 array = AvARRAY(av);
5088 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5089 array, maxarg, NULL, recompile_p,
5091 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5097 /* we make the assumption here that each op in the list of
5098 * op_siblings maps to one SV pushed onto the stack,
5099 * except for code blocks, with have both an OP_NULL and
5101 * This allows us to match up the list of SVs against the
5102 * list of OPs to find the next code block.
5104 * Note that PUSHMARK PADSV PADSV ..
5106 * PADRANGE PADSV PADSV ..
5107 * so the alignment still works. */
5110 if (oplist->op_type == OP_NULL
5111 && (oplist->op_flags & OPf_SPECIAL))
5113 assert(n < pRExC_state->num_code_blocks);
5114 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5115 pRExC_state->code_blocks[n].block = oplist;
5116 pRExC_state->code_blocks[n].src_regex = NULL;
5119 oplist = oplist->op_sibling; /* skip CONST */
5122 oplist = oplist->op_sibling;;
5125 /* apply magic and QR overloading to arg */
5128 if (SvROK(msv) && SvAMAGIC(msv)) {
5129 SV *sv = AMG_CALLunary(msv, regexp_amg);
5133 if (SvTYPE(sv) != SVt_REGEXP)
5134 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5139 /* try concatenation overload ... */
5140 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5141 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5144 /* overloading involved: all bets are off over literal
5145 * code. Pretend we haven't seen it */
5146 pRExC_state->num_code_blocks -= n;
5150 /* ... or failing that, try "" overload */
5151 while (SvAMAGIC(msv)
5152 && (sv = AMG_CALLunary(msv, string_amg))
5156 && SvRV(msv) == SvRV(sv))
5161 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5165 /* this is a partially unrolled
5166 * sv_catsv_nomg(pat, msv);
5167 * that allows us to adjust code block indices if
5170 char *dst = SvPV_force_nomg(pat, dlen);
5172 if (SvUTF8(msv) && !SvUTF8(pat)) {
5173 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5174 sv_setpvn(pat, dst, dlen);
5177 sv_catsv_nomg(pat, msv);
5184 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5187 /* extract any code blocks within any embedded qr//'s */
5188 if (rx && SvTYPE(rx) == SVt_REGEXP
5189 && RX_ENGINE((REGEXP*)rx)->op_comp)
5192 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5193 if (ri->num_code_blocks) {
5195 /* the presence of an embedded qr// with code means
5196 * we should always recompile: the text of the
5197 * qr// may not have changed, but it may be a
5198 * different closure than last time */
5200 Renew(pRExC_state->code_blocks,
5201 pRExC_state->num_code_blocks + ri->num_code_blocks,
5202 struct reg_code_block);
5203 pRExC_state->num_code_blocks += ri->num_code_blocks;
5205 for (i=0; i < ri->num_code_blocks; i++) {
5206 struct reg_code_block *src, *dst;
5207 STRLEN offset = orig_patlen
5208 + ReANY((REGEXP *)rx)->pre_prefix;
5209 assert(n < pRExC_state->num_code_blocks);
5210 src = &ri->code_blocks[i];
5211 dst = &pRExC_state->code_blocks[n];
5212 dst->start = src->start + offset;
5213 dst->end = src->end + offset;
5214 dst->block = src->block;
5215 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5224 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5233 /* see if there are any run-time code blocks in the pattern.
5234 * False positives are allowed */
5237 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5238 char *pat, STRLEN plen)
5243 for (s = 0; s < plen; s++) {
5244 if (n < pRExC_state->num_code_blocks
5245 && s == pRExC_state->code_blocks[n].start)
5247 s = pRExC_state->code_blocks[n].end;
5251 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5253 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5255 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5262 /* Handle run-time code blocks. We will already have compiled any direct
5263 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5264 * copy of it, but with any literal code blocks blanked out and
5265 * appropriate chars escaped; then feed it into
5267 * eval "qr'modified_pattern'"
5271 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5275 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5277 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5278 * and merge them with any code blocks of the original regexp.
5280 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5281 * instead, just save the qr and return FALSE; this tells our caller that
5282 * the original pattern needs upgrading to utf8.
5286 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5287 char *pat, STRLEN plen)
5291 GET_RE_DEBUG_FLAGS_DECL;
5293 if (pRExC_state->runtime_code_qr) {
5294 /* this is the second time we've been called; this should
5295 * only happen if the main pattern got upgraded to utf8
5296 * during compilation; re-use the qr we compiled first time
5297 * round (which should be utf8 too)
5299 qr = pRExC_state->runtime_code_qr;
5300 pRExC_state->runtime_code_qr = NULL;
5301 assert(RExC_utf8 && SvUTF8(qr));
5307 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5311 /* determine how many extra chars we need for ' and \ escaping */
5312 for (s = 0; s < plen; s++) {
5313 if (pat[s] == '\'' || pat[s] == '\\')
5317 Newx(newpat, newlen, char);
5319 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5321 for (s = 0; s < plen; s++) {
5322 if (n < pRExC_state->num_code_blocks
5323 && s == pRExC_state->code_blocks[n].start)
5325 /* blank out literal code block */
5326 assert(pat[s] == '(');
5327 while (s <= pRExC_state->code_blocks[n].end) {
5335 if (pat[s] == '\'' || pat[s] == '\\')
5340 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5344 PerlIO_printf(Perl_debug_log,
5345 "%sre-parsing pattern for runtime code:%s %s\n",
5346 PL_colors[4],PL_colors[5],newpat);
5349 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5355 PUSHSTACKi(PERLSI_REQUIRE);
5356 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5357 * parsing qr''; normally only q'' does this. It also alters
5359 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5360 SvREFCNT_dec_NN(sv);
5365 SV * const errsv = ERRSV;
5366 if (SvTRUE_NN(errsv))
5368 Safefree(pRExC_state->code_blocks);
5369 /* use croak_sv ? */
5370 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5373 assert(SvROK(qr_ref));
5375 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5376 /* the leaving below frees the tmp qr_ref.
5377 * Give qr a life of its own */
5385 if (!RExC_utf8 && SvUTF8(qr)) {
5386 /* first time through; the pattern got upgraded; save the
5387 * qr for the next time through */
5388 assert(!pRExC_state->runtime_code_qr);
5389 pRExC_state->runtime_code_qr = qr;
5394 /* extract any code blocks within the returned qr// */
5397 /* merge the main (r1) and run-time (r2) code blocks into one */
5399 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5400 struct reg_code_block *new_block, *dst;
5401 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5404 if (!r2->num_code_blocks) /* we guessed wrong */
5406 SvREFCNT_dec_NN(qr);
5411 r1->num_code_blocks + r2->num_code_blocks,
5412 struct reg_code_block);
5415 while ( i1 < r1->num_code_blocks
5416 || i2 < r2->num_code_blocks)
5418 struct reg_code_block *src;
5421 if (i1 == r1->num_code_blocks) {
5422 src = &r2->code_blocks[i2++];
5425 else if (i2 == r2->num_code_blocks)
5426 src = &r1->code_blocks[i1++];
5427 else if ( r1->code_blocks[i1].start
5428 < r2->code_blocks[i2].start)
5430 src = &r1->code_blocks[i1++];
5431 assert(src->end < r2->code_blocks[i2].start);
5434 assert( r1->code_blocks[i1].start
5435 > r2->code_blocks[i2].start);
5436 src = &r2->code_blocks[i2++];
5438 assert(src->end < r1->code_blocks[i1].start);
5441 assert(pat[src->start] == '(');
5442 assert(pat[src->end] == ')');
5443 dst->start = src->start;
5444 dst->end = src->end;
5445 dst->block = src->block;
5446 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5450 r1->num_code_blocks += r2->num_code_blocks;
5451 Safefree(r1->code_blocks);
5452 r1->code_blocks = new_block;
5455 SvREFCNT_dec_NN(qr);
5461 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5462 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5464 /* This is the common code for setting up the floating and fixed length
5465 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5466 * as to whether succeeded or not */
5471 if (! (longest_length
5472 || (eol /* Can't have SEOL and MULTI */
5473 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5475 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5476 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5481 /* copy the information about the longest from the reg_scan_data
5482 over to the program. */
5483 if (SvUTF8(sv_longest)) {
5484 *rx_utf8 = sv_longest;
5487 *rx_substr = sv_longest;
5490 /* end_shift is how many chars that must be matched that
5491 follow this item. We calculate it ahead of time as once the
5492 lookbehind offset is added in we lose the ability to correctly
5494 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5495 *rx_end_shift = ml - offset
5496 - longest_length + (SvTAIL(sv_longest) != 0)
5499 t = (eol/* Can't have SEOL and MULTI */
5500 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5501 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5507 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5508 * regular expression into internal code.
5509 * The pattern may be passed either as:
5510 * a list of SVs (patternp plus pat_count)
5511 * a list of OPs (expr)
5512 * If both are passed, the SV list is used, but the OP list indicates
5513 * which SVs are actually pre-compiled code blocks
5515 * The SVs in the list have magic and qr overloading applied to them (and
5516 * the list may be modified in-place with replacement SVs in the latter
5519 * If the pattern hasn't changed from old_re, then old_re will be
5522 * eng is the current engine. If that engine has an op_comp method, then
5523 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5524 * do the initial concatenation of arguments and pass on to the external
5527 * If is_bare_re is not null, set it to a boolean indicating whether the
5528 * arg list reduced (after overloading) to a single bare regex which has
5529 * been returned (i.e. /$qr/).
5531 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5533 * pm_flags contains the PMf_* flags, typically based on those from the
5534 * pm_flags field of the related PMOP. Currently we're only interested in
5535 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5537 * We can't allocate space until we know how big the compiled form will be,
5538 * but we can't compile it (and thus know how big it is) until we've got a
5539 * place to put the code. So we cheat: we compile it twice, once with code
5540 * generation turned off and size counting turned on, and once "for real".
5541 * This also means that we don't allocate space until we are sure that the
5542 * thing really will compile successfully, and we never have to move the
5543 * code and thus invalidate pointers into it. (Note that it has to be in
5544 * one piece because free() must be able to free it all.) [NB: not true in perl]
5546 * Beware that the optimization-preparation code in here knows about some
5547 * of the structure of the compiled regexp. [I'll say.]
5551 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5552 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5553 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5558 regexp_internal *ri;
5566 SV *code_blocksv = NULL;
5567 SV** new_patternp = patternp;
5569 /* these are all flags - maybe they should be turned
5570 * into a single int with different bit masks */
5571 I32 sawlookahead = 0;
5576 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5578 bool runtime_code = 0;
5580 RExC_state_t RExC_state;
5581 RExC_state_t * const pRExC_state = &RExC_state;
5582 #ifdef TRIE_STUDY_OPT
5584 RExC_state_t copyRExC_state;
5586 GET_RE_DEBUG_FLAGS_DECL;
5588 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5590 DEBUG_r(if (!PL_colorset) reginitcolors());
5592 #ifndef PERL_IN_XSUB_RE
5593 /* Initialize these here instead of as-needed, as is quick and avoids
5594 * having to test them each time otherwise */
5595 if (! PL_AboveLatin1) {
5596 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5597 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5598 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5600 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5601 = _new_invlist_C_array(L1PosixAlnum_invlist);
5602 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5603 = _new_invlist_C_array(PosixAlnum_invlist);
5605 PL_L1Posix_ptrs[_CC_ALPHA]
5606 = _new_invlist_C_array(L1PosixAlpha_invlist);
5607 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5609 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5610 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5612 /* Cased is the same as Alpha in the ASCII range */
5613 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5614 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5616 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5617 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5619 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5620 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5622 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5623 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5625 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5626 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5628 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5629 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5631 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5632 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5634 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5635 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5636 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5637 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5639 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5640 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5642 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5644 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5645 PL_L1Posix_ptrs[_CC_WORDCHAR]
5646 = _new_invlist_C_array(L1PosixWord_invlist);
5648 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5649 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5651 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5655 pRExC_state->code_blocks = NULL;
5656 pRExC_state->num_code_blocks = 0;
5659 *is_bare_re = FALSE;
5661 if (expr && (expr->op_type == OP_LIST ||
5662 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5663 /* allocate code_blocks if needed */
5667 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5668 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5669 ncode++; /* count of DO blocks */
5671 pRExC_state->num_code_blocks = ncode;
5672 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5677 /* compile-time pattern with just OP_CONSTs and DO blocks */
5682 /* find how many CONSTs there are */
5685 if (expr->op_type == OP_CONST)
5688 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5689 if (o->op_type == OP_CONST)
5693 /* fake up an SV array */
5695 assert(!new_patternp);
5696 Newx(new_patternp, n, SV*);
5697 SAVEFREEPV(new_patternp);
5701 if (expr->op_type == OP_CONST)
5702 new_patternp[n] = cSVOPx_sv(expr);
5704 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5705 if (o->op_type == OP_CONST)
5706 new_patternp[n++] = cSVOPo_sv;
5711 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5712 "Assembling pattern from %d elements%s\n", pat_count,
5713 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5715 /* set expr to the first arg op */
5717 if (pRExC_state->num_code_blocks
5718 && expr->op_type != OP_CONST)
5720 expr = cLISTOPx(expr)->op_first;
5721 assert( expr->op_type == OP_PUSHMARK
5722 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5723 || expr->op_type == OP_PADRANGE);
5724 expr = expr->op_sibling;
5727 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5728 expr, &recompile, NULL);
5730 /* handle bare (possibly after overloading) regex: foo =~ $re */
5735 if (SvTYPE(re) == SVt_REGEXP) {
5739 Safefree(pRExC_state->code_blocks);
5740 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5741 "Precompiled pattern%s\n",
5742 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5748 exp = SvPV_nomg(pat, plen);
5750 if (!eng->op_comp) {
5751 if ((SvUTF8(pat) && IN_BYTES)
5752 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5754 /* make a temporary copy; either to convert to bytes,
5755 * or to avoid repeating get-magic / overloaded stringify */
5756 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5757 (IN_BYTES ? 0 : SvUTF8(pat)));
5759 Safefree(pRExC_state->code_blocks);
5760 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5763 /* ignore the utf8ness if the pattern is 0 length */
5764 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5765 RExC_uni_semantics = 0;
5766 RExC_contains_locale = 0;
5767 pRExC_state->runtime_code_qr = NULL;
5770 SV *dsv= sv_newmortal();
5771 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5772 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5773 PL_colors[4],PL_colors[5],s);
5777 /* we jump here if we upgrade the pattern to utf8 and have to
5780 if ((pm_flags & PMf_USE_RE_EVAL)
5781 /* this second condition covers the non-regex literal case,
5782 * i.e. $foo =~ '(?{})'. */
5783 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5785 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5787 /* return old regex if pattern hasn't changed */
5788 /* XXX: note in the below we have to check the flags as well as the pattern.
5790 * Things get a touch tricky as we have to compare the utf8 flag independently
5791 * from the compile flags.
5796 && !!RX_UTF8(old_re) == !!RExC_utf8
5797 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5798 && RX_PRECOMP(old_re)
5799 && RX_PRELEN(old_re) == plen
5800 && memEQ(RX_PRECOMP(old_re), exp, plen)
5801 && !runtime_code /* with runtime code, always recompile */ )
5803 Safefree(pRExC_state->code_blocks);
5807 rx_flags = orig_rx_flags;
5809 if (initial_charset == REGEX_LOCALE_CHARSET) {
5810 RExC_contains_locale = 1;
5812 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5814 /* Set to use unicode semantics if the pattern is in utf8 and has the
5815 * 'depends' charset specified, as it means unicode when utf8 */
5816 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5820 RExC_flags = rx_flags;
5821 RExC_pm_flags = pm_flags;
5824 if (TAINTING_get && TAINT_get)
5825 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5827 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5828 /* whoops, we have a non-utf8 pattern, whilst run-time code
5829 * got compiled as utf8. Try again with a utf8 pattern */
5830 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5831 pRExC_state->num_code_blocks);
5832 goto redo_first_pass;
5835 assert(!pRExC_state->runtime_code_qr);
5840 RExC_in_lookbehind = 0;
5841 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5843 RExC_override_recoding = 0;
5844 RExC_in_multi_char_class = 0;
5846 /* First pass: determine size, legality. */
5849 RExC_end = exp + plen;
5854 RExC_emit = &RExC_emit_dummy;
5855 RExC_whilem_seen = 0;
5856 RExC_open_parens = NULL;
5857 RExC_close_parens = NULL;
5859 RExC_paren_names = NULL;
5861 RExC_paren_name_list = NULL;
5863 RExC_recurse = NULL;
5864 RExC_recurse_count = 0;
5865 pRExC_state->code_index = 0;
5867 #if 0 /* REGC() is (currently) a NOP at the first pass.
5868 * Clever compilers notice this and complain. --jhi */
5869 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5872 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5874 RExC_lastparse=NULL;
5876 /* reg may croak on us, not giving us a chance to free
5877 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5878 need it to survive as long as the regexp (qr/(?{})/).
5879 We must check that code_blocksv is not already set, because we may
5880 have jumped back to restart the sizing pass. */
5881 if (pRExC_state->code_blocks && !code_blocksv) {
5882 code_blocksv = newSV_type(SVt_PV);
5883 SAVEFREESV(code_blocksv);
5884 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5885 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5887 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5888 /* It's possible to write a regexp in ascii that represents Unicode
5889 codepoints outside of the byte range, such as via \x{100}. If we
5890 detect such a sequence we have to convert the entire pattern to utf8
5891 and then recompile, as our sizing calculation will have been based
5892 on 1 byte == 1 character, but we will need to use utf8 to encode
5893 at least some part of the pattern, and therefore must convert the whole
5896 if (flags & RESTART_UTF8) {
5897 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5898 pRExC_state->num_code_blocks);
5899 goto redo_first_pass;
5901 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5904 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5907 PerlIO_printf(Perl_debug_log,
5908 "Required size %"IVdf" nodes\n"
5909 "Starting second pass (creation)\n",
5912 RExC_lastparse=NULL;
5915 /* The first pass could have found things that force Unicode semantics */
5916 if ((RExC_utf8 || RExC_uni_semantics)
5917 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5919 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5922 /* Small enough for pointer-storage convention?
5923 If extralen==0, this means that we will not need long jumps. */
5924 if (RExC_size >= 0x10000L && RExC_extralen)
5925 RExC_size += RExC_extralen;
5928 if (RExC_whilem_seen > 15)
5929 RExC_whilem_seen = 15;
5931 /* Allocate space and zero-initialize. Note, the two step process
5932 of zeroing when in debug mode, thus anything assigned has to
5933 happen after that */
5934 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5936 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5937 char, regexp_internal);
5938 if ( r == NULL || ri == NULL )
5939 FAIL("Regexp out of space");
5941 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5942 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5944 /* bulk initialize base fields with 0. */
5945 Zero(ri, sizeof(regexp_internal), char);
5948 /* non-zero initialization begins here */
5951 r->extflags = rx_flags;
5952 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5954 if (pm_flags & PMf_IS_QR) {
5955 ri->code_blocks = pRExC_state->code_blocks;
5956 ri->num_code_blocks = pRExC_state->num_code_blocks;
5961 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5962 if (pRExC_state->code_blocks[n].src_regex)
5963 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5964 SAVEFREEPV(pRExC_state->code_blocks);
5968 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5969 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5971 /* The caret is output if there are any defaults: if not all the STD
5972 * flags are set, or if no character set specifier is needed */
5974 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5976 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5977 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5978 >> RXf_PMf_STD_PMMOD_SHIFT);
5979 const char *fptr = STD_PAT_MODS; /*"msix"*/
5981 /* Allocate for the worst case, which is all the std flags are turned
5982 * on. If more precision is desired, we could do a population count of
5983 * the flags set. This could be done with a small lookup table, or by
5984 * shifting, masking and adding, or even, when available, assembly
5985 * language for a machine-language population count.
5986 * We never output a minus, as all those are defaults, so are
5987 * covered by the caret */
5988 const STRLEN wraplen = plen + has_p + has_runon
5989 + has_default /* If needs a caret */
5991 /* If needs a character set specifier */
5992 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5993 + (sizeof(STD_PAT_MODS) - 1)
5994 + (sizeof("(?:)") - 1);
5996 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5997 r->xpv_len_u.xpvlenu_pv = p;
5999 SvFLAGS(rx) |= SVf_UTF8;
6002 /* If a default, cover it using the caret */
6004 *p++= DEFAULT_PAT_MOD;
6008 const char* const name = get_regex_charset_name(r->extflags, &len);
6009 Copy(name, p, len, char);
6013 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6016 while((ch = *fptr++)) {
6024 Copy(RExC_precomp, p, plen, char);
6025 assert ((RX_WRAPPED(rx) - p) < 16);
6026 r->pre_prefix = p - RX_WRAPPED(rx);
6032 SvCUR_set(rx, p - RX_WRAPPED(rx));
6036 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6038 if (RExC_seen & REG_SEEN_RECURSE) {
6039 Newxz(RExC_open_parens, RExC_npar,regnode *);
6040 SAVEFREEPV(RExC_open_parens);
6041 Newxz(RExC_close_parens,RExC_npar,regnode *);
6042 SAVEFREEPV(RExC_close_parens);
6045 /* Useful during FAIL. */
6046 #ifdef RE_TRACK_PATTERN_OFFSETS
6047 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6048 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6049 "%s %"UVuf" bytes for offset annotations.\n",
6050 ri->u.offsets ? "Got" : "Couldn't get",
6051 (UV)((2*RExC_size+1) * sizeof(U32))));
6053 SetProgLen(ri,RExC_size);
6058 /* Second pass: emit code. */
6059 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6060 RExC_pm_flags = pm_flags;
6062 RExC_end = exp + plen;
6065 RExC_emit_start = ri->program;
6066 RExC_emit = ri->program;
6067 RExC_emit_bound = ri->program + RExC_size + 1;
6068 pRExC_state->code_index = 0;
6070 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6071 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6073 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6075 /* XXXX To minimize changes to RE engine we always allocate
6076 3-units-long substrs field. */
6077 Newx(r->substrs, 1, struct reg_substr_data);
6078 if (RExC_recurse_count) {
6079 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6080 SAVEFREEPV(RExC_recurse);
6084 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6085 Zero(r->substrs, 1, struct reg_substr_data);
6087 #ifdef TRIE_STUDY_OPT
6089 StructCopy(&zero_scan_data, &data, scan_data_t);
6090 copyRExC_state = RExC_state;
6093 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6095 RExC_state = copyRExC_state;
6096 if (seen & REG_TOP_LEVEL_BRANCHES)
6097 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6099 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6100 StructCopy(&zero_scan_data, &data, scan_data_t);
6103 StructCopy(&zero_scan_data, &data, scan_data_t);
6106 /* Dig out information for optimizations. */
6107 r->extflags = RExC_flags; /* was pm_op */
6108 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6111 SvUTF8_on(rx); /* Unicode in it? */
6112 ri->regstclass = NULL;
6113 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6114 r->intflags |= PREGf_NAUGHTY;
6115 scan = ri->program + 1; /* First BRANCH. */
6117 /* testing for BRANCH here tells us whether there is "must appear"
6118 data in the pattern. If there is then we can use it for optimisations */
6119 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6121 STRLEN longest_float_length, longest_fixed_length;
6122 struct regnode_charclass_class ch_class; /* pointed to by data */
6124 SSize_t last_close = 0; /* pointed to by data */
6125 regnode *first= scan;
6126 regnode *first_next= regnext(first);
6128 * Skip introductions and multiplicators >= 1
6129 * so that we can extract the 'meat' of the pattern that must
6130 * match in the large if() sequence following.
6131 * NOTE that EXACT is NOT covered here, as it is normally
6132 * picked up by the optimiser separately.
6134 * This is unfortunate as the optimiser isnt handling lookahead
6135 * properly currently.
6138 while ((OP(first) == OPEN && (sawopen = 1)) ||
6139 /* An OR of *one* alternative - should not happen now. */
6140 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6141 /* for now we can't handle lookbehind IFMATCH*/
6142 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6143 (OP(first) == PLUS) ||
6144 (OP(first) == MINMOD) ||
6145 /* An {n,m} with n>0 */
6146 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6147 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6150 * the only op that could be a regnode is PLUS, all the rest
6151 * will be regnode_1 or regnode_2.
6153 * (yves doesn't think this is true)
6155 if (OP(first) == PLUS)
6158 if (OP(first) == MINMOD)
6160 first += regarglen[OP(first)];
6162 first = NEXTOPER(first);
6163 first_next= regnext(first);
6166 /* Starting-point info. */
6168 DEBUG_PEEP("first:",first,0);
6169 /* Ignore EXACT as we deal with it later. */
6170 if (PL_regkind[OP(first)] == EXACT) {
6171 if (OP(first) == EXACT)
6172 NOOP; /* Empty, get anchored substr later. */
6174 ri->regstclass = first;
6177 else if (PL_regkind[OP(first)] == TRIE &&
6178 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6181 /* this can happen only on restudy */
6182 if ( OP(first) == TRIE ) {
6183 struct regnode_1 *trieop = (struct regnode_1 *)
6184 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6185 StructCopy(first,trieop,struct regnode_1);
6186 trie_op=(regnode *)trieop;
6188 struct regnode_charclass *trieop = (struct regnode_charclass *)
6189 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6190 StructCopy(first,trieop,struct regnode_charclass);
6191 trie_op=(regnode *)trieop;
6194 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6195 ri->regstclass = trie_op;
6198 else if (REGNODE_SIMPLE(OP(first)))
6199 ri->regstclass = first;
6200 else if (PL_regkind[OP(first)] == BOUND ||
6201 PL_regkind[OP(first)] == NBOUND)
6202 ri->regstclass = first;
6203 else if (PL_regkind[OP(first)] == BOL) {
6204 r->extflags |= (OP(first) == MBOL
6206 : (OP(first) == SBOL
6209 first = NEXTOPER(first);
6212 else if (OP(first) == GPOS) {
6213 r->extflags |= RXf_ANCH_GPOS;
6214 first = NEXTOPER(first);
6217 else if ((!sawopen || !RExC_sawback) &&
6218 (OP(first) == STAR &&
6219 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6220 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6222 /* turn .* into ^.* with an implied $*=1 */
6224 (OP(NEXTOPER(first)) == REG_ANY)
6227 r->extflags |= type;
6228 r->intflags |= PREGf_IMPLICIT;
6229 first = NEXTOPER(first);
6232 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6233 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6234 /* x+ must match at the 1st pos of run of x's */
6235 r->intflags |= PREGf_SKIP;
6237 /* Scan is after the zeroth branch, first is atomic matcher. */
6238 #ifdef TRIE_STUDY_OPT
6241 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6242 (IV)(first - scan + 1))
6246 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6247 (IV)(first - scan + 1))
6253 * If there's something expensive in the r.e., find the
6254 * longest literal string that must appear and make it the
6255 * regmust. Resolve ties in favor of later strings, since
6256 * the regstart check works with the beginning of the r.e.
6257 * and avoiding duplication strengthens checking. Not a
6258 * strong reason, but sufficient in the absence of others.
6259 * [Now we resolve ties in favor of the earlier string if
6260 * it happens that c_offset_min has been invalidated, since the
6261 * earlier string may buy us something the later one won't.]
6264 data.longest_fixed = newSVpvs("");
6265 data.longest_float = newSVpvs("");
6266 data.last_found = newSVpvs("");
6267 data.longest = &(data.longest_fixed);
6268 ENTER_with_name("study_chunk");
6269 SAVEFREESV(data.longest_fixed);
6270 SAVEFREESV(data.longest_float);
6271 SAVEFREESV(data.last_found);
6273 if (!ri->regstclass) {
6274 cl_init(pRExC_state, &ch_class);
6275 data.start_class = &ch_class;
6276 stclass_flag = SCF_DO_STCLASS_AND;
6277 } else /* XXXX Check for BOUND? */
6279 data.last_closep = &last_close;
6281 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6282 &data, -1, NULL, NULL,
6283 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6284 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6288 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6291 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6292 && data.last_start_min == 0 && data.last_end > 0
6293 && !RExC_seen_zerolen
6294 && !(RExC_seen & REG_SEEN_VERBARG)
6295 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6296 r->extflags |= RXf_CHECK_ALL;
6297 scan_commit(pRExC_state, &data,&minlen,0);
6299 longest_float_length = CHR_SVLEN(data.longest_float);
6301 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6302 && data.offset_fixed == data.offset_float_min
6303 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6304 && S_setup_longest (aTHX_ pRExC_state,
6308 &(r->float_end_shift),
6309 data.lookbehind_float,
6310 data.offset_float_min,
6312 longest_float_length,
6313 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6314 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6316 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6317 r->float_max_offset = data.offset_float_max;
6318 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6319 r->float_max_offset -= data.lookbehind_float;
6320 SvREFCNT_inc_simple_void_NN(data.longest_float);
6323 r->float_substr = r->float_utf8 = NULL;
6324 longest_float_length = 0;
6327 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6329 if (S_setup_longest (aTHX_ pRExC_state,
6331 &(r->anchored_utf8),
6332 &(r->anchored_substr),
6333 &(r->anchored_end_shift),
6334 data.lookbehind_fixed,
6337 longest_fixed_length,
6338 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6339 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6341 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6342 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6345 r->anchored_substr = r->anchored_utf8 = NULL;
6346 longest_fixed_length = 0;
6348 LEAVE_with_name("study_chunk");
6351 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6352 ri->regstclass = NULL;
6354 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6356 && ! TEST_SSC_EOS(data.start_class)
6357 && !cl_is_anything(data.start_class))
6359 const U32 n = add_data(pRExC_state, 1, "f");
6360 OP(data.start_class) = ANYOF_SYNTHETIC;
6362 Newx(RExC_rxi->data->data[n], 1,
6363 struct regnode_charclass_class);
6364 StructCopy(data.start_class,
6365 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6366 struct regnode_charclass_class);
6367 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6368 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6369 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6370 regprop(r, sv, (regnode*)data.start_class);
6371 PerlIO_printf(Perl_debug_log,
6372 "synthetic stclass \"%s\".\n",
6373 SvPVX_const(sv));});
6376 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6377 if (longest_fixed_length > longest_float_length) {
6378 r->check_end_shift = r->anchored_end_shift;
6379 r->check_substr = r->anchored_substr;
6380 r->check_utf8 = r->anchored_utf8;
6381 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6382 if (r->extflags & RXf_ANCH_SINGLE)
6383 r->extflags |= RXf_NOSCAN;
6386 r->check_end_shift = r->float_end_shift;
6387 r->check_substr = r->float_substr;
6388 r->check_utf8 = r->float_utf8;
6389 r->check_offset_min = r->float_min_offset;
6390 r->check_offset_max = r->float_max_offset;
6392 if ((r->check_substr || r->check_utf8) ) {
6393 r->extflags |= RXf_USE_INTUIT;
6394 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6395 r->extflags |= RXf_INTUIT_TAIL;
6397 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6398 if ( (STRLEN)minlen < longest_float_length )
6399 minlen= longest_float_length;
6400 if ( (STRLEN)minlen < longest_fixed_length )
6401 minlen= longest_fixed_length;
6405 /* Several toplevels. Best we can is to set minlen. */
6407 struct regnode_charclass_class ch_class;
6408 SSize_t last_close = 0;
6410 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6412 scan = ri->program + 1;
6413 cl_init(pRExC_state, &ch_class);
6414 data.start_class = &ch_class;
6415 data.last_closep = &last_close;
6418 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6419 &data, -1, NULL, NULL,
6420 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6421 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6424 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6426 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6427 = r->float_substr = r->float_utf8 = NULL;
6429 if (! TEST_SSC_EOS(data.start_class)
6430 && !cl_is_anything(data.start_class))
6432 const U32 n = add_data(pRExC_state, 1, "f");
6433 OP(data.start_class) = ANYOF_SYNTHETIC;
6435 Newx(RExC_rxi->data->data[n], 1,
6436 struct regnode_charclass_class);
6437 StructCopy(data.start_class,
6438 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6439 struct regnode_charclass_class);
6440 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6441 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6442 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6443 regprop(r, sv, (regnode*)data.start_class);
6444 PerlIO_printf(Perl_debug_log,
6445 "synthetic stclass \"%s\".\n",
6446 SvPVX_const(sv));});
6450 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6451 the "real" pattern. */
6453 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6454 (IV)minlen, (IV)r->minlen);
6456 r->minlenret = minlen;
6457 if (r->minlen < minlen)
6460 if (RExC_seen & REG_SEEN_GPOS)
6461 r->extflags |= RXf_GPOS_SEEN;
6462 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6463 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6464 if (pRExC_state->num_code_blocks)
6465 r->extflags |= RXf_EVAL_SEEN;
6466 if (RExC_seen & REG_SEEN_CANY)
6467 r->extflags |= RXf_CANY_SEEN;
6468 if (RExC_seen & REG_SEEN_VERBARG)
6470 r->intflags |= PREGf_VERBARG_SEEN;
6471 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6473 if (RExC_seen & REG_SEEN_CUTGROUP)
6474 r->intflags |= PREGf_CUTGROUP_SEEN;
6475 if (pm_flags & PMf_USE_RE_EVAL)
6476 r->intflags |= PREGf_USE_RE_EVAL;
6477 if (RExC_paren_names)
6478 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6480 RXp_PAREN_NAMES(r) = NULL;
6483 regnode *first = ri->program + 1;
6485 regnode *next = NEXTOPER(first);
6488 if (PL_regkind[fop] == NOTHING && nop == END)
6489 r->extflags |= RXf_NULL;
6490 else if (PL_regkind[fop] == BOL && nop == END)
6491 r->extflags |= RXf_START_ONLY;
6492 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6493 r->extflags |= RXf_WHITE;
6494 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6495 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6499 if (RExC_paren_names) {
6500 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6501 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6504 ri->name_list_idx = 0;
6506 if (RExC_recurse_count) {
6507 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6508 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6509 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6512 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6513 /* assume we don't need to swap parens around before we match */
6516 PerlIO_printf(Perl_debug_log,"Final program:\n");
6519 #ifdef RE_TRACK_PATTERN_OFFSETS
6520 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6521 const STRLEN len = ri->u.offsets[0];
6523 GET_RE_DEBUG_FLAGS_DECL;
6524 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6525 for (i = 1; i <= len; i++) {
6526 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6527 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6528 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6530 PerlIO_printf(Perl_debug_log, "\n");
6535 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6536 * by setting the regexp SV to readonly-only instead. If the
6537 * pattern's been recompiled, the USEDness should remain. */
6538 if (old_re && SvREADONLY(old_re))
6546 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6549 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6551 PERL_UNUSED_ARG(value);
6553 if (flags & RXapif_FETCH) {
6554 return reg_named_buff_fetch(rx, key, flags);
6555 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6556 Perl_croak_no_modify();
6558 } else if (flags & RXapif_EXISTS) {
6559 return reg_named_buff_exists(rx, key, flags)
6562 } else if (flags & RXapif_REGNAMES) {
6563 return reg_named_buff_all(rx, flags);
6564 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6565 return reg_named_buff_scalar(rx, flags);
6567 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6573 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6576 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6577 PERL_UNUSED_ARG(lastkey);
6579 if (flags & RXapif_FIRSTKEY)
6580 return reg_named_buff_firstkey(rx, flags);
6581 else if (flags & RXapif_NEXTKEY)
6582 return reg_named_buff_nextkey(rx, flags);
6584 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6590 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6593 AV *retarray = NULL;
6595 struct regexp *const rx = ReANY(r);
6597 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6599 if (flags & RXapif_ALL)
6602 if (rx && RXp_PAREN_NAMES(rx)) {
6603 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6606 SV* sv_dat=HeVAL(he_str);
6607 I32 *nums=(I32*)SvPVX(sv_dat);
6608 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6609 if ((I32)(rx->nparens) >= nums[i]
6610 && rx->offs[nums[i]].start != -1
6611 && rx->offs[nums[i]].end != -1)
6614 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6619 ret = newSVsv(&PL_sv_undef);
6622 av_push(retarray, ret);
6625 return newRV_noinc(MUTABLE_SV(retarray));
6632 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6635 struct regexp *const rx = ReANY(r);
6637 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6639 if (rx && RXp_PAREN_NAMES(rx)) {
6640 if (flags & RXapif_ALL) {
6641 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6643 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6645 SvREFCNT_dec_NN(sv);
6657 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6659 struct regexp *const rx = ReANY(r);
6661 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6663 if ( rx && RXp_PAREN_NAMES(rx) ) {
6664 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6666 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6673 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6675 struct regexp *const rx = ReANY(r);
6676 GET_RE_DEBUG_FLAGS_DECL;
6678 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6680 if (rx && RXp_PAREN_NAMES(rx)) {
6681 HV *hv = RXp_PAREN_NAMES(rx);
6683 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6686 SV* sv_dat = HeVAL(temphe);
6687 I32 *nums = (I32*)SvPVX(sv_dat);
6688 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6689 if ((I32)(rx->lastparen) >= nums[i] &&
6690 rx->offs[nums[i]].start != -1 &&
6691 rx->offs[nums[i]].end != -1)
6697 if (parno || flags & RXapif_ALL) {
6698 return newSVhek(HeKEY_hek(temphe));
6706 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6711 struct regexp *const rx = ReANY(r);
6713 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6715 if (rx && RXp_PAREN_NAMES(rx)) {
6716 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6717 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6718 } else if (flags & RXapif_ONE) {
6719 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6720 av = MUTABLE_AV(SvRV(ret));
6721 length = av_len(av);
6722 SvREFCNT_dec_NN(ret);
6723 return newSViv(length + 1);
6725 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6729 return &PL_sv_undef;
6733 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6735 struct regexp *const rx = ReANY(r);
6738 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6740 if (rx && RXp_PAREN_NAMES(rx)) {
6741 HV *hv= RXp_PAREN_NAMES(rx);
6743 (void)hv_iterinit(hv);
6744 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6747 SV* sv_dat = HeVAL(temphe);
6748 I32 *nums = (I32*)SvPVX(sv_dat);
6749 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6750 if ((I32)(rx->lastparen) >= nums[i] &&
6751 rx->offs[nums[i]].start != -1 &&
6752 rx->offs[nums[i]].end != -1)
6758 if (parno || flags & RXapif_ALL) {
6759 av_push(av, newSVhek(HeKEY_hek(temphe)));
6764 return newRV_noinc(MUTABLE_SV(av));
6768 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6771 struct regexp *const rx = ReANY(r);
6777 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6779 if ( n == RX_BUFF_IDX_CARET_PREMATCH
6780 || n == RX_BUFF_IDX_CARET_FULLMATCH
6781 || n == RX_BUFF_IDX_CARET_POSTMATCH
6784 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6786 /* on something like
6789 * the KEEPCOPY is set on the PMOP rather than the regex */
6790 if (PL_curpm && r == PM_GETRE(PL_curpm))
6791 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6800 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6801 /* no need to distinguish between them any more */
6802 n = RX_BUFF_IDX_FULLMATCH;
6804 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6805 && rx->offs[0].start != -1)
6807 /* $`, ${^PREMATCH} */
6808 i = rx->offs[0].start;
6812 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6813 && rx->offs[0].end != -1)
6815 /* $', ${^POSTMATCH} */
6816 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6817 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6820 if ( 0 <= n && n <= (I32)rx->nparens &&
6821 (s1 = rx->offs[n].start) != -1 &&
6822 (t1 = rx->offs[n].end) != -1)
6824 /* $&, ${^MATCH}, $1 ... */
6826 s = rx->subbeg + s1 - rx->suboffset;
6831 assert(s >= rx->subbeg);
6832 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
6834 #if NO_TAINT_SUPPORT
6835 sv_setpvn(sv, s, i);
6837 const int oldtainted = TAINT_get;
6839 sv_setpvn(sv, s, i);
6840 TAINT_set(oldtainted);
6842 if ( (rx->extflags & RXf_CANY_SEEN)
6843 ? (RXp_MATCH_UTF8(rx)
6844 && (!i || is_utf8_string((U8*)s, i)))
6845 : (RXp_MATCH_UTF8(rx)) )
6852 if (RXp_MATCH_TAINTED(rx)) {
6853 if (SvTYPE(sv) >= SVt_PVMG) {
6854 MAGIC* const mg = SvMAGIC(sv);
6857 SvMAGIC_set(sv, mg->mg_moremagic);
6859 if ((mgt = SvMAGIC(sv))) {
6860 mg->mg_moremagic = mgt;
6861 SvMAGIC_set(sv, mg);
6872 sv_setsv(sv,&PL_sv_undef);
6878 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6879 SV const * const value)
6881 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6883 PERL_UNUSED_ARG(rx);
6884 PERL_UNUSED_ARG(paren);
6885 PERL_UNUSED_ARG(value);
6888 Perl_croak_no_modify();
6892 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6895 struct regexp *const rx = ReANY(r);
6899 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6901 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
6902 || paren == RX_BUFF_IDX_CARET_FULLMATCH
6903 || paren == RX_BUFF_IDX_CARET_POSTMATCH
6906 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6908 /* on something like
6911 * the KEEPCOPY is set on the PMOP rather than the regex */
6912 if (PL_curpm && r == PM_GETRE(PL_curpm))
6913 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6919 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6921 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6922 case RX_BUFF_IDX_PREMATCH: /* $` */
6923 if (rx->offs[0].start != -1) {
6924 i = rx->offs[0].start;
6933 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6934 case RX_BUFF_IDX_POSTMATCH: /* $' */
6935 if (rx->offs[0].end != -1) {
6936 i = rx->sublen - rx->offs[0].end;
6938 s1 = rx->offs[0].end;
6945 default: /* $& / ${^MATCH}, $1, $2, ... */
6946 if (paren <= (I32)rx->nparens &&
6947 (s1 = rx->offs[paren].start) != -1 &&
6948 (t1 = rx->offs[paren].end) != -1)
6954 if (ckWARN(WARN_UNINITIALIZED))
6955 report_uninit((const SV *)sv);
6960 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6961 const char * const s = rx->subbeg - rx->suboffset + s1;
6966 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6973 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6975 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6976 PERL_UNUSED_ARG(rx);
6980 return newSVpvs("Regexp");
6983 /* Scans the name of a named buffer from the pattern.
6984 * If flags is REG_RSN_RETURN_NULL returns null.
6985 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6986 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6987 * to the parsed name as looked up in the RExC_paren_names hash.
6988 * If there is an error throws a vFAIL().. type exception.
6991 #define REG_RSN_RETURN_NULL 0
6992 #define REG_RSN_RETURN_NAME 1
6993 #define REG_RSN_RETURN_DATA 2
6996 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6998 char *name_start = RExC_parse;
7000 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7002 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7003 /* skip IDFIRST by using do...while */
7006 RExC_parse += UTF8SKIP(RExC_parse);
7007 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7011 } while (isWORDCHAR(*RExC_parse));
7013 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7014 vFAIL("Group name must start with a non-digit word character");
7018 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7019 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7020 if ( flags == REG_RSN_RETURN_NAME)
7022 else if (flags==REG_RSN_RETURN_DATA) {
7025 if ( ! sv_name ) /* should not happen*/
7026 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7027 if (RExC_paren_names)
7028 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7030 sv_dat = HeVAL(he_str);
7032 vFAIL("Reference to nonexistent named group");
7036 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7037 (unsigned long) flags);
7039 assert(0); /* NOT REACHED */
7044 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7045 int rem=(int)(RExC_end - RExC_parse); \
7054 if (RExC_lastparse!=RExC_parse) \
7055 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7058 iscut ? "..." : "<" \
7061 PerlIO_printf(Perl_debug_log,"%16s",""); \
7064 num = RExC_size + 1; \
7066 num=REG_NODE_NUM(RExC_emit); \
7067 if (RExC_lastnum!=num) \
7068 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7070 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7071 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7072 (int)((depth*2)), "", \
7076 RExC_lastparse=RExC_parse; \
7081 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7082 DEBUG_PARSE_MSG((funcname)); \
7083 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7085 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7086 DEBUG_PARSE_MSG((funcname)); \
7087 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7090 /* This section of code defines the inversion list object and its methods. The
7091 * interfaces are highly subject to change, so as much as possible is static to
7092 * this file. An inversion list is here implemented as a malloc'd C UV array
7093 * as an SVt_INVLIST scalar.
7095 * An inversion list for Unicode is an array of code points, sorted by ordinal
7096 * number. The zeroth element is the first code point in the list. The 1th
7097 * element is the first element beyond that not in the list. In other words,
7098 * the first range is
7099 * invlist[0]..(invlist[1]-1)
7100 * The other ranges follow. Thus every element whose index is divisible by two
7101 * marks the beginning of a range that is in the list, and every element not
7102 * divisible by two marks the beginning of a range not in the list. A single
7103 * element inversion list that contains the single code point N generally
7104 * consists of two elements
7107 * (The exception is when N is the highest representable value on the
7108 * machine, in which case the list containing just it would be a single
7109 * element, itself. By extension, if the last range in the list extends to
7110 * infinity, then the first element of that range will be in the inversion list
7111 * at a position that is divisible by two, and is the final element in the
7113 * Taking the complement (inverting) an inversion list is quite simple, if the
7114 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7115 * This implementation reserves an element at the beginning of each inversion
7116 * list to always contain 0; there is an additional flag in the header which
7117 * indicates if the list begins at the 0, or is offset to begin at the next
7120 * More about inversion lists can be found in "Unicode Demystified"
7121 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7122 * More will be coming when functionality is added later.
7124 * The inversion list data structure is currently implemented as an SV pointing
7125 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7126 * array of UV whose memory management is automatically handled by the existing
7127 * facilities for SV's.
7129 * Some of the methods should always be private to the implementation, and some
7130 * should eventually be made public */
7132 /* The header definitions are in F<inline_invlist.c> */
7134 PERL_STATIC_INLINE UV*
7135 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7137 /* Returns a pointer to the first element in the inversion list's array.
7138 * This is called upon initialization of an inversion list. Where the
7139 * array begins depends on whether the list has the code point U+0000 in it
7140 * or not. The other parameter tells it whether the code that follows this
7141 * call is about to put a 0 in the inversion list or not. The first
7142 * element is either the element reserved for 0, if TRUE, or the element
7143 * after it, if FALSE */
7145 bool* offset = get_invlist_offset_addr(invlist);
7146 UV* zero_addr = (UV *) SvPVX(invlist);
7148 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7151 assert(! _invlist_len(invlist));
7155 /* 1^1 = 0; 1^0 = 1 */
7156 *offset = 1 ^ will_have_0;
7157 return zero_addr + *offset;
7160 PERL_STATIC_INLINE UV*
7161 S_invlist_array(pTHX_ SV* const invlist)
7163 /* Returns the pointer to the inversion list's array. Every time the
7164 * length changes, this needs to be called in case malloc or realloc moved
7167 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7169 /* Must not be empty. If these fail, you probably didn't check for <len>
7170 * being non-zero before trying to get the array */
7171 assert(_invlist_len(invlist));
7173 /* The very first element always contains zero, The array begins either
7174 * there, or if the inversion list is offset, at the element after it.
7175 * The offset header field determines which; it contains 0 or 1 to indicate
7176 * how much additionally to add */
7177 assert(0 == *(SvPVX(invlist)));
7178 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7181 PERL_STATIC_INLINE void
7182 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7184 /* Sets the current number of elements stored in the inversion list.
7185 * Updates SvCUR correspondingly */
7187 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7189 assert(SvTYPE(invlist) == SVt_INVLIST);
7194 : TO_INTERNAL_SIZE(len + offset));
7195 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7198 PERL_STATIC_INLINE IV*
7199 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7201 /* Return the address of the IV that is reserved to hold the cached index
7204 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7206 assert(SvTYPE(invlist) == SVt_INVLIST);
7208 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7211 PERL_STATIC_INLINE IV
7212 S_invlist_previous_index(pTHX_ SV* const invlist)
7214 /* Returns cached index of previous search */
7216 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7218 return *get_invlist_previous_index_addr(invlist);
7221 PERL_STATIC_INLINE void
7222 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7224 /* Caches <index> for later retrieval */
7226 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7228 assert(index == 0 || index < (int) _invlist_len(invlist));
7230 *get_invlist_previous_index_addr(invlist) = index;
7233 PERL_STATIC_INLINE UV
7234 S_invlist_max(pTHX_ SV* const invlist)
7236 /* Returns the maximum number of elements storable in the inversion list's
7237 * array, without having to realloc() */
7239 PERL_ARGS_ASSERT_INVLIST_MAX;
7241 assert(SvTYPE(invlist) == SVt_INVLIST);
7243 /* Assumes worst case, in which the 0 element is not counted in the
7244 * inversion list, so subtracts 1 for that */
7245 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7246 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7247 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7250 #ifndef PERL_IN_XSUB_RE
7252 Perl__new_invlist(pTHX_ IV initial_size)
7255 /* Return a pointer to a newly constructed inversion list, with enough
7256 * space to store 'initial_size' elements. If that number is negative, a
7257 * system default is used instead */
7261 if (initial_size < 0) {
7265 /* Allocate the initial space */
7266 new_list = newSV_type(SVt_INVLIST);
7268 /* First 1 is in case the zero element isn't in the list; second 1 is for
7270 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7271 invlist_set_len(new_list, 0, 0);
7273 /* Force iterinit() to be used to get iteration to work */
7274 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7276 *get_invlist_previous_index_addr(new_list) = 0;
7283 S__new_invlist_C_array(pTHX_ const UV* const list)
7285 /* Return a pointer to a newly constructed inversion list, initialized to
7286 * point to <list>, which has to be in the exact correct inversion list
7287 * form, including internal fields. Thus this is a dangerous routine that
7288 * should not be used in the wrong hands. The passed in 'list' contains
7289 * several header fields at the beginning that are not part of the
7290 * inversion list body proper */
7292 const STRLEN length = (STRLEN) list[0];
7293 const UV version_id = list[1];
7294 const bool offset = cBOOL(list[2]);
7295 #define HEADER_LENGTH 3
7296 /* If any of the above changes in any way, you must change HEADER_LENGTH
7297 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7298 * perl -E 'say int(rand 2**31-1)'
7300 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7301 data structure type, so that one being
7302 passed in can be validated to be an
7303 inversion list of the correct vintage.
7306 SV* invlist = newSV_type(SVt_INVLIST);
7308 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7310 if (version_id != INVLIST_VERSION_ID) {
7311 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7314 /* The generated array passed in includes header elements that aren't part
7315 * of the list proper, so start it just after them */
7316 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7318 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7319 shouldn't touch it */
7321 *(get_invlist_offset_addr(invlist)) = offset;
7323 /* The 'length' passed to us is the physical number of elements in the
7324 * inversion list. But if there is an offset the logical number is one
7326 invlist_set_len(invlist, length - offset, offset);
7328 invlist_set_previous_index(invlist, 0);
7330 /* Initialize the iteration pointer. */
7331 invlist_iterfinish(invlist);
7337 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7339 /* Grow the maximum size of an inversion list */
7341 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7343 assert(SvTYPE(invlist) == SVt_INVLIST);
7345 /* Add one to account for the zero element at the beginning which may not
7346 * be counted by the calling parameters */
7347 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7350 PERL_STATIC_INLINE void
7351 S_invlist_trim(pTHX_ SV* const invlist)
7353 PERL_ARGS_ASSERT_INVLIST_TRIM;
7355 assert(SvTYPE(invlist) == SVt_INVLIST);
7357 /* Change the length of the inversion list to how many entries it currently
7359 SvPV_shrink_to_cur((SV *) invlist);
7362 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7365 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7367 /* Subject to change or removal. Append the range from 'start' to 'end' at
7368 * the end of the inversion list. The range must be above any existing
7372 UV max = invlist_max(invlist);
7373 UV len = _invlist_len(invlist);
7376 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7378 if (len == 0) { /* Empty lists must be initialized */
7379 offset = start != 0;
7380 array = _invlist_array_init(invlist, ! offset);
7383 /* Here, the existing list is non-empty. The current max entry in the
7384 * list is generally the first value not in the set, except when the
7385 * set extends to the end of permissible values, in which case it is
7386 * the first entry in that final set, and so this call is an attempt to
7387 * append out-of-order */
7389 UV final_element = len - 1;
7390 array = invlist_array(invlist);
7391 if (array[final_element] > start
7392 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7394 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7395 array[final_element], start,
7396 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7399 /* Here, it is a legal append. If the new range begins with the first
7400 * value not in the set, it is extending the set, so the new first
7401 * value not in the set is one greater than the newly extended range.
7403 offset = *get_invlist_offset_addr(invlist);
7404 if (array[final_element] == start) {
7405 if (end != UV_MAX) {
7406 array[final_element] = end + 1;
7409 /* But if the end is the maximum representable on the machine,
7410 * just let the range that this would extend to have no end */
7411 invlist_set_len(invlist, len - 1, offset);
7417 /* Here the new range doesn't extend any existing set. Add it */
7419 len += 2; /* Includes an element each for the start and end of range */
7421 /* If wll overflow the existing space, extend, which may cause the array to
7424 invlist_extend(invlist, len);
7426 /* Have to set len here to avoid assert failure in invlist_array() */
7427 invlist_set_len(invlist, len, offset);
7429 array = invlist_array(invlist);
7432 invlist_set_len(invlist, len, offset);
7435 /* The next item on the list starts the range, the one after that is
7436 * one past the new range. */
7437 array[len - 2] = start;
7438 if (end != UV_MAX) {
7439 array[len - 1] = end + 1;
7442 /* But if the end is the maximum representable on the machine, just let
7443 * the range have no end */
7444 invlist_set_len(invlist, len - 1, offset);
7448 #ifndef PERL_IN_XSUB_RE
7451 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7453 /* Searches the inversion list for the entry that contains the input code
7454 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7455 * return value is the index into the list's array of the range that
7460 IV high = _invlist_len(invlist);
7461 const IV highest_element = high - 1;
7464 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7466 /* If list is empty, return failure. */
7471 /* (We can't get the array unless we know the list is non-empty) */
7472 array = invlist_array(invlist);
7474 mid = invlist_previous_index(invlist);
7475 assert(mid >=0 && mid <= highest_element);
7477 /* <mid> contains the cache of the result of the previous call to this
7478 * function (0 the first time). See if this call is for the same result,
7479 * or if it is for mid-1. This is under the theory that calls to this
7480 * function will often be for related code points that are near each other.
7481 * And benchmarks show that caching gives better results. We also test
7482 * here if the code point is within the bounds of the list. These tests
7483 * replace others that would have had to be made anyway to make sure that
7484 * the array bounds were not exceeded, and these give us extra information
7485 * at the same time */
7486 if (cp >= array[mid]) {
7487 if (cp >= array[highest_element]) {
7488 return highest_element;
7491 /* Here, array[mid] <= cp < array[highest_element]. This means that
7492 * the final element is not the answer, so can exclude it; it also
7493 * means that <mid> is not the final element, so can refer to 'mid + 1'
7495 if (cp < array[mid + 1]) {
7501 else { /* cp < aray[mid] */
7502 if (cp < array[0]) { /* Fail if outside the array */
7506 if (cp >= array[mid - 1]) {
7511 /* Binary search. What we are looking for is <i> such that
7512 * array[i] <= cp < array[i+1]
7513 * The loop below converges on the i+1. Note that there may not be an
7514 * (i+1)th element in the array, and things work nonetheless */
7515 while (low < high) {
7516 mid = (low + high) / 2;
7517 assert(mid <= highest_element);
7518 if (array[mid] <= cp) { /* cp >= array[mid] */
7521 /* We could do this extra test to exit the loop early.
7522 if (cp < array[low]) {
7527 else { /* cp < array[mid] */
7534 invlist_set_previous_index(invlist, high);
7539 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7541 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7542 * but is used when the swash has an inversion list. This makes this much
7543 * faster, as it uses a binary search instead of a linear one. This is
7544 * intimately tied to that function, and perhaps should be in utf8.c,
7545 * except it is intimately tied to inversion lists as well. It assumes
7546 * that <swatch> is all 0's on input */
7549 const IV len = _invlist_len(invlist);
7553 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7555 if (len == 0) { /* Empty inversion list */
7559 array = invlist_array(invlist);
7561 /* Find which element it is */
7562 i = _invlist_search(invlist, start);
7564 /* We populate from <start> to <end> */
7565 while (current < end) {
7568 /* The inversion list gives the results for every possible code point
7569 * after the first one in the list. Only those ranges whose index is
7570 * even are ones that the inversion list matches. For the odd ones,
7571 * and if the initial code point is not in the list, we have to skip
7572 * forward to the next element */
7573 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7575 if (i >= len) { /* Finished if beyond the end of the array */
7579 if (current >= end) { /* Finished if beyond the end of what we
7581 if (LIKELY(end < UV_MAX)) {
7585 /* We get here when the upper bound is the maximum
7586 * representable on the machine, and we are looking for just
7587 * that code point. Have to special case it */
7589 goto join_end_of_list;
7592 assert(current >= start);
7594 /* The current range ends one below the next one, except don't go past
7597 upper = (i < len && array[i] < end) ? array[i] : end;
7599 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7600 * for each code point in it */
7601 for (; current < upper; current++) {
7602 const STRLEN offset = (STRLEN)(current - start);
7603 swatch[offset >> 3] |= 1 << (offset & 7);
7608 /* Quit if at the end of the list */
7611 /* But first, have to deal with the highest possible code point on
7612 * the platform. The previous code assumes that <end> is one
7613 * beyond where we want to populate, but that is impossible at the
7614 * platform's infinity, so have to handle it specially */
7615 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7617 const STRLEN offset = (STRLEN)(end - start);
7618 swatch[offset >> 3] |= 1 << (offset & 7);
7623 /* Advance to the next range, which will be for code points not in the
7632 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7634 /* Take the union of two inversion lists and point <output> to it. *output
7635 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7636 * the reference count to that list will be decremented. The first list,
7637 * <a>, may be NULL, in which case a copy of the second list is returned.
7638 * If <complement_b> is TRUE, the union is taken of the complement
7639 * (inversion) of <b> instead of b itself.
7641 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7642 * Richard Gillam, published by Addison-Wesley, and explained at some
7643 * length there. The preface says to incorporate its examples into your
7644 * code at your own risk.
7646 * The algorithm is like a merge sort.
7648 * XXX A potential performance improvement is to keep track as we go along
7649 * if only one of the inputs contributes to the result, meaning the other
7650 * is a subset of that one. In that case, we can skip the final copy and
7651 * return the larger of the input lists, but then outside code might need
7652 * to keep track of whether to free the input list or not */
7654 const UV* array_a; /* a's array */
7656 UV len_a; /* length of a's array */
7659 SV* u; /* the resulting union */
7663 UV i_a = 0; /* current index into a's array */
7667 /* running count, as explained in the algorithm source book; items are
7668 * stopped accumulating and are output when the count changes to/from 0.
7669 * The count is incremented when we start a range that's in the set, and
7670 * decremented when we start a range that's not in the set. So its range
7671 * is 0 to 2. Only when the count is zero is something not in the set.
7675 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7678 /* If either one is empty, the union is the other one */
7679 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7686 *output = invlist_clone(b);
7688 _invlist_invert(*output);
7690 } /* else *output already = b; */
7693 else if ((len_b = _invlist_len(b)) == 0) {
7698 /* The complement of an empty list is a list that has everything in it,
7699 * so the union with <a> includes everything too */
7704 *output = _new_invlist(1);
7705 _append_range_to_invlist(*output, 0, UV_MAX);
7707 else if (*output != a) {
7708 *output = invlist_clone(a);
7710 /* else *output already = a; */
7714 /* Here both lists exist and are non-empty */
7715 array_a = invlist_array(a);
7716 array_b = invlist_array(b);
7718 /* If are to take the union of 'a' with the complement of b, set it
7719 * up so are looking at b's complement. */
7722 /* To complement, we invert: if the first element is 0, remove it. To
7723 * do this, we just pretend the array starts one later */
7724 if (array_b[0] == 0) {
7730 /* But if the first element is not zero, we pretend the list starts
7731 * at the 0 that is always stored immediately before the array. */
7737 /* Size the union for the worst case: that the sets are completely
7739 u = _new_invlist(len_a + len_b);
7741 /* Will contain U+0000 if either component does */
7742 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7743 || (len_b > 0 && array_b[0] == 0));
7745 /* Go through each list item by item, stopping when exhausted one of
7747 while (i_a < len_a && i_b < len_b) {
7748 UV cp; /* The element to potentially add to the union's array */
7749 bool cp_in_set; /* is it in the the input list's set or not */
7751 /* We need to take one or the other of the two inputs for the union.
7752 * Since we are merging two sorted lists, we take the smaller of the
7753 * next items. In case of a tie, we take the one that is in its set
7754 * first. If we took one not in the set first, it would decrement the
7755 * count, possibly to 0 which would cause it to be output as ending the
7756 * range, and the next time through we would take the same number, and
7757 * output it again as beginning the next range. By doing it the
7758 * opposite way, there is no possibility that the count will be
7759 * momentarily decremented to 0, and thus the two adjoining ranges will
7760 * be seamlessly merged. (In a tie and both are in the set or both not
7761 * in the set, it doesn't matter which we take first.) */
7762 if (array_a[i_a] < array_b[i_b]
7763 || (array_a[i_a] == array_b[i_b]
7764 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7766 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7770 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7771 cp = array_b[i_b++];
7774 /* Here, have chosen which of the two inputs to look at. Only output
7775 * if the running count changes to/from 0, which marks the
7776 * beginning/end of a range in that's in the set */
7779 array_u[i_u++] = cp;
7786 array_u[i_u++] = cp;
7791 /* Here, we are finished going through at least one of the lists, which
7792 * means there is something remaining in at most one. We check if the list
7793 * that hasn't been exhausted is positioned such that we are in the middle
7794 * of a range in its set or not. (i_a and i_b point to the element beyond
7795 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7796 * is potentially more to output.
7797 * There are four cases:
7798 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7799 * in the union is entirely from the non-exhausted set.
7800 * 2) Both were in their sets, count is 2. Nothing further should
7801 * be output, as everything that remains will be in the exhausted
7802 * list's set, hence in the union; decrementing to 1 but not 0 insures
7804 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7805 * Nothing further should be output because the union includes
7806 * everything from the exhausted set. Not decrementing ensures that.
7807 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7808 * decrementing to 0 insures that we look at the remainder of the
7809 * non-exhausted set */
7810 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7811 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7816 /* The final length is what we've output so far, plus what else is about to
7817 * be output. (If 'count' is non-zero, then the input list we exhausted
7818 * has everything remaining up to the machine's limit in its set, and hence
7819 * in the union, so there will be no further output. */
7822 /* At most one of the subexpressions will be non-zero */
7823 len_u += (len_a - i_a) + (len_b - i_b);
7826 /* Set result to final length, which can change the pointer to array_u, so
7828 if (len_u != _invlist_len(u)) {
7829 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7831 array_u = invlist_array(u);
7834 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7835 * the other) ended with everything above it not in its set. That means
7836 * that the remaining part of the union is precisely the same as the
7837 * non-exhausted list, so can just copy it unchanged. (If both list were
7838 * exhausted at the same time, then the operations below will be both 0.)
7841 IV copy_count; /* At most one will have a non-zero copy count */
7842 if ((copy_count = len_a - i_a) > 0) {
7843 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7845 else if ((copy_count = len_b - i_b) > 0) {
7846 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7850 /* We may be removing a reference to one of the inputs */
7851 if (a == *output || b == *output) {
7852 assert(! invlist_is_iterating(*output));
7853 SvREFCNT_dec_NN(*output);
7861 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7863 /* Take the intersection of two inversion lists and point <i> to it. *i
7864 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7865 * the reference count to that list will be decremented.
7866 * If <complement_b> is TRUE, the result will be the intersection of <a>
7867 * and the complement (or inversion) of <b> instead of <b> directly.
7869 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7870 * Richard Gillam, published by Addison-Wesley, and explained at some
7871 * length there. The preface says to incorporate its examples into your
7872 * code at your own risk. In fact, it had bugs
7874 * The algorithm is like a merge sort, and is essentially the same as the
7878 const UV* array_a; /* a's array */
7880 UV len_a; /* length of a's array */
7883 SV* r; /* the resulting intersection */
7887 UV i_a = 0; /* current index into a's array */
7891 /* running count, as explained in the algorithm source book; items are
7892 * stopped accumulating and are output when the count changes to/from 2.
7893 * The count is incremented when we start a range that's in the set, and
7894 * decremented when we start a range that's not in the set. So its range
7895 * is 0 to 2. Only when the count is 2 is something in the intersection.
7899 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7902 /* Special case if either one is empty */
7903 len_a = (a == NULL) ? 0 : _invlist_len(a);
7904 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7906 if (len_a != 0 && complement_b) {
7908 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7909 * be empty. Here, also we are using 'b's complement, which hence
7910 * must be every possible code point. Thus the intersection is
7917 *i = invlist_clone(a);
7919 /* else *i is already 'a' */
7923 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7924 * intersection must be empty */
7931 *i = _new_invlist(0);
7935 /* Here both lists exist and are non-empty */
7936 array_a = invlist_array(a);
7937 array_b = invlist_array(b);
7939 /* If are to take the intersection of 'a' with the complement of b, set it
7940 * up so are looking at b's complement. */
7943 /* To complement, we invert: if the first element is 0, remove it. To
7944 * do this, we just pretend the array starts one later */
7945 if (array_b[0] == 0) {
7951 /* But if the first element is not zero, we pretend the list starts
7952 * at the 0 that is always stored immediately before the array. */
7958 /* Size the intersection for the worst case: that the intersection ends up
7959 * fragmenting everything to be completely disjoint */
7960 r= _new_invlist(len_a + len_b);
7962 /* Will contain U+0000 iff both components do */
7963 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7964 && len_b > 0 && array_b[0] == 0);
7966 /* Go through each list item by item, stopping when exhausted one of
7968 while (i_a < len_a && i_b < len_b) {
7969 UV cp; /* The element to potentially add to the intersection's
7971 bool cp_in_set; /* Is it in the input list's set or not */
7973 /* We need to take one or the other of the two inputs for the
7974 * intersection. Since we are merging two sorted lists, we take the
7975 * smaller of the next items. In case of a tie, we take the one that
7976 * is not in its set first (a difference from the union algorithm). If
7977 * we took one in the set first, it would increment the count, possibly
7978 * to 2 which would cause it to be output as starting a range in the
7979 * intersection, and the next time through we would take that same
7980 * number, and output it again as ending the set. By doing it the
7981 * opposite of this, there is no possibility that the count will be
7982 * momentarily incremented to 2. (In a tie and both are in the set or
7983 * both not in the set, it doesn't matter which we take first.) */
7984 if (array_a[i_a] < array_b[i_b]
7985 || (array_a[i_a] == array_b[i_b]
7986 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7988 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7992 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7996 /* Here, have chosen which of the two inputs to look at. Only output
7997 * if the running count changes to/from 2, which marks the
7998 * beginning/end of a range that's in the intersection */
8002 array_r[i_r++] = cp;
8007 array_r[i_r++] = cp;
8013 /* Here, we are finished going through at least one of the lists, which
8014 * means there is something remaining in at most one. We check if the list
8015 * that has been exhausted is positioned such that we are in the middle
8016 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8017 * the ones we care about.) There are four cases:
8018 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8019 * nothing left in the intersection.
8020 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8021 * above 2. What should be output is exactly that which is in the
8022 * non-exhausted set, as everything it has is also in the intersection
8023 * set, and everything it doesn't have can't be in the intersection
8024 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8025 * gets incremented to 2. Like the previous case, the intersection is
8026 * everything that remains in the non-exhausted set.
8027 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8028 * remains 1. And the intersection has nothing more. */
8029 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8030 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8035 /* The final length is what we've output so far plus what else is in the
8036 * intersection. At most one of the subexpressions below will be non-zero */
8039 len_r += (len_a - i_a) + (len_b - i_b);
8042 /* Set result to final length, which can change the pointer to array_r, so
8044 if (len_r != _invlist_len(r)) {
8045 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8047 array_r = invlist_array(r);
8050 /* Finish outputting any remaining */
8051 if (count >= 2) { /* At most one will have a non-zero copy count */
8053 if ((copy_count = len_a - i_a) > 0) {
8054 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8056 else if ((copy_count = len_b - i_b) > 0) {
8057 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8061 /* We may be removing a reference to one of the inputs */
8062 if (a == *i || b == *i) {
8063 assert(! invlist_is_iterating(*i));
8064 SvREFCNT_dec_NN(*i);
8072 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8074 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8075 * set. A pointer to the inversion list is returned. This may actually be
8076 * a new list, in which case the passed in one has been destroyed. The
8077 * passed in inversion list can be NULL, in which case a new one is created
8078 * with just the one range in it */
8083 if (invlist == NULL) {
8084 invlist = _new_invlist(2);
8088 len = _invlist_len(invlist);
8091 /* If comes after the final entry actually in the list, can just append it
8094 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8095 && start >= invlist_array(invlist)[len - 1]))
8097 _append_range_to_invlist(invlist, start, end);
8101 /* Here, can't just append things, create and return a new inversion list
8102 * which is the union of this range and the existing inversion list */
8103 range_invlist = _new_invlist(2);
8104 _append_range_to_invlist(range_invlist, start, end);
8106 _invlist_union(invlist, range_invlist, &invlist);
8108 /* The temporary can be freed */
8109 SvREFCNT_dec_NN(range_invlist);
8116 PERL_STATIC_INLINE SV*
8117 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8118 return _add_range_to_invlist(invlist, cp, cp);
8121 #ifndef PERL_IN_XSUB_RE
8123 Perl__invlist_invert(pTHX_ SV* const invlist)
8125 /* Complement the input inversion list. This adds a 0 if the list didn't
8126 * have a zero; removes it otherwise. As described above, the data
8127 * structure is set up so that this is very efficient */
8129 PERL_ARGS_ASSERT__INVLIST_INVERT;
8131 assert(! invlist_is_iterating(invlist));
8133 /* The inverse of matching nothing is matching everything */
8134 if (_invlist_len(invlist) == 0) {
8135 _append_range_to_invlist(invlist, 0, UV_MAX);
8139 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8143 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8145 /* Complement the input inversion list (which must be a Unicode property,
8146 * all of which don't match above the Unicode maximum code point.) And
8147 * Perl has chosen to not have the inversion match above that either. This
8148 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8154 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8156 _invlist_invert(invlist);
8158 len = _invlist_len(invlist);
8160 if (len != 0) { /* If empty do nothing */
8161 array = invlist_array(invlist);
8162 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8163 /* Add 0x110000. First, grow if necessary */
8165 if (invlist_max(invlist) < len) {
8166 invlist_extend(invlist, len);
8167 array = invlist_array(invlist);
8169 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8170 array[len - 1] = PERL_UNICODE_MAX + 1;
8172 else { /* Remove the 0x110000 */
8173 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8181 PERL_STATIC_INLINE SV*
8182 S_invlist_clone(pTHX_ SV* const invlist)
8185 /* Return a new inversion list that is a copy of the input one, which is
8188 /* Need to allocate extra space to accommodate Perl's addition of a
8189 * trailing NUL to SvPV's, since it thinks they are always strings */
8190 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8191 STRLEN physical_length = SvCUR(invlist);
8192 bool offset = *(get_invlist_offset_addr(invlist));
8194 PERL_ARGS_ASSERT_INVLIST_CLONE;
8196 *(get_invlist_offset_addr(new_invlist)) = offset;
8197 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8198 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8203 PERL_STATIC_INLINE STRLEN*
8204 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8206 /* Return the address of the UV that contains the current iteration
8209 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8211 assert(SvTYPE(invlist) == SVt_INVLIST);
8213 return &(((XINVLIST*) SvANY(invlist))->iterator);
8216 PERL_STATIC_INLINE void
8217 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8219 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8221 *get_invlist_iter_addr(invlist) = 0;
8224 PERL_STATIC_INLINE void
8225 S_invlist_iterfinish(pTHX_ SV* invlist)
8227 /* Terminate iterator for invlist. This is to catch development errors.
8228 * Any iteration that is interrupted before completed should call this
8229 * function. Functions that add code points anywhere else but to the end
8230 * of an inversion list assert that they are not in the middle of an
8231 * iteration. If they were, the addition would make the iteration
8232 * problematical: if the iteration hadn't reached the place where things
8233 * were being added, it would be ok */
8235 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8237 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8241 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8243 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8244 * This call sets in <*start> and <*end>, the next range in <invlist>.
8245 * Returns <TRUE> if successful and the next call will return the next
8246 * range; <FALSE> if was already at the end of the list. If the latter,
8247 * <*start> and <*end> are unchanged, and the next call to this function
8248 * will start over at the beginning of the list */
8250 STRLEN* pos = get_invlist_iter_addr(invlist);
8251 UV len = _invlist_len(invlist);
8254 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8257 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8261 array = invlist_array(invlist);
8263 *start = array[(*pos)++];
8269 *end = array[(*pos)++] - 1;
8275 PERL_STATIC_INLINE bool
8276 S_invlist_is_iterating(pTHX_ SV* const invlist)
8278 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8280 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8283 PERL_STATIC_INLINE UV
8284 S_invlist_highest(pTHX_ SV* const invlist)
8286 /* Returns the highest code point that matches an inversion list. This API
8287 * has an ambiguity, as it returns 0 under either the highest is actually
8288 * 0, or if the list is empty. If this distinction matters to you, check
8289 * for emptiness before calling this function */
8291 UV len = _invlist_len(invlist);
8294 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8300 array = invlist_array(invlist);
8302 /* The last element in the array in the inversion list always starts a
8303 * range that goes to infinity. That range may be for code points that are
8304 * matched in the inversion list, or it may be for ones that aren't
8305 * matched. In the latter case, the highest code point in the set is one
8306 * less than the beginning of this range; otherwise it is the final element
8307 * of this range: infinity */
8308 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8310 : array[len - 1] - 1;
8313 #ifndef PERL_IN_XSUB_RE
8315 Perl__invlist_contents(pTHX_ SV* const invlist)
8317 /* Get the contents of an inversion list into a string SV so that they can
8318 * be printed out. It uses the format traditionally done for debug tracing
8322 SV* output = newSVpvs("\n");
8324 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8326 assert(! invlist_is_iterating(invlist));
8328 invlist_iterinit(invlist);
8329 while (invlist_iternext(invlist, &start, &end)) {
8330 if (end == UV_MAX) {
8331 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8333 else if (end != start) {
8334 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8338 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8346 #ifndef PERL_IN_XSUB_RE
8348 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8350 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8351 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8352 * the string 'indent'. The output looks like this:
8353 [0] 0x000A .. 0x000D
8355 [4] 0x2028 .. 0x2029
8356 [6] 0x3104 .. INFINITY
8357 * This means that the first range of code points matched by the list are
8358 * 0xA through 0xD; the second range contains only the single code point
8359 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8360 * are used to define each range (except if the final range extends to
8361 * infinity, only a single element is needed). The array index of the
8362 * first element for the corresponding range is given in brackets. */
8367 PERL_ARGS_ASSERT__INVLIST_DUMP;
8369 if (invlist_is_iterating(invlist)) {
8370 Perl_dump_indent(aTHX_ level, file,
8371 "%sCan't dump inversion list because is in middle of iterating\n",
8376 invlist_iterinit(invlist);
8377 while (invlist_iternext(invlist, &start, &end)) {
8378 if (end == UV_MAX) {
8379 Perl_dump_indent(aTHX_ level, file,
8380 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8381 indent, (UV)count, start);
8383 else if (end != start) {
8384 Perl_dump_indent(aTHX_ level, file,
8385 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8386 indent, (UV)count, start, end);
8389 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8390 indent, (UV)count, start);
8397 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8399 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8401 /* Return a boolean as to if the two passed in inversion lists are
8402 * identical. The final argument, if TRUE, says to take the complement of
8403 * the second inversion list before doing the comparison */
8405 const UV* array_a = invlist_array(a);
8406 const UV* array_b = invlist_array(b);
8407 UV len_a = _invlist_len(a);
8408 UV len_b = _invlist_len(b);
8410 UV i = 0; /* current index into the arrays */
8411 bool retval = TRUE; /* Assume are identical until proven otherwise */
8413 PERL_ARGS_ASSERT__INVLISTEQ;
8415 /* If are to compare 'a' with the complement of b, set it
8416 * up so are looking at b's complement. */
8419 /* The complement of nothing is everything, so <a> would have to have
8420 * just one element, starting at zero (ending at infinity) */
8422 return (len_a == 1 && array_a[0] == 0);
8424 else if (array_b[0] == 0) {
8426 /* Otherwise, to complement, we invert. Here, the first element is
8427 * 0, just remove it. To do this, we just pretend the array starts
8435 /* But if the first element is not zero, we pretend the list starts
8436 * at the 0 that is always stored immediately before the array. */
8442 /* Make sure that the lengths are the same, as well as the final element
8443 * before looping through the remainder. (Thus we test the length, final,
8444 * and first elements right off the bat) */
8445 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8448 else for (i = 0; i < len_a - 1; i++) {
8449 if (array_a[i] != array_b[i]) {
8459 #undef HEADER_LENGTH
8460 #undef TO_INTERNAL_SIZE
8461 #undef FROM_INTERNAL_SIZE
8462 #undef INVLIST_VERSION_ID
8464 /* End of inversion list object */
8467 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8469 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8470 * constructs, and updates RExC_flags with them. On input, RExC_parse
8471 * should point to the first flag; it is updated on output to point to the
8472 * final ')' or ':'. There needs to be at least one flag, or this will
8475 /* for (?g), (?gc), and (?o) warnings; warning
8476 about (?c) will warn about (?g) -- japhy */
8478 #define WASTED_O 0x01
8479 #define WASTED_G 0x02
8480 #define WASTED_C 0x04
8481 #define WASTED_GC (WASTED_G|WASTED_C)
8482 I32 wastedflags = 0x00;
8483 U32 posflags = 0, negflags = 0;
8484 U32 *flagsp = &posflags;
8485 char has_charset_modifier = '\0';
8487 bool has_use_defaults = FALSE;
8488 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8490 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8492 /* '^' as an initial flag sets certain defaults */
8493 if (UCHARAT(RExC_parse) == '^') {
8495 has_use_defaults = TRUE;
8496 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8497 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8498 ? REGEX_UNICODE_CHARSET
8499 : REGEX_DEPENDS_CHARSET);
8502 cs = get_regex_charset(RExC_flags);
8503 if (cs == REGEX_DEPENDS_CHARSET
8504 && (RExC_utf8 || RExC_uni_semantics))
8506 cs = REGEX_UNICODE_CHARSET;
8509 while (*RExC_parse) {
8510 /* && strchr("iogcmsx", *RExC_parse) */
8511 /* (?g), (?gc) and (?o) are useless here
8512 and must be globally applied -- japhy */
8513 switch (*RExC_parse) {
8515 /* Code for the imsx flags */
8516 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8518 case LOCALE_PAT_MOD:
8519 if (has_charset_modifier) {
8520 goto excess_modifier;
8522 else if (flagsp == &negflags) {
8525 cs = REGEX_LOCALE_CHARSET;
8526 has_charset_modifier = LOCALE_PAT_MOD;
8527 RExC_contains_locale = 1;
8529 case UNICODE_PAT_MOD:
8530 if (has_charset_modifier) {
8531 goto excess_modifier;
8533 else if (flagsp == &negflags) {
8536 cs = REGEX_UNICODE_CHARSET;
8537 has_charset_modifier = UNICODE_PAT_MOD;
8539 case ASCII_RESTRICT_PAT_MOD:
8540 if (flagsp == &negflags) {
8543 if (has_charset_modifier) {
8544 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8545 goto excess_modifier;
8547 /* Doubled modifier implies more restricted */
8548 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8551 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8553 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8555 case DEPENDS_PAT_MOD:
8556 if (has_use_defaults) {
8557 goto fail_modifiers;
8559 else if (flagsp == &negflags) {
8562 else if (has_charset_modifier) {
8563 goto excess_modifier;
8566 /* The dual charset means unicode semantics if the
8567 * pattern (or target, not known until runtime) are
8568 * utf8, or something in the pattern indicates unicode
8570 cs = (RExC_utf8 || RExC_uni_semantics)
8571 ? REGEX_UNICODE_CHARSET
8572 : REGEX_DEPENDS_CHARSET;
8573 has_charset_modifier = DEPENDS_PAT_MOD;
8577 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8578 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8580 else if (has_charset_modifier == *(RExC_parse - 1)) {
8581 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8584 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8589 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8591 case ONCE_PAT_MOD: /* 'o' */
8592 case GLOBAL_PAT_MOD: /* 'g' */
8593 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8594 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8595 if (! (wastedflags & wflagbit) ) {
8596 wastedflags |= wflagbit;
8597 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8600 "Useless (%s%c) - %suse /%c modifier",
8601 flagsp == &negflags ? "?-" : "?",
8603 flagsp == &negflags ? "don't " : "",
8610 case CONTINUE_PAT_MOD: /* 'c' */
8611 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8612 if (! (wastedflags & WASTED_C) ) {
8613 wastedflags |= WASTED_GC;
8614 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8617 "Useless (%sc) - %suse /gc modifier",
8618 flagsp == &negflags ? "?-" : "?",
8619 flagsp == &negflags ? "don't " : ""
8624 case KEEPCOPY_PAT_MOD: /* 'p' */
8625 if (flagsp == &negflags) {
8627 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8629 *flagsp |= RXf_PMf_KEEPCOPY;
8633 /* A flag is a default iff it is following a minus, so
8634 * if there is a minus, it means will be trying to
8635 * re-specify a default which is an error */
8636 if (has_use_defaults || flagsp == &negflags) {
8637 goto fail_modifiers;
8640 wastedflags = 0; /* reset so (?g-c) warns twice */
8644 RExC_flags |= posflags;
8645 RExC_flags &= ~negflags;
8646 set_regex_charset(&RExC_flags, cs);
8651 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
8652 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
8653 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
8662 - reg - regular expression, i.e. main body or parenthesized thing
8664 * Caller must absorb opening parenthesis.
8666 * Combining parenthesis handling with the base level of regular expression
8667 * is a trifle forced, but the need to tie the tails of the branches to what
8668 * follows makes it hard to avoid.
8670 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8672 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8674 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8677 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8678 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8679 needs to be restarted.
8680 Otherwise would only return NULL if regbranch() returns NULL, which
8683 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8684 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8685 * 2 is like 1, but indicates that nextchar() has been called to advance
8686 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
8687 * this flag alerts us to the need to check for that */
8690 regnode *ret; /* Will be the head of the group. */
8693 regnode *ender = NULL;
8696 U32 oregflags = RExC_flags;
8697 bool have_branch = 0;
8699 I32 freeze_paren = 0;
8700 I32 after_freeze = 0;
8702 char * parse_start = RExC_parse; /* MJD */
8703 char * const oregcomp_parse = RExC_parse;
8705 GET_RE_DEBUG_FLAGS_DECL;
8707 PERL_ARGS_ASSERT_REG;
8708 DEBUG_PARSE("reg ");
8710 *flagp = 0; /* Tentatively. */
8713 /* Make an OPEN node, if parenthesized. */
8716 /* Under /x, space and comments can be gobbled up between the '(' and
8717 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
8718 * intervening space, as the sequence is a token, and a token should be
8720 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8722 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8723 char *start_verb = RExC_parse;
8724 STRLEN verb_len = 0;
8725 char *start_arg = NULL;
8726 unsigned char op = 0;
8728 int internal_argval = 0; /* internal_argval is only useful if !argok */
8730 if (has_intervening_patws && SIZE_ONLY) {
8731 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8733 while ( *RExC_parse && *RExC_parse != ')' ) {
8734 if ( *RExC_parse == ':' ) {
8735 start_arg = RExC_parse + 1;
8741 verb_len = RExC_parse - start_verb;
8744 while ( *RExC_parse && *RExC_parse != ')' )
8746 if ( *RExC_parse != ')' )
8747 vFAIL("Unterminated verb pattern argument");
8748 if ( RExC_parse == start_arg )
8751 if ( *RExC_parse != ')' )
8752 vFAIL("Unterminated verb pattern");
8755 switch ( *start_verb ) {
8756 case 'A': /* (*ACCEPT) */
8757 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8759 internal_argval = RExC_nestroot;
8762 case 'C': /* (*COMMIT) */
8763 if ( memEQs(start_verb,verb_len,"COMMIT") )
8766 case 'F': /* (*FAIL) */
8767 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8772 case ':': /* (*:NAME) */
8773 case 'M': /* (*MARK:NAME) */
8774 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8779 case 'P': /* (*PRUNE) */
8780 if ( memEQs(start_verb,verb_len,"PRUNE") )
8783 case 'S': /* (*SKIP) */
8784 if ( memEQs(start_verb,verb_len,"SKIP") )
8787 case 'T': /* (*THEN) */
8788 /* [19:06] <TimToady> :: is then */
8789 if ( memEQs(start_verb,verb_len,"THEN") ) {
8791 RExC_seen |= REG_SEEN_CUTGROUP;
8796 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
8798 "Unknown verb pattern '%"UTF8f"'",
8799 UTF8fARG(UTF, verb_len, start_verb));
8802 if ( start_arg && internal_argval ) {
8803 vFAIL3("Verb pattern '%.*s' may not have an argument",
8804 verb_len, start_verb);
8805 } else if ( argok < 0 && !start_arg ) {
8806 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8807 verb_len, start_verb);
8809 ret = reganode(pRExC_state, op, internal_argval);
8810 if ( ! internal_argval && ! SIZE_ONLY ) {
8812 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8813 ARG(ret) = add_data( pRExC_state, 1, "S" );
8814 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8821 if (!internal_argval)
8822 RExC_seen |= REG_SEEN_VERBARG;
8823 } else if ( start_arg ) {
8824 vFAIL3("Verb pattern '%.*s' may not have an argument",
8825 verb_len, start_verb);
8827 ret = reg_node(pRExC_state, op);
8829 nextchar(pRExC_state);
8832 else if (*RExC_parse == '?') { /* (?...) */
8833 bool is_logical = 0;
8834 const char * const seqstart = RExC_parse;
8835 if (has_intervening_patws && SIZE_ONLY) {
8836 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8840 paren = *RExC_parse++;
8841 ret = NULL; /* For look-ahead/behind. */
8844 case 'P': /* (?P...) variants for those used to PCRE/Python */
8845 paren = *RExC_parse++;
8846 if ( paren == '<') /* (?P<...>) named capture */
8848 else if (paren == '>') { /* (?P>name) named recursion */
8849 goto named_recursion;
8851 else if (paren == '=') { /* (?P=...) named backref */
8852 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8853 you change this make sure you change that */
8854 char* name_start = RExC_parse;
8856 SV *sv_dat = reg_scan_name(pRExC_state,
8857 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8858 if (RExC_parse == name_start || *RExC_parse != ')')
8859 vFAIL2("Sequence %.3s... not terminated",parse_start);
8862 num = add_data( pRExC_state, 1, "S" );
8863 RExC_rxi->data->data[num]=(void*)sv_dat;
8864 SvREFCNT_inc_simple_void(sv_dat);
8867 ret = reganode(pRExC_state,
8870 : (ASCII_FOLD_RESTRICTED)
8872 : (AT_LEAST_UNI_SEMANTICS)
8880 Set_Node_Offset(ret, parse_start+1);
8881 Set_Node_Cur_Length(ret, parse_start);
8883 nextchar(pRExC_state);
8887 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8889 case '<': /* (?<...) */
8890 if (*RExC_parse == '!')
8892 else if (*RExC_parse != '=')
8898 case '\'': /* (?'...') */
8899 name_start= RExC_parse;
8900 svname = reg_scan_name(pRExC_state,
8901 SIZE_ONLY ? /* reverse test from the others */
8902 REG_RSN_RETURN_NAME :
8903 REG_RSN_RETURN_NULL);
8904 if (RExC_parse == name_start) {
8906 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8909 if (*RExC_parse != paren)
8910 vFAIL2("Sequence (?%c... not terminated",
8911 paren=='>' ? '<' : paren);
8915 if (!svname) /* shouldn't happen */
8917 "panic: reg_scan_name returned NULL");
8918 if (!RExC_paren_names) {
8919 RExC_paren_names= newHV();
8920 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8922 RExC_paren_name_list= newAV();
8923 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8926 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8928 sv_dat = HeVAL(he_str);
8930 /* croak baby croak */
8932 "panic: paren_name hash element allocation failed");
8933 } else if ( SvPOK(sv_dat) ) {
8934 /* (?|...) can mean we have dupes so scan to check
8935 its already been stored. Maybe a flag indicating
8936 we are inside such a construct would be useful,
8937 but the arrays are likely to be quite small, so
8938 for now we punt -- dmq */
8939 IV count = SvIV(sv_dat);
8940 I32 *pv = (I32*)SvPVX(sv_dat);
8942 for ( i = 0 ; i < count ; i++ ) {
8943 if ( pv[i] == RExC_npar ) {
8949 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8950 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8951 pv[count] = RExC_npar;
8952 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8955 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8956 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8958 SvIV_set(sv_dat, 1);
8961 /* Yes this does cause a memory leak in debugging Perls */
8962 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8963 SvREFCNT_dec_NN(svname);
8966 /*sv_dump(sv_dat);*/
8968 nextchar(pRExC_state);
8970 goto capturing_parens;
8972 RExC_seen |= REG_SEEN_LOOKBEHIND;
8973 RExC_in_lookbehind++;
8975 case '=': /* (?=...) */
8976 RExC_seen_zerolen++;
8978 case '!': /* (?!...) */
8979 RExC_seen_zerolen++;
8980 if (*RExC_parse == ')') {
8981 ret=reg_node(pRExC_state, OPFAIL);
8982 nextchar(pRExC_state);
8986 case '|': /* (?|...) */
8987 /* branch reset, behave like a (?:...) except that
8988 buffers in alternations share the same numbers */
8990 after_freeze = freeze_paren = RExC_npar;
8992 case ':': /* (?:...) */
8993 case '>': /* (?>...) */
8995 case '$': /* (?$...) */
8996 case '@': /* (?@...) */
8997 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8999 case '#': /* (?#...) */
9000 /* XXX As soon as we disallow separating the '?' and '*' (by
9001 * spaces or (?#...) comment), it is believed that this case
9002 * will be unreachable and can be removed. See
9004 while (*RExC_parse && *RExC_parse != ')')
9006 if (*RExC_parse != ')')
9007 FAIL("Sequence (?#... not terminated");
9008 nextchar(pRExC_state);
9011 case '0' : /* (?0) */
9012 case 'R' : /* (?R) */
9013 if (*RExC_parse != ')')
9014 FAIL("Sequence (?R) not terminated");
9015 ret = reg_node(pRExC_state, GOSTART);
9016 *flagp |= POSTPONED;
9017 nextchar(pRExC_state);
9020 { /* named and numeric backreferences */
9022 case '&': /* (?&NAME) */
9023 parse_start = RExC_parse - 1;
9026 SV *sv_dat = reg_scan_name(pRExC_state,
9027 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9028 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9030 goto gen_recurse_regop;
9031 assert(0); /* NOT REACHED */
9033 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9035 vFAIL("Illegal pattern");
9037 goto parse_recursion;
9039 case '-': /* (?-1) */
9040 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9041 RExC_parse--; /* rewind to let it be handled later */
9045 case '1': case '2': case '3': case '4': /* (?1) */
9046 case '5': case '6': case '7': case '8': case '9':
9049 num = atoi(RExC_parse);
9050 parse_start = RExC_parse - 1; /* MJD */
9051 if (*RExC_parse == '-')
9053 while (isDIGIT(*RExC_parse))
9055 if (*RExC_parse!=')')
9056 vFAIL("Expecting close bracket");
9059 if ( paren == '-' ) {
9061 Diagram of capture buffer numbering.
9062 Top line is the normal capture buffer numbers
9063 Bottom line is the negative indexing as from
9067 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9071 num = RExC_npar + num;
9074 vFAIL("Reference to nonexistent group");
9076 } else if ( paren == '+' ) {
9077 num = RExC_npar + num - 1;
9080 ret = reganode(pRExC_state, GOSUB, num);
9082 if (num > (I32)RExC_rx->nparens) {
9084 vFAIL("Reference to nonexistent group");
9086 ARG2L_SET( ret, RExC_recurse_count++);
9088 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9089 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9093 RExC_seen |= REG_SEEN_RECURSE;
9094 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9095 Set_Node_Offset(ret, parse_start); /* MJD */
9097 *flagp |= POSTPONED;
9098 nextchar(pRExC_state);
9100 } /* named and numeric backreferences */
9101 assert(0); /* NOT REACHED */
9103 case '?': /* (??...) */
9105 if (*RExC_parse != '{') {
9108 "Sequence (%"UTF8f"...) not recognized",
9109 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9112 *flagp |= POSTPONED;
9113 paren = *RExC_parse++;
9115 case '{': /* (?{...}) */
9118 struct reg_code_block *cb;
9120 RExC_seen_zerolen++;
9122 if ( !pRExC_state->num_code_blocks
9123 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9124 || pRExC_state->code_blocks[pRExC_state->code_index].start
9125 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9128 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9129 FAIL("panic: Sequence (?{...}): no code block found\n");
9130 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9132 /* this is a pre-compiled code block (?{...}) */
9133 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9134 RExC_parse = RExC_start + cb->end;
9137 if (cb->src_regex) {
9138 n = add_data(pRExC_state, 2, "rl");
9139 RExC_rxi->data->data[n] =
9140 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9141 RExC_rxi->data->data[n+1] = (void*)o;
9144 n = add_data(pRExC_state, 1,
9145 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9146 RExC_rxi->data->data[n] = (void*)o;
9149 pRExC_state->code_index++;
9150 nextchar(pRExC_state);
9154 ret = reg_node(pRExC_state, LOGICAL);
9155 eval = reganode(pRExC_state, EVAL, n);
9158 /* for later propagation into (??{}) return value */
9159 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9161 REGTAIL(pRExC_state, ret, eval);
9162 /* deal with the length of this later - MJD */
9165 ret = reganode(pRExC_state, EVAL, n);
9166 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9167 Set_Node_Offset(ret, parse_start);
9170 case '(': /* (?(?{...})...) and (?(?=...)...) */
9173 if (RExC_parse[0] == '?') { /* (?(?...)) */
9174 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9175 || RExC_parse[1] == '<'
9176 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9180 ret = reg_node(pRExC_state, LOGICAL);
9184 tail = reg(pRExC_state, 1, &flag, depth+1);
9185 if (flag & RESTART_UTF8) {
9186 *flagp = RESTART_UTF8;
9189 REGTAIL(pRExC_state, ret, tail);
9193 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9194 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9196 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9197 char *name_start= RExC_parse++;
9199 SV *sv_dat=reg_scan_name(pRExC_state,
9200 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9201 if (RExC_parse == name_start || *RExC_parse != ch)
9202 vFAIL2("Sequence (?(%c... not terminated",
9203 (ch == '>' ? '<' : ch));
9206 num = add_data( pRExC_state, 1, "S" );
9207 RExC_rxi->data->data[num]=(void*)sv_dat;
9208 SvREFCNT_inc_simple_void(sv_dat);
9210 ret = reganode(pRExC_state,NGROUPP,num);
9211 goto insert_if_check_paren;
9213 else if (RExC_parse[0] == 'D' &&
9214 RExC_parse[1] == 'E' &&
9215 RExC_parse[2] == 'F' &&
9216 RExC_parse[3] == 'I' &&
9217 RExC_parse[4] == 'N' &&
9218 RExC_parse[5] == 'E')
9220 ret = reganode(pRExC_state,DEFINEP,0);
9223 goto insert_if_check_paren;
9225 else if (RExC_parse[0] == 'R') {
9228 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9229 parno = atoi(RExC_parse++);
9230 while (isDIGIT(*RExC_parse))
9232 } else if (RExC_parse[0] == '&') {
9235 sv_dat = reg_scan_name(pRExC_state,
9236 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9237 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9239 ret = reganode(pRExC_state,INSUBP,parno);
9240 goto insert_if_check_paren;
9242 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9246 parno = atoi(RExC_parse++);
9248 while (isDIGIT(*RExC_parse))
9250 ret = reganode(pRExC_state, GROUPP, parno);
9252 insert_if_check_paren:
9253 if (*(tmp = nextchar(pRExC_state)) != ')') {
9255 /* Like the name implies, nextchar deals in chars,
9256 * not characters, so if under UTF, undo its work
9257 * and skip over the the next character.
9260 RExC_parse += UTF8SKIP(RExC_parse);
9262 vFAIL("Switch condition not recognized");
9265 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9266 br = regbranch(pRExC_state, &flags, 1,depth+1);
9268 if (flags & RESTART_UTF8) {
9269 *flagp = RESTART_UTF8;
9272 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9275 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9276 c = *nextchar(pRExC_state);
9281 vFAIL("(?(DEFINE)....) does not allow branches");
9282 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9283 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9284 if (flags & RESTART_UTF8) {
9285 *flagp = RESTART_UTF8;
9288 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9291 REGTAIL(pRExC_state, ret, lastbr);
9294 c = *nextchar(pRExC_state);
9299 vFAIL("Switch (?(condition)... contains too many branches");
9300 ender = reg_node(pRExC_state, TAIL);
9301 REGTAIL(pRExC_state, br, ender);
9303 REGTAIL(pRExC_state, lastbr, ender);
9304 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9307 REGTAIL(pRExC_state, ret, ender);
9308 RExC_size++; /* XXX WHY do we need this?!!
9309 For large programs it seems to be required
9310 but I can't figure out why. -- dmq*/
9314 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9315 vFAIL("Unknown switch condition (?(...))");
9318 case '[': /* (?[ ... ]) */
9319 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9322 RExC_parse--; /* for vFAIL to print correctly */
9323 vFAIL("Sequence (? incomplete");
9325 default: /* e.g., (?i) */
9328 parse_lparen_question_flags(pRExC_state);
9329 if (UCHARAT(RExC_parse) != ':') {
9330 nextchar(pRExC_state);
9335 nextchar(pRExC_state);
9345 ret = reganode(pRExC_state, OPEN, parno);
9348 RExC_nestroot = parno;
9349 if (RExC_seen & REG_SEEN_RECURSE
9350 && !RExC_open_parens[parno-1])
9352 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9353 "Setting open paren #%"IVdf" to %d\n",
9354 (IV)parno, REG_NODE_NUM(ret)));
9355 RExC_open_parens[parno-1]= ret;
9358 Set_Node_Length(ret, 1); /* MJD */
9359 Set_Node_Offset(ret, RExC_parse); /* MJD */
9367 /* Pick up the branches, linking them together. */
9368 parse_start = RExC_parse; /* MJD */
9369 br = regbranch(pRExC_state, &flags, 1,depth+1);
9371 /* branch_len = (paren != 0); */
9374 if (flags & RESTART_UTF8) {
9375 *flagp = RESTART_UTF8;
9378 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9380 if (*RExC_parse == '|') {
9381 if (!SIZE_ONLY && RExC_extralen) {
9382 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9385 reginsert(pRExC_state, BRANCH, br, depth+1);
9386 Set_Node_Length(br, paren != 0);
9387 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9391 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9393 else if (paren == ':') {
9394 *flagp |= flags&SIMPLE;
9396 if (is_open) { /* Starts with OPEN. */
9397 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9399 else if (paren != '?') /* Not Conditional */
9401 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9403 while (*RExC_parse == '|') {
9404 if (!SIZE_ONLY && RExC_extralen) {
9405 ender = reganode(pRExC_state, LONGJMP,0);
9406 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9409 RExC_extralen += 2; /* Account for LONGJMP. */
9410 nextchar(pRExC_state);
9412 if (RExC_npar > after_freeze)
9413 after_freeze = RExC_npar;
9414 RExC_npar = freeze_paren;
9416 br = regbranch(pRExC_state, &flags, 0, depth+1);
9419 if (flags & RESTART_UTF8) {
9420 *flagp = RESTART_UTF8;
9423 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9425 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9427 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9430 if (have_branch || paren != ':') {
9431 /* Make a closing node, and hook it on the end. */
9434 ender = reg_node(pRExC_state, TAIL);
9437 ender = reganode(pRExC_state, CLOSE, parno);
9438 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9439 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9440 "Setting close paren #%"IVdf" to %d\n",
9441 (IV)parno, REG_NODE_NUM(ender)));
9442 RExC_close_parens[parno-1]= ender;
9443 if (RExC_nestroot == parno)
9446 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9447 Set_Node_Length(ender,1); /* MJD */
9453 *flagp &= ~HASWIDTH;
9456 ender = reg_node(pRExC_state, SUCCEED);
9459 ender = reg_node(pRExC_state, END);
9461 assert(!RExC_opend); /* there can only be one! */
9466 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9467 SV * const mysv_val1=sv_newmortal();
9468 SV * const mysv_val2=sv_newmortal();
9469 DEBUG_PARSE_MSG("lsbr");
9470 regprop(RExC_rx, mysv_val1, lastbr);
9471 regprop(RExC_rx, mysv_val2, ender);
9472 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9473 SvPV_nolen_const(mysv_val1),
9474 (IV)REG_NODE_NUM(lastbr),
9475 SvPV_nolen_const(mysv_val2),
9476 (IV)REG_NODE_NUM(ender),
9477 (IV)(ender - lastbr)
9480 REGTAIL(pRExC_state, lastbr, ender);
9482 if (have_branch && !SIZE_ONLY) {
9485 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9487 /* Hook the tails of the branches to the closing node. */
9488 for (br = ret; br; br = regnext(br)) {
9489 const U8 op = PL_regkind[OP(br)];
9491 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9492 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9495 else if (op == BRANCHJ) {
9496 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9497 /* for now we always disable this optimisation * /
9498 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9504 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9505 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9506 SV * const mysv_val1=sv_newmortal();
9507 SV * const mysv_val2=sv_newmortal();
9508 DEBUG_PARSE_MSG("NADA");
9509 regprop(RExC_rx, mysv_val1, ret);
9510 regprop(RExC_rx, mysv_val2, ender);
9511 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9512 SvPV_nolen_const(mysv_val1),
9513 (IV)REG_NODE_NUM(ret),
9514 SvPV_nolen_const(mysv_val2),
9515 (IV)REG_NODE_NUM(ender),
9520 if (OP(ender) == TAIL) {
9525 for ( opt= br + 1; opt < ender ; opt++ )
9527 NEXT_OFF(br)= ender - br;
9535 static const char parens[] = "=!<,>";
9537 if (paren && (p = strchr(parens, paren))) {
9538 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9539 int flag = (p - parens) > 1;
9542 node = SUSPEND, flag = 0;
9543 reginsert(pRExC_state, node,ret, depth+1);
9544 Set_Node_Cur_Length(ret, parse_start);
9545 Set_Node_Offset(ret, parse_start + 1);
9547 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9551 /* Check for proper termination. */
9553 /* restore original flags, but keep (?p) */
9554 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9555 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9556 RExC_parse = oregcomp_parse;
9557 vFAIL("Unmatched (");
9560 else if (!paren && RExC_parse < RExC_end) {
9561 if (*RExC_parse == ')') {
9563 vFAIL("Unmatched )");
9566 FAIL("Junk on end of regexp"); /* "Can't happen". */
9567 assert(0); /* NOTREACHED */
9570 if (RExC_in_lookbehind) {
9571 RExC_in_lookbehind--;
9573 if (after_freeze > RExC_npar)
9574 RExC_npar = after_freeze;
9579 - regbranch - one alternative of an | operator
9581 * Implements the concatenation operator.
9583 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9587 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9591 regnode *chain = NULL;
9593 I32 flags = 0, c = 0;
9594 GET_RE_DEBUG_FLAGS_DECL;
9596 PERL_ARGS_ASSERT_REGBRANCH;
9598 DEBUG_PARSE("brnc");
9603 if (!SIZE_ONLY && RExC_extralen)
9604 ret = reganode(pRExC_state, BRANCHJ,0);
9606 ret = reg_node(pRExC_state, BRANCH);
9607 Set_Node_Length(ret, 1);
9611 if (!first && SIZE_ONLY)
9612 RExC_extralen += 1; /* BRANCHJ */
9614 *flagp = WORST; /* Tentatively. */
9617 nextchar(pRExC_state);
9618 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9620 latest = regpiece(pRExC_state, &flags,depth+1);
9621 if (latest == NULL) {
9622 if (flags & TRYAGAIN)
9624 if (flags & RESTART_UTF8) {
9625 *flagp = RESTART_UTF8;
9628 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9630 else if (ret == NULL)
9632 *flagp |= flags&(HASWIDTH|POSTPONED);
9633 if (chain == NULL) /* First piece. */
9634 *flagp |= flags&SPSTART;
9637 REGTAIL(pRExC_state, chain, latest);
9642 if (chain == NULL) { /* Loop ran zero times. */
9643 chain = reg_node(pRExC_state, NOTHING);
9648 *flagp |= flags&SIMPLE;
9655 - regpiece - something followed by possible [*+?]
9657 * Note that the branching code sequences used for ? and the general cases
9658 * of * and + are somewhat optimized: they use the same NOTHING node as
9659 * both the endmarker for their branch list and the body of the last branch.
9660 * It might seem that this node could be dispensed with entirely, but the
9661 * endmarker role is not redundant.
9663 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9665 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9669 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9676 const char * const origparse = RExC_parse;
9678 I32 max = REG_INFTY;
9679 #ifdef RE_TRACK_PATTERN_OFFSETS
9682 const char *maxpos = NULL;
9684 /* Save the original in case we change the emitted regop to a FAIL. */
9685 regnode * const orig_emit = RExC_emit;
9687 GET_RE_DEBUG_FLAGS_DECL;
9689 PERL_ARGS_ASSERT_REGPIECE;
9691 DEBUG_PARSE("piec");
9693 ret = regatom(pRExC_state, &flags,depth+1);
9695 if (flags & (TRYAGAIN|RESTART_UTF8))
9696 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9698 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9704 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9706 #ifdef RE_TRACK_PATTERN_OFFSETS
9707 parse_start = RExC_parse; /* MJD */
9709 next = RExC_parse + 1;
9710 while (isDIGIT(*next) || *next == ',') {
9719 if (*next == '}') { /* got one */
9723 min = atoi(RExC_parse);
9727 maxpos = RExC_parse;
9729 if (!max && *maxpos != '0')
9730 max = REG_INFTY; /* meaning "infinity" */
9731 else if (max >= REG_INFTY)
9732 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9734 nextchar(pRExC_state);
9735 if (max < min) { /* If can't match, warn and optimize to fail
9738 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9740 /* We can't back off the size because we have to reserve
9741 * enough space for all the things we are about to throw
9742 * away, but we can shrink it by the ammount we are about
9744 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9747 RExC_emit = orig_emit;
9749 ret = reg_node(pRExC_state, OPFAIL);
9754 if ((flags&SIMPLE)) {
9755 RExC_naughty += 2 + RExC_naughty / 2;
9756 reginsert(pRExC_state, CURLY, ret, depth+1);
9757 Set_Node_Offset(ret, parse_start+1); /* MJD */
9758 Set_Node_Cur_Length(ret, parse_start);
9761 regnode * const w = reg_node(pRExC_state, WHILEM);
9764 REGTAIL(pRExC_state, ret, w);
9765 if (!SIZE_ONLY && RExC_extralen) {
9766 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9767 reginsert(pRExC_state, NOTHING,ret, depth+1);
9768 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9770 reginsert(pRExC_state, CURLYX,ret, depth+1);
9772 Set_Node_Offset(ret, parse_start+1);
9773 Set_Node_Length(ret,
9774 op == '{' ? (RExC_parse - parse_start) : 1);
9776 if (!SIZE_ONLY && RExC_extralen)
9777 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9778 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9780 RExC_whilem_seen++, RExC_extralen += 3;
9781 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9790 ARG1_SET(ret, (U16)min);
9791 ARG2_SET(ret, (U16)max);
9803 #if 0 /* Now runtime fix should be reliable. */
9805 /* if this is reinstated, don't forget to put this back into perldiag:
9807 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9809 (F) The part of the regexp subject to either the * or + quantifier
9810 could match an empty string. The {#} shows in the regular
9811 expression about where the problem was discovered.
9815 if (!(flags&HASWIDTH) && op != '?')
9816 vFAIL("Regexp *+ operand could be empty");
9819 #ifdef RE_TRACK_PATTERN_OFFSETS
9820 parse_start = RExC_parse;
9822 nextchar(pRExC_state);
9824 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9826 if (op == '*' && (flags&SIMPLE)) {
9827 reginsert(pRExC_state, STAR, ret, depth+1);
9831 else if (op == '*') {
9835 else if (op == '+' && (flags&SIMPLE)) {
9836 reginsert(pRExC_state, PLUS, ret, depth+1);
9840 else if (op == '+') {
9844 else if (op == '?') {
9849 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9850 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9851 ckWARN2reg(RExC_parse,
9852 "%"UTF8f" matches null string many times",
9853 UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0),
9855 (void)ReREFCNT_inc(RExC_rx_sv);
9858 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9859 nextchar(pRExC_state);
9860 reginsert(pRExC_state, MINMOD, ret, depth+1);
9861 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9864 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9866 nextchar(pRExC_state);
9867 ender = reg_node(pRExC_state, SUCCEED);
9868 REGTAIL(pRExC_state, ret, ender);
9869 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9871 ender = reg_node(pRExC_state, TAIL);
9872 REGTAIL(pRExC_state, ret, ender);
9875 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9877 vFAIL("Nested quantifiers");
9884 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9885 const bool strict /* Apply stricter parsing rules? */
9889 /* This is expected to be called by a parser routine that has recognized '\N'
9890 and needs to handle the rest. RExC_parse is expected to point at the first
9891 char following the N at the time of the call. On successful return,
9892 RExC_parse has been updated to point to just after the sequence identified
9893 by this routine, and <*flagp> has been updated.
9895 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9898 \N may begin either a named sequence, or if outside a character class, mean
9899 to match a non-newline. For non single-quoted regexes, the tokenizer has
9900 attempted to decide which, and in the case of a named sequence, converted it
9901 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9902 where c1... are the characters in the sequence. For single-quoted regexes,
9903 the tokenizer passes the \N sequence through unchanged; this code will not
9904 attempt to determine this nor expand those, instead raising a syntax error.
9905 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9906 or there is no '}', it signals that this \N occurrence means to match a
9909 Only the \N{U+...} form should occur in a character class, for the same
9910 reason that '.' inside a character class means to just match a period: it
9911 just doesn't make sense.
9913 The function raises an error (via vFAIL), and doesn't return for various
9914 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9915 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9916 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9917 only possible if node_p is non-NULL.
9920 If <valuep> is non-null, it means the caller can accept an input sequence
9921 consisting of a just a single code point; <*valuep> is set to that value
9922 if the input is such.
9924 If <node_p> is non-null it signifies that the caller can accept any other
9925 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9927 1) \N means not-a-NL: points to a newly created REG_ANY node;
9928 2) \N{}: points to a new NOTHING node;
9929 3) otherwise: points to a new EXACT node containing the resolved
9931 Note that FALSE is returned for single code point sequences if <valuep> is
9935 char * endbrace; /* '}' following the name */
9937 char *endchar; /* Points to '.' or '}' ending cur char in the input
9939 bool has_multiple_chars; /* true if the input stream contains a sequence of
9940 more than one character */
9942 GET_RE_DEBUG_FLAGS_DECL;
9944 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9948 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9950 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9951 * modifier. The other meaning does not */
9952 p = (RExC_flags & RXf_PMf_EXTENDED)
9953 ? regwhite( pRExC_state, RExC_parse )
9956 /* Disambiguate between \N meaning a named character versus \N meaning
9957 * [^\n]. The former is assumed when it can't be the latter. */
9958 if (*p != '{' || regcurly(p, FALSE)) {
9961 /* no bare \N in a charclass */
9962 if (in_char_class) {
9963 vFAIL("\\N in a character class must be a named character: \\N{...}");
9967 nextchar(pRExC_state);
9968 *node_p = reg_node(pRExC_state, REG_ANY);
9969 *flagp |= HASWIDTH|SIMPLE;
9972 Set_Node_Length(*node_p, 1); /* MJD */
9976 /* Here, we have decided it should be a named character or sequence */
9978 /* The test above made sure that the next real character is a '{', but
9979 * under the /x modifier, it could be separated by space (or a comment and
9980 * \n) and this is not allowed (for consistency with \x{...} and the
9981 * tokenizer handling of \N{NAME}). */
9982 if (*RExC_parse != '{') {
9983 vFAIL("Missing braces on \\N{}");
9986 RExC_parse++; /* Skip past the '{' */
9988 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9989 || ! (endbrace == RExC_parse /* nothing between the {} */
9990 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9991 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9993 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9994 vFAIL("\\N{NAME} must be resolved by the lexer");
9997 if (endbrace == RExC_parse) { /* empty: \N{} */
10000 *node_p = reg_node(pRExC_state,NOTHING);
10002 else if (in_char_class) {
10003 if (SIZE_ONLY && in_char_class) {
10005 RExC_parse++; /* Position after the "}" */
10006 vFAIL("Zero length \\N{}");
10009 ckWARNreg(RExC_parse,
10010 "Ignoring zero length \\N{} in character class");
10018 nextchar(pRExC_state);
10022 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10023 RExC_parse += 2; /* Skip past the 'U+' */
10025 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10027 /* Code points are separated by dots. If none, there is only one code
10028 * point, and is terminated by the brace */
10029 has_multiple_chars = (endchar < endbrace);
10031 if (valuep && (! has_multiple_chars || in_char_class)) {
10032 /* We only pay attention to the first char of
10033 multichar strings being returned in char classes. I kinda wonder
10034 if this makes sense as it does change the behaviour
10035 from earlier versions, OTOH that behaviour was broken
10036 as well. XXX Solution is to recharacterize as
10037 [rest-of-class]|multi1|multi2... */
10039 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10040 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10041 | PERL_SCAN_DISALLOW_PREFIX
10042 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10044 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10046 /* The tokenizer should have guaranteed validity, but it's possible to
10047 * bypass it by using single quoting, so check */
10048 if (length_of_hex == 0
10049 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10051 RExC_parse += length_of_hex; /* Includes all the valid */
10052 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10053 ? UTF8SKIP(RExC_parse)
10055 /* Guard against malformed utf8 */
10056 if (RExC_parse >= endchar) {
10057 RExC_parse = endchar;
10059 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10062 if (in_char_class && has_multiple_chars) {
10064 RExC_parse = endbrace;
10065 vFAIL("\\N{} in character class restricted to one character");
10068 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10072 RExC_parse = endbrace + 1;
10074 else if (! node_p || ! has_multiple_chars) {
10076 /* Here, the input is legal, but not according to the caller's
10077 * options. We fail without advancing the parse, so that the
10078 * caller can try again */
10084 /* What is done here is to convert this to a sub-pattern of the form
10085 * (?:\x{char1}\x{char2}...)
10086 * and then call reg recursively. That way, it retains its atomicness,
10087 * while not having to worry about special handling that some code
10088 * points may have. toke.c has converted the original Unicode values
10089 * to native, so that we can just pass on the hex values unchanged. We
10090 * do have to set a flag to keep recoding from happening in the
10093 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10095 char *orig_end = RExC_end;
10098 while (RExC_parse < endbrace) {
10100 /* Convert to notation the rest of the code understands */
10101 sv_catpv(substitute_parse, "\\x{");
10102 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10103 sv_catpv(substitute_parse, "}");
10105 /* Point to the beginning of the next character in the sequence. */
10106 RExC_parse = endchar + 1;
10107 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10109 sv_catpv(substitute_parse, ")");
10111 RExC_parse = SvPV(substitute_parse, len);
10113 /* Don't allow empty number */
10115 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10117 RExC_end = RExC_parse + len;
10119 /* The values are Unicode, and therefore not subject to recoding */
10120 RExC_override_recoding = 1;
10122 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10123 if (flags & RESTART_UTF8) {
10124 *flagp = RESTART_UTF8;
10127 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10130 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10132 RExC_parse = endbrace;
10133 RExC_end = orig_end;
10134 RExC_override_recoding = 0;
10136 nextchar(pRExC_state);
10146 * It returns the code point in utf8 for the value in *encp.
10147 * value: a code value in the source encoding
10148 * encp: a pointer to an Encode object
10150 * If the result from Encode is not a single character,
10151 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10154 S_reg_recode(pTHX_ const char value, SV **encp)
10157 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10158 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10159 const STRLEN newlen = SvCUR(sv);
10160 UV uv = UNICODE_REPLACEMENT;
10162 PERL_ARGS_ASSERT_REG_RECODE;
10166 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10169 if (!newlen || numlen != newlen) {
10170 uv = UNICODE_REPLACEMENT;
10176 PERL_STATIC_INLINE U8
10177 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10181 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10187 op = get_regex_charset(RExC_flags);
10188 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10189 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10190 been, so there is no hole */
10193 return op + EXACTF;
10196 PERL_STATIC_INLINE void
10197 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10199 /* This knows the details about sizing an EXACTish node, setting flags for
10200 * it (by setting <*flagp>, and potentially populating it with a single
10203 * If <len> (the length in bytes) is non-zero, this function assumes that
10204 * the node has already been populated, and just does the sizing. In this
10205 * case <code_point> should be the final code point that has already been
10206 * placed into the node. This value will be ignored except that under some
10207 * circumstances <*flagp> is set based on it.
10209 * If <len> is zero, the function assumes that the node is to contain only
10210 * the single character given by <code_point> and calculates what <len>
10211 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10212 * additionally will populate the node's STRING with <code_point>, if <len>
10213 * is 0. In both cases <*flagp> is appropriately set
10215 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10216 * 255, must be folded (the former only when the rules indicate it can
10219 bool len_passed_in = cBOOL(len != 0);
10220 U8 character[UTF8_MAXBYTES_CASE+1];
10222 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10224 if (! len_passed_in) {
10226 if (FOLD && (! LOC || code_point > 255)) {
10227 _to_uni_fold_flags(code_point,
10230 FOLD_FLAGS_FULL | ((LOC)
10231 ? FOLD_FLAGS_LOCALE
10232 : (ASCII_FOLD_RESTRICTED)
10233 ? FOLD_FLAGS_NOMIX_ASCII
10237 uvchr_to_utf8( character, code_point);
10238 len = UTF8SKIP(character);
10242 || code_point != LATIN_SMALL_LETTER_SHARP_S
10243 || ASCII_FOLD_RESTRICTED
10244 || ! AT_LEAST_UNI_SEMANTICS)
10246 *character = (U8) code_point;
10251 *(character + 1) = 's';
10257 RExC_size += STR_SZ(len);
10260 RExC_emit += STR_SZ(len);
10261 STR_LEN(node) = len;
10262 if (! len_passed_in) {
10263 Copy((char *) character, STRING(node), len, char);
10267 *flagp |= HASWIDTH;
10269 /* A single character node is SIMPLE, except for the special-cased SHARP S
10271 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10272 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10273 || ! FOLD || ! DEPENDS_SEMANTICS))
10280 - regatom - the lowest level
10282 Try to identify anything special at the start of the pattern. If there
10283 is, then handle it as required. This may involve generating a single regop,
10284 such as for an assertion; or it may involve recursing, such as to
10285 handle a () structure.
10287 If the string doesn't start with something special then we gobble up
10288 as much literal text as we can.
10290 Once we have been able to handle whatever type of thing started the
10291 sequence, we return.
10293 Note: we have to be careful with escapes, as they can be both literal
10294 and special, and in the case of \10 and friends, context determines which.
10296 A summary of the code structure is:
10298 switch (first_byte) {
10299 cases for each special:
10300 handle this special;
10303 switch (2nd byte) {
10304 cases for each unambiguous special:
10305 handle this special;
10307 cases for each ambigous special/literal:
10309 if (special) handle here
10311 default: // unambiguously literal:
10314 default: // is a literal char
10317 create EXACTish node for literal;
10318 while (more input and node isn't full) {
10319 switch (input_byte) {
10320 cases for each special;
10321 make sure parse pointer is set so that the next call to
10322 regatom will see this special first
10323 goto loopdone; // EXACTish node terminated by prev. char
10325 append char to EXACTISH node;
10327 get next input byte;
10331 return the generated node;
10333 Specifically there are two separate switches for handling
10334 escape sequences, with the one for handling literal escapes requiring
10335 a dummy entry for all of the special escapes that are actually handled
10338 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10340 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10342 Otherwise does not return NULL.
10346 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10349 regnode *ret = NULL;
10351 char *parse_start = RExC_parse;
10355 GET_RE_DEBUG_FLAGS_DECL;
10357 *flagp = WORST; /* Tentatively. */
10359 DEBUG_PARSE("atom");
10361 PERL_ARGS_ASSERT_REGATOM;
10364 switch ((U8)*RExC_parse) {
10366 RExC_seen_zerolen++;
10367 nextchar(pRExC_state);
10368 if (RExC_flags & RXf_PMf_MULTILINE)
10369 ret = reg_node(pRExC_state, MBOL);
10370 else if (RExC_flags & RXf_PMf_SINGLELINE)
10371 ret = reg_node(pRExC_state, SBOL);
10373 ret = reg_node(pRExC_state, BOL);
10374 Set_Node_Length(ret, 1); /* MJD */
10377 nextchar(pRExC_state);
10379 RExC_seen_zerolen++;
10380 if (RExC_flags & RXf_PMf_MULTILINE)
10381 ret = reg_node(pRExC_state, MEOL);
10382 else if (RExC_flags & RXf_PMf_SINGLELINE)
10383 ret = reg_node(pRExC_state, SEOL);
10385 ret = reg_node(pRExC_state, EOL);
10386 Set_Node_Length(ret, 1); /* MJD */
10389 nextchar(pRExC_state);
10390 if (RExC_flags & RXf_PMf_SINGLELINE)
10391 ret = reg_node(pRExC_state, SANY);
10393 ret = reg_node(pRExC_state, REG_ANY);
10394 *flagp |= HASWIDTH|SIMPLE;
10396 Set_Node_Length(ret, 1); /* MJD */
10400 char * const oregcomp_parse = ++RExC_parse;
10401 ret = regclass(pRExC_state, flagp,depth+1,
10402 FALSE, /* means parse the whole char class */
10403 TRUE, /* allow multi-char folds */
10404 FALSE, /* don't silence non-portable warnings. */
10406 if (*RExC_parse != ']') {
10407 RExC_parse = oregcomp_parse;
10408 vFAIL("Unmatched [");
10411 if (*flagp & RESTART_UTF8)
10413 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10416 nextchar(pRExC_state);
10417 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10421 nextchar(pRExC_state);
10422 ret = reg(pRExC_state, 2, &flags,depth+1);
10424 if (flags & TRYAGAIN) {
10425 if (RExC_parse == RExC_end) {
10426 /* Make parent create an empty node if needed. */
10427 *flagp |= TRYAGAIN;
10432 if (flags & RESTART_UTF8) {
10433 *flagp = RESTART_UTF8;
10436 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10438 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10442 if (flags & TRYAGAIN) {
10443 *flagp |= TRYAGAIN;
10446 vFAIL("Internal urp");
10447 /* Supposed to be caught earlier. */
10450 if (!regcurly(RExC_parse, FALSE)) {
10459 vFAIL("Quantifier follows nothing");
10464 This switch handles escape sequences that resolve to some kind
10465 of special regop and not to literal text. Escape sequnces that
10466 resolve to literal text are handled below in the switch marked
10469 Every entry in this switch *must* have a corresponding entry
10470 in the literal escape switch. However, the opposite is not
10471 required, as the default for this switch is to jump to the
10472 literal text handling code.
10474 switch ((U8)*++RExC_parse) {
10476 /* Special Escapes */
10478 RExC_seen_zerolen++;
10479 ret = reg_node(pRExC_state, SBOL);
10481 goto finish_meta_pat;
10483 ret = reg_node(pRExC_state, GPOS);
10484 RExC_seen |= REG_SEEN_GPOS;
10486 goto finish_meta_pat;
10488 RExC_seen_zerolen++;
10489 ret = reg_node(pRExC_state, KEEPS);
10491 /* XXX:dmq : disabling in-place substitution seems to
10492 * be necessary here to avoid cases of memory corruption, as
10493 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10495 RExC_seen |= REG_SEEN_LOOKBEHIND;
10496 goto finish_meta_pat;
10498 ret = reg_node(pRExC_state, SEOL);
10500 RExC_seen_zerolen++; /* Do not optimize RE away */
10501 goto finish_meta_pat;
10503 ret = reg_node(pRExC_state, EOS);
10505 RExC_seen_zerolen++; /* Do not optimize RE away */
10506 goto finish_meta_pat;
10508 ret = reg_node(pRExC_state, CANY);
10509 RExC_seen |= REG_SEEN_CANY;
10510 *flagp |= HASWIDTH|SIMPLE;
10511 goto finish_meta_pat;
10513 ret = reg_node(pRExC_state, CLUMP);
10514 *flagp |= HASWIDTH;
10515 goto finish_meta_pat;
10521 arg = ANYOF_WORDCHAR;
10525 RExC_seen_zerolen++;
10526 RExC_seen |= REG_SEEN_LOOKBEHIND;
10527 op = BOUND + get_regex_charset(RExC_flags);
10528 if (op > BOUNDA) { /* /aa is same as /a */
10531 ret = reg_node(pRExC_state, op);
10532 FLAGS(ret) = get_regex_charset(RExC_flags);
10534 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10535 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10537 goto finish_meta_pat;
10539 RExC_seen_zerolen++;
10540 RExC_seen |= REG_SEEN_LOOKBEHIND;
10541 op = NBOUND + get_regex_charset(RExC_flags);
10542 if (op > NBOUNDA) { /* /aa is same as /a */
10545 ret = reg_node(pRExC_state, op);
10546 FLAGS(ret) = get_regex_charset(RExC_flags);
10548 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10549 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10551 goto finish_meta_pat;
10561 ret = reg_node(pRExC_state, LNBREAK);
10562 *flagp |= HASWIDTH|SIMPLE;
10563 goto finish_meta_pat;
10571 goto join_posix_op_known;
10577 arg = ANYOF_VERTWS;
10579 goto join_posix_op_known;
10589 op = POSIXD + get_regex_charset(RExC_flags);
10590 if (op > POSIXA) { /* /aa is same as /a */
10594 join_posix_op_known:
10597 op += NPOSIXD - POSIXD;
10600 ret = reg_node(pRExC_state, op);
10602 FLAGS(ret) = namedclass_to_classnum(arg);
10605 *flagp |= HASWIDTH|SIMPLE;
10609 nextchar(pRExC_state);
10610 Set_Node_Length(ret, 2); /* MJD */
10616 char* parse_start = RExC_parse - 2;
10621 ret = regclass(pRExC_state, flagp,depth+1,
10622 TRUE, /* means just parse this element */
10623 FALSE, /* don't allow multi-char folds */
10624 FALSE, /* don't silence non-portable warnings.
10625 It would be a bug if these returned
10628 /* regclass() can only return RESTART_UTF8 if multi-char folds
10631 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10636 Set_Node_Offset(ret, parse_start + 2);
10637 Set_Node_Cur_Length(ret, parse_start);
10638 nextchar(pRExC_state);
10642 /* Handle \N and \N{NAME} with multiple code points here and not
10643 * below because it can be multicharacter. join_exact() will join
10644 * them up later on. Also this makes sure that things like
10645 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10646 * The options to the grok function call causes it to fail if the
10647 * sequence is just a single code point. We then go treat it as
10648 * just another character in the current EXACT node, and hence it
10649 * gets uniform treatment with all the other characters. The
10650 * special treatment for quantifiers is not needed for such single
10651 * character sequences */
10653 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10654 FALSE /* not strict */ )) {
10655 if (*flagp & RESTART_UTF8)
10661 case 'k': /* Handle \k<NAME> and \k'NAME' */
10664 char ch= RExC_parse[1];
10665 if (ch != '<' && ch != '\'' && ch != '{') {
10667 vFAIL2("Sequence %.2s... not terminated",parse_start);
10669 /* this pretty much dupes the code for (?P=...) in reg(), if
10670 you change this make sure you change that */
10671 char* name_start = (RExC_parse += 2);
10673 SV *sv_dat = reg_scan_name(pRExC_state,
10674 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10675 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10676 if (RExC_parse == name_start || *RExC_parse != ch)
10677 vFAIL2("Sequence %.3s... not terminated",parse_start);
10680 num = add_data( pRExC_state, 1, "S" );
10681 RExC_rxi->data->data[num]=(void*)sv_dat;
10682 SvREFCNT_inc_simple_void(sv_dat);
10686 ret = reganode(pRExC_state,
10689 : (ASCII_FOLD_RESTRICTED)
10691 : (AT_LEAST_UNI_SEMANTICS)
10697 *flagp |= HASWIDTH;
10699 /* override incorrect value set in reganode MJD */
10700 Set_Node_Offset(ret, parse_start+1);
10701 Set_Node_Cur_Length(ret, parse_start);
10702 nextchar(pRExC_state);
10708 case '1': case '2': case '3': case '4':
10709 case '5': case '6': case '7': case '8': case '9':
10712 bool isg = *RExC_parse == 'g';
10717 if (*RExC_parse == '{') {
10721 if (*RExC_parse == '-') {
10725 if (hasbrace && !isDIGIT(*RExC_parse)) {
10726 if (isrel) RExC_parse--;
10728 goto parse_named_seq;
10730 num = atoi(RExC_parse);
10731 if (isg && num == 0) {
10732 if (*RExC_parse == '0') {
10733 vFAIL("Reference to invalid group 0");
10736 vFAIL("Unterminated \\g... pattern");
10740 num = RExC_npar - num;
10742 vFAIL("Reference to nonexistent or unclosed group");
10744 if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10745 /* Probably a character specified in octal, e.g. \35 */
10748 #ifdef RE_TRACK_PATTERN_OFFSETS
10749 char * const parse_start = RExC_parse - 1; /* MJD */
10751 while (isDIGIT(*RExC_parse))
10754 if (*RExC_parse != '}')
10755 vFAIL("Unterminated \\g{...} pattern");
10759 if (num > (I32)RExC_rx->nparens)
10760 vFAIL("Reference to nonexistent group");
10763 ret = reganode(pRExC_state,
10766 : (ASCII_FOLD_RESTRICTED)
10768 : (AT_LEAST_UNI_SEMANTICS)
10774 *flagp |= HASWIDTH;
10776 /* override incorrect value set in reganode MJD */
10777 Set_Node_Offset(ret, parse_start+1);
10778 Set_Node_Cur_Length(ret, parse_start);
10780 nextchar(pRExC_state);
10785 if (RExC_parse >= RExC_end)
10786 FAIL("Trailing \\");
10789 /* Do not generate "unrecognized" warnings here, we fall
10790 back into the quick-grab loop below */
10797 if (RExC_flags & RXf_PMf_EXTENDED) {
10798 if ( reg_skipcomment( pRExC_state ) )
10805 parse_start = RExC_parse - 1;
10814 #define MAX_NODE_STRING_SIZE 127
10815 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10817 U8 upper_parse = MAX_NODE_STRING_SIZE;
10819 U8 node_type = compute_EXACTish(pRExC_state);
10820 bool next_is_quantifier;
10821 char * oldp = NULL;
10823 /* We can convert EXACTF nodes to EXACTFU if they contain only
10824 * characters that match identically regardless of the target
10825 * string's UTF8ness. The reason to do this is that EXACTF is not
10826 * trie-able, EXACTFU is. (We don't need to figure this out until
10828 bool maybe_exactfu = node_type == EXACTF && PASS2;
10830 /* If a folding node contains only code points that don't
10831 * participate in folds, it can be changed into an EXACT node,
10832 * which allows the optimizer more things to look for */
10835 ret = reg_node(pRExC_state, node_type);
10837 /* In pass1, folded, we use a temporary buffer instead of the
10838 * actual node, as the node doesn't exist yet */
10839 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10845 /* We do the EXACTFish to EXACT node only if folding, and not if in
10846 * locale, as whether a character folds or not isn't known until
10847 * runtime. (And we don't need to figure this out until pass 2) */
10848 maybe_exact = FOLD && ! LOC && PASS2;
10850 /* XXX The node can hold up to 255 bytes, yet this only goes to
10851 * 127. I (khw) do not know why. Keeping it somewhat less than
10852 * 255 allows us to not have to worry about overflow due to
10853 * converting to utf8 and fold expansion, but that value is
10854 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10855 * split up by this limit into a single one using the real max of
10856 * 255. Even at 127, this breaks under rare circumstances. If
10857 * folding, we do not want to split a node at a character that is a
10858 * non-final in a multi-char fold, as an input string could just
10859 * happen to want to match across the node boundary. The join
10860 * would solve that problem if the join actually happens. But a
10861 * series of more than two nodes in a row each of 127 would cause
10862 * the first join to succeed to get to 254, but then there wouldn't
10863 * be room for the next one, which could at be one of those split
10864 * multi-char folds. I don't know of any fool-proof solution. One
10865 * could back off to end with only a code point that isn't such a
10866 * non-final, but it is possible for there not to be any in the
10868 for (p = RExC_parse - 1;
10869 len < upper_parse && p < RExC_end;
10874 if (RExC_flags & RXf_PMf_EXTENDED)
10875 p = regwhite( pRExC_state, p );
10886 /* Literal Escapes Switch
10888 This switch is meant to handle escape sequences that
10889 resolve to a literal character.
10891 Every escape sequence that represents something
10892 else, like an assertion or a char class, is handled
10893 in the switch marked 'Special Escapes' above in this
10894 routine, but also has an entry here as anything that
10895 isn't explicitly mentioned here will be treated as
10896 an unescaped equivalent literal.
10899 switch ((U8)*++p) {
10900 /* These are all the special escapes. */
10901 case 'A': /* Start assertion */
10902 case 'b': case 'B': /* Word-boundary assertion*/
10903 case 'C': /* Single char !DANGEROUS! */
10904 case 'd': case 'D': /* digit class */
10905 case 'g': case 'G': /* generic-backref, pos assertion */
10906 case 'h': case 'H': /* HORIZWS */
10907 case 'k': case 'K': /* named backref, keep marker */
10908 case 'p': case 'P': /* Unicode property */
10909 case 'R': /* LNBREAK */
10910 case 's': case 'S': /* space class */
10911 case 'v': case 'V': /* VERTWS */
10912 case 'w': case 'W': /* word class */
10913 case 'X': /* eXtended Unicode "combining character sequence" */
10914 case 'z': case 'Z': /* End of line/string assertion */
10918 /* Anything after here is an escape that resolves to a
10919 literal. (Except digits, which may or may not)
10925 case 'N': /* Handle a single-code point named character. */
10926 /* The options cause it to fail if a multiple code
10927 * point sequence. Handle those in the switch() above
10929 RExC_parse = p + 1;
10930 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10931 flagp, depth, FALSE,
10932 FALSE /* not strict */ ))
10934 if (*flagp & RESTART_UTF8)
10935 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10936 RExC_parse = p = oldp;
10940 if (ender > 0xff) {
10957 ender = ASCII_TO_NATIVE('\033');
10967 const char* error_msg;
10969 bool valid = grok_bslash_o(&p,
10972 TRUE, /* out warnings */
10973 FALSE, /* not strict */
10974 TRUE, /* Output warnings
10979 RExC_parse = p; /* going to die anyway; point
10980 to exact spot of failure */
10984 if (PL_encoding && ender < 0x100) {
10985 goto recode_encoding;
10987 if (ender > 0xff) {
10994 UV result = UV_MAX; /* initialize to erroneous
10996 const char* error_msg;
10998 bool valid = grok_bslash_x(&p,
11001 TRUE, /* out warnings */
11002 FALSE, /* not strict */
11003 TRUE, /* Output warnings
11008 RExC_parse = p; /* going to die anyway; point
11009 to exact spot of failure */
11014 if (PL_encoding && ender < 0x100) {
11015 goto recode_encoding;
11017 if (ender > 0xff) {
11024 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
11026 case '8': case '9': /* must be a backreference */
11029 case '1': case '2': case '3':case '4':
11030 case '5': case '6': case '7':
11031 /* When we parse backslash escapes there is ambiguity
11032 * between backreferences and octal escapes. Any escape
11033 * from \1 - \9 is a backreference, any multi-digit
11034 * escape which does not start with 0 and which when
11035 * evaluated as decimal could refer to an already
11036 * parsed capture buffer is a backslash. Anything else
11039 * Note this implies that \118 could be interpreted as
11040 * 118 OR as "\11" . "8" depending on whether there
11041 * were 118 capture buffers defined already in the
11043 if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
11044 { /* Not to be treated as an octal constant, go
11051 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11053 ender = grok_oct(p, &numlen, &flags, NULL);
11054 if (ender > 0xff) {
11058 if (SIZE_ONLY /* like \08, \178 */
11061 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11063 reg_warn_non_literal_string(
11065 form_short_octal_warning(p, numlen));
11068 if (PL_encoding && ender < 0x100)
11069 goto recode_encoding;
11072 if (! RExC_override_recoding) {
11073 SV* enc = PL_encoding;
11074 ender = reg_recode((const char)(U8)ender, &enc);
11075 if (!enc && SIZE_ONLY)
11076 ckWARNreg(p, "Invalid escape in the specified encoding");
11082 FAIL("Trailing \\");
11085 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11086 /* Include any { following the alpha to emphasize
11087 * that it could be part of an escape at some point
11089 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11090 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11092 goto normal_default;
11093 } /* End of switch on '\' */
11095 default: /* A literal character */
11098 && RExC_flags & RXf_PMf_EXTENDED
11099 && ckWARN_d(WARN_DEPRECATED)
11100 && is_PATWS_non_low(p, UTF))
11102 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11103 "Escape literal pattern white space under /x");
11107 if (UTF8_IS_START(*p) && UTF) {
11109 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11110 &numlen, UTF8_ALLOW_DEFAULT);
11116 } /* End of switch on the literal */
11118 /* Here, have looked at the literal character and <ender>
11119 * contains its ordinal, <p> points to the character after it
11122 if ( RExC_flags & RXf_PMf_EXTENDED)
11123 p = regwhite( pRExC_state, p );
11125 /* If the next thing is a quantifier, it applies to this
11126 * character only, which means that this character has to be in
11127 * its own node and can't just be appended to the string in an
11128 * existing node, so if there are already other characters in
11129 * the node, close the node with just them, and set up to do
11130 * this character again next time through, when it will be the
11131 * only thing in its new node */
11132 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11140 const STRLEN unilen = reguni(pRExC_state, ender, s);
11146 /* The loop increments <len> each time, as all but this
11147 * path (and one other) through it add a single byte to
11148 * the EXACTish node. But this one has changed len to
11149 * be the correct final value, so subtract one to
11150 * cancel out the increment that follows */
11154 REGC((char)ender, s++);
11157 else /* FOLD */ if (! ( UTF
11158 /* See comments for join_exact() as to why we fold this
11159 * non-UTF at compile time */
11160 || (node_type == EXACTFU
11161 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11163 if (IS_IN_SOME_FOLD_L1(ender)) {
11164 maybe_exact = FALSE;
11166 /* See if the character's fold differs between /d and
11167 * /u. This includes the multi-char fold SHARP S to
11170 && (PL_fold[ender] != PL_fold_latin1[ender]
11171 || ender == LATIN_SMALL_LETTER_SHARP_S
11173 && isARG2_lower_or_UPPER_ARG1('s', ender)
11174 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11176 maybe_exactfu = FALSE;
11179 *(s++) = (char) ender;
11183 /* Prime the casefolded buffer. Locale rules, which apply
11184 * only to code points < 256, aren't known until execution,
11185 * so for them, just output the original character using
11186 * utf8. If we start to fold non-UTF patterns, be sure to
11187 * update join_exact() */
11188 if (LOC && ender < 256) {
11189 if (UVCHR_IS_INVARIANT(ender)) {
11193 *s = UTF8_TWO_BYTE_HI(ender);
11194 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11199 UV folded = _to_uni_fold_flags(
11204 | ((LOC) ? FOLD_FLAGS_LOCALE
11205 : (ASCII_FOLD_RESTRICTED)
11206 ? FOLD_FLAGS_NOMIX_ASCII
11210 /* If this node only contains non-folding code points
11211 * so far, see if this new one is also non-folding */
11213 if (folded != ender) {
11214 maybe_exact = FALSE;
11217 /* Here the fold is the original; we have
11218 * to check further to see if anything
11220 if (! PL_utf8_foldable) {
11221 SV* swash = swash_init("utf8",
11223 &PL_sv_undef, 1, 0);
11225 _get_swash_invlist(swash);
11226 SvREFCNT_dec_NN(swash);
11228 if (_invlist_contains_cp(PL_utf8_foldable,
11231 maybe_exact = FALSE;
11239 /* The loop increments <len> each time, as all but this
11240 * path (and one other) through it add a single byte to the
11241 * EXACTish node. But this one has changed len to be the
11242 * correct final value, so subtract one to cancel out the
11243 * increment that follows */
11244 len += foldlen - 1;
11247 if (next_is_quantifier) {
11249 /* Here, the next input is a quantifier, and to get here,
11250 * the current character is the only one in the node.
11251 * Also, here <len> doesn't include the final byte for this
11257 } /* End of loop through literal characters */
11259 /* Here we have either exhausted the input or ran out of room in
11260 * the node. (If we encountered a character that can't be in the
11261 * node, transfer is made directly to <loopdone>, and so we
11262 * wouldn't have fallen off the end of the loop.) In the latter
11263 * case, we artificially have to split the node into two, because
11264 * we just don't have enough space to hold everything. This
11265 * creates a problem if the final character participates in a
11266 * multi-character fold in the non-final position, as a match that
11267 * should have occurred won't, due to the way nodes are matched,
11268 * and our artificial boundary. So back off until we find a non-
11269 * problematic character -- one that isn't at the beginning or
11270 * middle of such a fold. (Either it doesn't participate in any
11271 * folds, or appears only in the final position of all the folds it
11272 * does participate in.) A better solution with far fewer false
11273 * positives, and that would fill the nodes more completely, would
11274 * be to actually have available all the multi-character folds to
11275 * test against, and to back-off only far enough to be sure that
11276 * this node isn't ending with a partial one. <upper_parse> is set
11277 * further below (if we need to reparse the node) to include just
11278 * up through that final non-problematic character that this code
11279 * identifies, so when it is set to less than the full node, we can
11280 * skip the rest of this */
11281 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11283 const STRLEN full_len = len;
11285 assert(len >= MAX_NODE_STRING_SIZE);
11287 /* Here, <s> points to the final byte of the final character.
11288 * Look backwards through the string until find a non-
11289 * problematic character */
11293 /* These two have no multi-char folds to non-UTF characters
11295 if (ASCII_FOLD_RESTRICTED || LOC) {
11299 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11303 if (! PL_NonL1NonFinalFold) {
11304 PL_NonL1NonFinalFold = _new_invlist_C_array(
11305 NonL1_Perl_Non_Final_Folds_invlist);
11308 /* Point to the first byte of the final character */
11309 s = (char *) utf8_hop((U8 *) s, -1);
11311 while (s >= s0) { /* Search backwards until find
11312 non-problematic char */
11313 if (UTF8_IS_INVARIANT(*s)) {
11315 /* There are no ascii characters that participate
11316 * in multi-char folds under /aa. In EBCDIC, the
11317 * non-ascii invariants are all control characters,
11318 * so don't ever participate in any folds. */
11319 if (ASCII_FOLD_RESTRICTED
11320 || ! IS_NON_FINAL_FOLD(*s))
11325 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11327 /* No Latin1 characters participate in multi-char
11328 * folds under /l */
11330 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11336 else if (! _invlist_contains_cp(
11337 PL_NonL1NonFinalFold,
11338 valid_utf8_to_uvchr((U8 *) s, NULL)))
11343 /* Here, the current character is problematic in that
11344 * it does occur in the non-final position of some
11345 * fold, so try the character before it, but have to
11346 * special case the very first byte in the string, so
11347 * we don't read outside the string */
11348 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11349 } /* End of loop backwards through the string */
11351 /* If there were only problematic characters in the string,
11352 * <s> will point to before s0, in which case the length
11353 * should be 0, otherwise include the length of the
11354 * non-problematic character just found */
11355 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11358 /* Here, have found the final character, if any, that is
11359 * non-problematic as far as ending the node without splitting
11360 * it across a potential multi-char fold. <len> contains the
11361 * number of bytes in the node up-to and including that
11362 * character, or is 0 if there is no such character, meaning
11363 * the whole node contains only problematic characters. In
11364 * this case, give up and just take the node as-is. We can't
11369 /* If the node ends in an 's' we make sure it stays EXACTF,
11370 * as if it turns into an EXACTFU, it could later get
11371 * joined with another 's' that would then wrongly match
11373 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11375 maybe_exactfu = FALSE;
11379 /* Here, the node does contain some characters that aren't
11380 * problematic. If one such is the final character in the
11381 * node, we are done */
11382 if (len == full_len) {
11385 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11387 /* If the final character is problematic, but the
11388 * penultimate is not, back-off that last character to
11389 * later start a new node with it */
11394 /* Here, the final non-problematic character is earlier
11395 * in the input than the penultimate character. What we do
11396 * is reparse from the beginning, going up only as far as
11397 * this final ok one, thus guaranteeing that the node ends
11398 * in an acceptable character. The reason we reparse is
11399 * that we know how far in the character is, but we don't
11400 * know how to correlate its position with the input parse.
11401 * An alternate implementation would be to build that
11402 * correlation as we go along during the original parse,
11403 * but that would entail extra work for every node, whereas
11404 * this code gets executed only when the string is too
11405 * large for the node, and the final two characters are
11406 * problematic, an infrequent occurrence. Yet another
11407 * possible strategy would be to save the tail of the
11408 * string, and the next time regatom is called, initialize
11409 * with that. The problem with this is that unless you
11410 * back off one more character, you won't be guaranteed
11411 * regatom will get called again, unless regbranch,
11412 * regpiece ... are also changed. If you do back off that
11413 * extra character, so that there is input guaranteed to
11414 * force calling regatom, you can't handle the case where
11415 * just the first character in the node is acceptable. I
11416 * (khw) decided to try this method which doesn't have that
11417 * pitfall; if performance issues are found, we can do a
11418 * combination of the current approach plus that one */
11424 } /* End of verifying node ends with an appropriate char */
11426 loopdone: /* Jumped to when encounters something that shouldn't be in
11429 /* I (khw) don't know if you can get here with zero length, but the
11430 * old code handled this situation by creating a zero-length EXACT
11431 * node. Might as well be NOTHING instead */
11437 /* If 'maybe_exact' is still set here, means there are no
11438 * code points in the node that participate in folds;
11439 * similarly for 'maybe_exactfu' and code points that match
11440 * differently depending on UTF8ness of the target string
11445 else if (maybe_exactfu) {
11449 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11452 RExC_parse = p - 1;
11453 Set_Node_Cur_Length(ret, parse_start);
11454 nextchar(pRExC_state);
11456 /* len is STRLEN which is unsigned, need to copy to signed */
11459 vFAIL("Internal disaster");
11462 } /* End of label 'defchar:' */
11464 } /* End of giant switch on input character */
11470 S_regwhite( RExC_state_t *pRExC_state, char *p )
11472 const char *e = RExC_end;
11474 PERL_ARGS_ASSERT_REGWHITE;
11479 else if (*p == '#') {
11482 if (*p++ == '\n') {
11488 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11497 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11499 /* Returns the next non-pattern-white space, non-comment character (the
11500 * latter only if 'recognize_comment is true) in the string p, which is
11501 * ended by RExC_end. If there is no line break ending a comment,
11502 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11503 const char *e = RExC_end;
11505 PERL_ARGS_ASSERT_REGPATWS;
11509 if ((len = is_PATWS_safe(p, e, UTF))) {
11512 else if (recognize_comment && *p == '#') {
11516 if (is_LNBREAK_safe(p, e, UTF)) {
11522 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11530 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11531 Character classes ([:foo:]) can also be negated ([:^foo:]).
11532 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11533 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11534 but trigger failures because they are currently unimplemented. */
11536 #define POSIXCC_DONE(c) ((c) == ':')
11537 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11538 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11540 PERL_STATIC_INLINE I32
11541 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11544 I32 namedclass = OOB_NAMEDCLASS;
11546 PERL_ARGS_ASSERT_REGPPOSIXCC;
11548 if (value == '[' && RExC_parse + 1 < RExC_end &&
11549 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11550 POSIXCC(UCHARAT(RExC_parse)))
11552 const char c = UCHARAT(RExC_parse);
11553 char* const s = RExC_parse++;
11555 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11557 if (RExC_parse == RExC_end) {
11560 /* Try to give a better location for the error (than the end of
11561 * the string) by looking for the matching ']' */
11563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11566 vFAIL2("Unmatched '%c' in POSIX class", c);
11568 /* Grandfather lone [:, [=, [. */
11572 const char* const t = RExC_parse++; /* skip over the c */
11575 if (UCHARAT(RExC_parse) == ']') {
11576 const char *posixcc = s + 1;
11577 RExC_parse++; /* skip over the ending ] */
11580 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11581 const I32 skip = t - posixcc;
11583 /* Initially switch on the length of the name. */
11586 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11587 this is the Perl \w
11589 namedclass = ANYOF_WORDCHAR;
11592 /* Names all of length 5. */
11593 /* alnum alpha ascii blank cntrl digit graph lower
11594 print punct space upper */
11595 /* Offset 4 gives the best switch position. */
11596 switch (posixcc[4]) {
11598 if (memEQ(posixcc, "alph", 4)) /* alpha */
11599 namedclass = ANYOF_ALPHA;
11602 if (memEQ(posixcc, "spac", 4)) /* space */
11603 namedclass = ANYOF_PSXSPC;
11606 if (memEQ(posixcc, "grap", 4)) /* graph */
11607 namedclass = ANYOF_GRAPH;
11610 if (memEQ(posixcc, "asci", 4)) /* ascii */
11611 namedclass = ANYOF_ASCII;
11614 if (memEQ(posixcc, "blan", 4)) /* blank */
11615 namedclass = ANYOF_BLANK;
11618 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11619 namedclass = ANYOF_CNTRL;
11622 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11623 namedclass = ANYOF_ALPHANUMERIC;
11626 if (memEQ(posixcc, "lowe", 4)) /* lower */
11627 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11628 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11629 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11632 if (memEQ(posixcc, "digi", 4)) /* digit */
11633 namedclass = ANYOF_DIGIT;
11634 else if (memEQ(posixcc, "prin", 4)) /* print */
11635 namedclass = ANYOF_PRINT;
11636 else if (memEQ(posixcc, "punc", 4)) /* punct */
11637 namedclass = ANYOF_PUNCT;
11642 if (memEQ(posixcc, "xdigit", 6))
11643 namedclass = ANYOF_XDIGIT;
11647 if (namedclass == OOB_NAMEDCLASS)
11649 "POSIX class [:%"UTF8f":] unknown",
11650 UTF8fARG(UTF, t - s - 1, s + 1));
11652 /* The #defines are structured so each complement is +1 to
11653 * the normal one */
11657 assert (posixcc[skip] == ':');
11658 assert (posixcc[skip+1] == ']');
11659 } else if (!SIZE_ONLY) {
11660 /* [[=foo=]] and [[.foo.]] are still future. */
11662 /* adjust RExC_parse so the warning shows after
11663 the class closes */
11664 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11666 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11669 /* Maternal grandfather:
11670 * "[:" ending in ":" but not in ":]" */
11672 vFAIL("Unmatched '[' in POSIX class");
11675 /* Grandfather lone [:, [=, [. */
11685 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11687 /* This applies some heuristics at the current parse position (which should
11688 * be at a '[') to see if what follows might be intended to be a [:posix:]
11689 * class. It returns true if it really is a posix class, of course, but it
11690 * also can return true if it thinks that what was intended was a posix
11691 * class that didn't quite make it.
11693 * It will return true for
11695 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11696 * ')' indicating the end of the (?[
11697 * [:any garbage including %^&$ punctuation:]
11699 * This is designed to be called only from S_handle_regex_sets; it could be
11700 * easily adapted to be called from the spot at the beginning of regclass()
11701 * that checks to see in a normal bracketed class if the surrounding []
11702 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11703 * change long-standing behavior, so I (khw) didn't do that */
11704 char* p = RExC_parse + 1;
11705 char first_char = *p;
11707 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11709 assert(*(p - 1) == '[');
11711 if (! POSIXCC(first_char)) {
11716 while (p < RExC_end && isWORDCHAR(*p)) p++;
11718 if (p >= RExC_end) {
11722 if (p - RExC_parse > 2 /* Got at least 1 word character */
11723 && (*p == first_char
11724 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11729 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11732 && p - RExC_parse > 2 /* [:] evaluates to colon;
11733 [::] is a bad posix class. */
11734 && first_char == *(p - 1));
11738 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11739 char * const oregcomp_parse)
11741 /* Handle the (?[...]) construct to do set operations */
11744 UV start, end; /* End points of code point ranges */
11746 char *save_end, *save_parse;
11751 const bool save_fold = FOLD;
11753 GET_RE_DEBUG_FLAGS_DECL;
11755 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11758 vFAIL("(?[...]) not valid in locale");
11760 RExC_uni_semantics = 1;
11762 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11763 * (such as EXACT). Thus we can skip most everything if just sizing. We
11764 * call regclass to handle '[]' so as to not have to reinvent its parsing
11765 * rules here (throwing away the size it computes each time). And, we exit
11766 * upon an unescaped ']' that isn't one ending a regclass. To do both
11767 * these things, we need to realize that something preceded by a backslash
11768 * is escaped, so we have to keep track of backslashes */
11770 UV depth = 0; /* how many nested (?[...]) constructs */
11772 Perl_ck_warner_d(aTHX_
11773 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11774 "The regex_sets feature is experimental" REPORT_LOCATION,
11775 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
11776 UTF8fARG(UTF, RExC_end - RExC_start - (RExC_parse - RExC_precomp), RExC_precomp + (RExC_parse - RExC_precomp)));
11778 while (RExC_parse < RExC_end) {
11779 SV* current = NULL;
11780 RExC_parse = regpatws(pRExC_state, RExC_parse,
11781 TRUE); /* means recognize comments */
11782 switch (*RExC_parse) {
11784 if (RExC_parse[1] == '[') depth++, RExC_parse++;
11789 /* Skip the next byte (which could cause us to end up in
11790 * the middle of a UTF-8 character, but since none of those
11791 * are confusable with anything we currently handle in this
11792 * switch (invariants all), it's safe. We'll just hit the
11793 * default: case next time and keep on incrementing until
11794 * we find one of the invariants we do handle. */
11799 /* If this looks like it is a [:posix:] class, leave the
11800 * parse pointer at the '[' to fool regclass() into
11801 * thinking it is part of a '[[:posix:]]'. That function
11802 * will use strict checking to force a syntax error if it
11803 * doesn't work out to a legitimate class */
11804 bool is_posix_class
11805 = could_it_be_a_POSIX_class(pRExC_state);
11806 if (! is_posix_class) {
11810 /* regclass() can only return RESTART_UTF8 if multi-char
11811 folds are allowed. */
11812 if (!regclass(pRExC_state, flagp,depth+1,
11813 is_posix_class, /* parse the whole char
11814 class only if not a
11816 FALSE, /* don't allow multi-char folds */
11817 TRUE, /* silence non-portable warnings. */
11819 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11822 /* function call leaves parse pointing to the ']', except
11823 * if we faked it */
11824 if (is_posix_class) {
11828 SvREFCNT_dec(current); /* In case it returned something */
11833 if (depth--) break;
11835 if (RExC_parse < RExC_end
11836 && *RExC_parse == ')')
11838 node = reganode(pRExC_state, ANYOF, 0);
11839 RExC_size += ANYOF_SKIP;
11840 nextchar(pRExC_state);
11841 Set_Node_Length(node,
11842 RExC_parse - oregcomp_parse + 1); /* MJD */
11851 FAIL("Syntax error in (?[...])");
11854 /* Pass 2 only after this. Everything in this construct is a
11855 * metacharacter. Operands begin with either a '\' (for an escape
11856 * sequence), or a '[' for a bracketed character class. Any other
11857 * character should be an operator, or parenthesis for grouping. Both
11858 * types of operands are handled by calling regclass() to parse them. It
11859 * is called with a parameter to indicate to return the computed inversion
11860 * list. The parsing here is implemented via a stack. Each entry on the
11861 * stack is a single character representing one of the operators, or the
11862 * '('; or else a pointer to an operand inversion list. */
11864 #define IS_OPERAND(a) (! SvIOK(a))
11866 /* The stack starts empty. It is a syntax error if the first thing parsed
11867 * is a binary operator; everything else is pushed on the stack. When an
11868 * operand is parsed, the top of the stack is examined. If it is a binary
11869 * operator, the item before it should be an operand, and both are replaced
11870 * by the result of doing that operation on the new operand and the one on
11871 * the stack. Thus a sequence of binary operands is reduced to a single
11872 * one before the next one is parsed.
11874 * A unary operator may immediately follow a binary in the input, for
11877 * When an operand is parsed and the top of the stack is a unary operator,
11878 * the operation is performed, and then the stack is rechecked to see if
11879 * this new operand is part of a binary operation; if so, it is handled as
11882 * A '(' is simply pushed on the stack; it is valid only if the stack is
11883 * empty, or the top element of the stack is an operator or another '('
11884 * (for which the parenthesized expression will become an operand). By the
11885 * time the corresponding ')' is parsed everything in between should have
11886 * been parsed and evaluated to a single operand (or else is a syntax
11887 * error), and is handled as a regular operand */
11889 sv_2mortal((SV *)(stack = newAV()));
11891 while (RExC_parse < RExC_end) {
11892 I32 top_index = av_tindex(stack);
11894 SV* current = NULL;
11896 /* Skip white space */
11897 RExC_parse = regpatws(pRExC_state, RExC_parse,
11898 TRUE); /* means recognize comments */
11899 if (RExC_parse >= RExC_end) {
11900 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11902 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11909 if (av_tindex(stack) >= 0 /* This makes sure that we can
11910 safely subtract 1 from
11911 RExC_parse in the next clause.
11912 If we have something on the
11913 stack, we have parsed something
11915 && UCHARAT(RExC_parse - 1) == '('
11916 && RExC_parse < RExC_end)
11918 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11919 * This happens when we have some thing like
11921 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11923 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11925 * Here we would be handling the interpolated
11926 * '$thai_or_lao'. We handle this by a recursive call to
11927 * ourselves which returns the inversion list the
11928 * interpolated expression evaluates to. We use the flags
11929 * from the interpolated pattern. */
11930 U32 save_flags = RExC_flags;
11931 const char * const save_parse = ++RExC_parse;
11933 parse_lparen_question_flags(pRExC_state);
11935 if (RExC_parse == save_parse /* Makes sure there was at
11936 least one flag (or this
11937 embedding wasn't compiled)
11939 || RExC_parse >= RExC_end - 4
11940 || UCHARAT(RExC_parse) != ':'
11941 || UCHARAT(++RExC_parse) != '('
11942 || UCHARAT(++RExC_parse) != '?'
11943 || UCHARAT(++RExC_parse) != '[')
11946 /* In combination with the above, this moves the
11947 * pointer to the point just after the first erroneous
11948 * character (or if there are no flags, to where they
11949 * should have been) */
11950 if (RExC_parse >= RExC_end - 4) {
11951 RExC_parse = RExC_end;
11953 else if (RExC_parse != save_parse) {
11954 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11956 vFAIL("Expecting '(?flags:(?[...'");
11959 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11960 depth+1, oregcomp_parse);
11962 /* Here, 'current' contains the embedded expression's
11963 * inversion list, and RExC_parse points to the trailing
11964 * ']'; the next character should be the ')' which will be
11965 * paired with the '(' that has been put on the stack, so
11966 * the whole embedded expression reduces to '(operand)' */
11969 RExC_flags = save_flags;
11970 goto handle_operand;
11975 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11976 vFAIL("Unexpected character");
11979 /* regclass() can only return RESTART_UTF8 if multi-char
11980 folds are allowed. */
11981 if (!regclass(pRExC_state, flagp,depth+1,
11982 TRUE, /* means parse just the next thing */
11983 FALSE, /* don't allow multi-char folds */
11984 FALSE, /* don't silence non-portable warnings. */
11986 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11988 /* regclass() will return with parsing just the \ sequence,
11989 * leaving the parse pointer at the next thing to parse */
11991 goto handle_operand;
11993 case '[': /* Is a bracketed character class */
11995 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11997 if (! is_posix_class) {
12001 /* regclass() can only return RESTART_UTF8 if multi-char
12002 folds are allowed. */
12003 if(!regclass(pRExC_state, flagp,depth+1,
12004 is_posix_class, /* parse the whole char class
12005 only if not a posix class */
12006 FALSE, /* don't allow multi-char folds */
12007 FALSE, /* don't silence non-portable warnings. */
12009 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12011 /* function call leaves parse pointing to the ']', except if we
12013 if (is_posix_class) {
12017 goto handle_operand;
12026 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12027 || ! IS_OPERAND(*top_ptr))
12030 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12032 av_push(stack, newSVuv(curchar));
12036 av_push(stack, newSVuv(curchar));
12040 if (top_index >= 0) {
12041 top_ptr = av_fetch(stack, top_index, FALSE);
12043 if (IS_OPERAND(*top_ptr)) {
12045 vFAIL("Unexpected '(' with no preceding operator");
12048 av_push(stack, newSVuv(curchar));
12055 || ! (current = av_pop(stack))
12056 || ! IS_OPERAND(current)
12057 || ! (lparen = av_pop(stack))
12058 || IS_OPERAND(lparen)
12059 || SvUV(lparen) != '(')
12061 SvREFCNT_dec(current);
12063 vFAIL("Unexpected ')'");
12066 SvREFCNT_dec_NN(lparen);
12073 /* Here, we have an operand to process, in 'current' */
12075 if (top_index < 0) { /* Just push if stack is empty */
12076 av_push(stack, current);
12079 SV* top = av_pop(stack);
12081 char current_operator;
12083 if (IS_OPERAND(top)) {
12084 SvREFCNT_dec_NN(top);
12085 SvREFCNT_dec_NN(current);
12086 vFAIL("Operand with no preceding operator");
12088 current_operator = (char) SvUV(top);
12089 switch (current_operator) {
12090 case '(': /* Push the '(' back on followed by the new
12092 av_push(stack, top);
12093 av_push(stack, current);
12094 SvREFCNT_inc(top); /* Counters the '_dec' done
12095 just after the 'break', so
12096 it doesn't get wrongly freed
12101 _invlist_invert(current);
12103 /* Unlike binary operators, the top of the stack,
12104 * now that this unary one has been popped off, may
12105 * legally be an operator, and we now have operand
12108 SvREFCNT_dec_NN(top);
12109 goto handle_operand;
12112 prev = av_pop(stack);
12113 _invlist_intersection(prev,
12116 av_push(stack, current);
12121 prev = av_pop(stack);
12122 _invlist_union(prev, current, ¤t);
12123 av_push(stack, current);
12127 prev = av_pop(stack);;
12128 _invlist_subtract(prev, current, ¤t);
12129 av_push(stack, current);
12132 case '^': /* The union minus the intersection */
12138 prev = av_pop(stack);
12139 _invlist_union(prev, current, &u);
12140 _invlist_intersection(prev, current, &i);
12141 /* _invlist_subtract will overwrite current
12142 without freeing what it already contains */
12144 _invlist_subtract(u, i, ¤t);
12145 av_push(stack, current);
12146 SvREFCNT_dec_NN(i);
12147 SvREFCNT_dec_NN(u);
12148 SvREFCNT_dec_NN(element);
12153 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12155 SvREFCNT_dec_NN(top);
12156 SvREFCNT_dec(prev);
12160 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12163 if (av_tindex(stack) < 0 /* Was empty */
12164 || ((final = av_pop(stack)) == NULL)
12165 || ! IS_OPERAND(final)
12166 || av_tindex(stack) >= 0) /* More left on stack */
12168 vFAIL("Incomplete expression within '(?[ ])'");
12171 /* Here, 'final' is the resultant inversion list from evaluating the
12172 * expression. Return it if so requested */
12173 if (return_invlist) {
12174 *return_invlist = final;
12178 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12179 * expecting a string of ranges and individual code points */
12180 invlist_iterinit(final);
12181 result_string = newSVpvs("");
12182 while (invlist_iternext(final, &start, &end)) {
12183 if (start == end) {
12184 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12187 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12192 save_parse = RExC_parse;
12193 RExC_parse = SvPV(result_string, len);
12194 save_end = RExC_end;
12195 RExC_end = RExC_parse + len;
12197 /* We turn off folding around the call, as the class we have constructed
12198 * already has all folding taken into consideration, and we don't want
12199 * regclass() to add to that */
12200 RExC_flags &= ~RXf_PMf_FOLD;
12201 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12203 node = regclass(pRExC_state, flagp,depth+1,
12204 FALSE, /* means parse the whole char class */
12205 FALSE, /* don't allow multi-char folds */
12206 TRUE, /* silence non-portable warnings. The above may very
12207 well have generated non-portable code points, but
12208 they're valid on this machine */
12211 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12214 RExC_flags |= RXf_PMf_FOLD;
12216 RExC_parse = save_parse + 1;
12217 RExC_end = save_end;
12218 SvREFCNT_dec_NN(final);
12219 SvREFCNT_dec_NN(result_string);
12221 nextchar(pRExC_state);
12222 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12227 /* The names of properties whose definitions are not known at compile time are
12228 * stored in this SV, after a constant heading. So if the length has been
12229 * changed since initialization, then there is a run-time definition. */
12230 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12233 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12234 const bool stop_at_1, /* Just parse the next thing, don't
12235 look for a full character class */
12236 bool allow_multi_folds,
12237 const bool silence_non_portable, /* Don't output warnings
12240 SV** ret_invlist) /* Return an inversion list, not a node */
12242 /* parse a bracketed class specification. Most of these will produce an
12243 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12244 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12245 * under /i with multi-character folds: it will be rewritten following the
12246 * paradigm of this example, where the <multi-fold>s are characters which
12247 * fold to multiple character sequences:
12248 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12249 * gets effectively rewritten as:
12250 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12251 * reg() gets called (recursively) on the rewritten version, and this
12252 * function will return what it constructs. (Actually the <multi-fold>s
12253 * aren't physically removed from the [abcdefghi], it's just that they are
12254 * ignored in the recursion by means of a flag:
12255 * <RExC_in_multi_char_class>.)
12257 * ANYOF nodes contain a bit map for the first 256 characters, with the
12258 * corresponding bit set if that character is in the list. For characters
12259 * above 255, a range list or swash is used. There are extra bits for \w,
12260 * etc. in locale ANYOFs, as what these match is not determinable at
12263 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12264 * to be restarted. This can only happen if ret_invlist is non-NULL.
12268 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12270 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12273 IV namedclass = OOB_NAMEDCLASS;
12274 char *rangebegin = NULL;
12275 bool need_class = 0;
12277 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12278 than just initialized. */
12279 SV* properties = NULL; /* Code points that match \p{} \P{} */
12280 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12281 extended beyond the Latin1 range */
12282 UV element_count = 0; /* Number of distinct elements in the class.
12283 Optimizations may be possible if this is tiny */
12284 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12285 character; used under /i */
12287 char * stop_ptr = RExC_end; /* where to stop parsing */
12288 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12290 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12292 /* Unicode properties are stored in a swash; this holds the current one
12293 * being parsed. If this swash is the only above-latin1 component of the
12294 * character class, an optimization is to pass it directly on to the
12295 * execution engine. Otherwise, it is set to NULL to indicate that there
12296 * are other things in the class that have to be dealt with at execution
12298 SV* swash = NULL; /* Code points that match \p{} \P{} */
12300 /* Set if a component of this character class is user-defined; just passed
12301 * on to the engine */
12302 bool has_user_defined_property = FALSE;
12304 /* inversion list of code points this node matches only when the target
12305 * string is in UTF-8. (Because is under /d) */
12306 SV* depends_list = NULL;
12308 /* inversion list of code points this node matches. For much of the
12309 * function, it includes only those that match regardless of the utf8ness
12310 * of the target string */
12311 SV* cp_list = NULL;
12314 /* In a range, counts how many 0-2 of the ends of it came from literals,
12315 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12316 UV literal_endpoint = 0;
12318 bool invert = FALSE; /* Is this class to be complemented */
12320 /* Is there any thing like \W or [:^digit:] that matches above the legal
12321 * Unicode range? */
12322 bool runtime_posix_matches_above_Unicode = FALSE;
12324 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12325 case we need to change the emitted regop to an EXACT. */
12326 const char * orig_parse = RExC_parse;
12327 const SSize_t orig_size = RExC_size;
12328 GET_RE_DEBUG_FLAGS_DECL;
12330 PERL_ARGS_ASSERT_REGCLASS;
12332 PERL_UNUSED_ARG(depth);
12335 DEBUG_PARSE("clas");
12337 /* Assume we are going to generate an ANYOF node. */
12338 ret = reganode(pRExC_state, ANYOF, 0);
12341 RExC_size += ANYOF_SKIP;
12342 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12345 ANYOF_FLAGS(ret) = 0;
12347 RExC_emit += ANYOF_SKIP;
12349 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12351 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12352 initial_listsv_len = SvCUR(listsv);
12353 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12357 RExC_parse = regpatws(pRExC_state, RExC_parse,
12358 FALSE /* means don't recognize comments */);
12361 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12364 allow_multi_folds = FALSE;
12367 RExC_parse = regpatws(pRExC_state, RExC_parse,
12368 FALSE /* means don't recognize comments */);
12372 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12373 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12374 const char *s = RExC_parse;
12375 const char c = *s++;
12377 while (isWORDCHAR(*s))
12379 if (*s && c == *s && s[1] == ']') {
12380 SAVEFREESV(RExC_rx_sv);
12382 "POSIX syntax [%c %c] belongs inside character classes",
12384 (void)ReREFCNT_inc(RExC_rx_sv);
12388 /* If the caller wants us to just parse a single element, accomplish this
12389 * by faking the loop ending condition */
12390 if (stop_at_1 && RExC_end > RExC_parse) {
12391 stop_ptr = RExC_parse + 1;
12394 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12395 if (UCHARAT(RExC_parse) == ']')
12396 goto charclassloop;
12400 if (RExC_parse >= stop_ptr) {
12405 RExC_parse = regpatws(pRExC_state, RExC_parse,
12406 FALSE /* means don't recognize comments */);
12409 if (UCHARAT(RExC_parse) == ']') {
12415 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12416 save_value = value;
12417 save_prevvalue = prevvalue;
12420 rangebegin = RExC_parse;
12424 value = utf8n_to_uvchr((U8*)RExC_parse,
12425 RExC_end - RExC_parse,
12426 &numlen, UTF8_ALLOW_DEFAULT);
12427 RExC_parse += numlen;
12430 value = UCHARAT(RExC_parse++);
12433 && RExC_parse < RExC_end
12434 && POSIXCC(UCHARAT(RExC_parse)))
12436 namedclass = regpposixcc(pRExC_state, value, strict);
12438 else if (value == '\\') {
12440 value = utf8n_to_uvchr((U8*)RExC_parse,
12441 RExC_end - RExC_parse,
12442 &numlen, UTF8_ALLOW_DEFAULT);
12443 RExC_parse += numlen;
12446 value = UCHARAT(RExC_parse++);
12448 /* Some compilers cannot handle switching on 64-bit integer
12449 * values, therefore value cannot be an UV. Yes, this will
12450 * be a problem later if we want switch on Unicode.
12451 * A similar issue a little bit later when switching on
12452 * namedclass. --jhi */
12454 /* If the \ is escaping white space when white space is being
12455 * skipped, it means that that white space is wanted literally, and
12456 * is already in 'value'. Otherwise, need to translate the escape
12457 * into what it signifies. */
12458 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12460 case 'w': namedclass = ANYOF_WORDCHAR; break;
12461 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12462 case 's': namedclass = ANYOF_SPACE; break;
12463 case 'S': namedclass = ANYOF_NSPACE; break;
12464 case 'd': namedclass = ANYOF_DIGIT; break;
12465 case 'D': namedclass = ANYOF_NDIGIT; break;
12466 case 'v': namedclass = ANYOF_VERTWS; break;
12467 case 'V': namedclass = ANYOF_NVERTWS; break;
12468 case 'h': namedclass = ANYOF_HORIZWS; break;
12469 case 'H': namedclass = ANYOF_NHORIZWS; break;
12470 case 'N': /* Handle \N{NAME} in class */
12472 /* We only pay attention to the first char of
12473 multichar strings being returned. I kinda wonder
12474 if this makes sense as it does change the behaviour
12475 from earlier versions, OTOH that behaviour was broken
12477 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12478 TRUE, /* => charclass */
12481 if (*flagp & RESTART_UTF8)
12482 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12492 /* We will handle any undefined properties ourselves */
12493 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12495 if (RExC_parse >= RExC_end)
12496 vFAIL2("Empty \\%c{}", (U8)value);
12497 if (*RExC_parse == '{') {
12498 const U8 c = (U8)value;
12499 e = strchr(RExC_parse++, '}');
12501 vFAIL2("Missing right brace on \\%c{}", c);
12502 while (isSPACE(UCHARAT(RExC_parse)))
12504 if (e == RExC_parse)
12505 vFAIL2("Empty \\%c{}", c);
12506 n = e - RExC_parse;
12507 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12519 if (UCHARAT(RExC_parse) == '^') {
12522 /* toggle. (The rhs xor gets the single bit that
12523 * differs between P and p; the other xor inverts just
12525 value ^= 'P' ^ 'p';
12527 while (isSPACE(UCHARAT(RExC_parse))) {
12532 /* Try to get the definition of the property into
12533 * <invlist>. If /i is in effect, the effective property
12534 * will have its name be <__NAME_i>. The design is
12535 * discussed in commit
12536 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12537 formatted = Perl_form(aTHX_
12539 (FOLD) ? "__" : "",
12544 name = savepvn(formatted, strlen(formatted));
12546 /* Look up the property name, and get its swash and
12547 * inversion list, if the property is found */
12549 SvREFCNT_dec_NN(swash);
12551 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12554 NULL, /* No inversion list */
12557 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12559 SvREFCNT_dec_NN(swash);
12563 /* Here didn't find it. It could be a user-defined
12564 * property that will be available at run-time. If we
12565 * accept only compile-time properties, is an error;
12566 * otherwise add it to the list for run-time look up */
12568 RExC_parse = e + 1;
12570 "Property '%"UTF8f"' is unknown",
12571 UTF8fARG(UTF, n, name));
12573 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
12574 (value == 'p' ? '+' : '!'),
12575 UTF8fARG(UTF, n, name));
12576 has_user_defined_property = TRUE;
12578 /* We don't know yet, so have to assume that the
12579 * property could match something in the Latin1 range,
12580 * hence something that isn't utf8. Note that this
12581 * would cause things in <depends_list> to match
12582 * inappropriately, except that any \p{}, including
12583 * this one forces Unicode semantics, which means there
12584 * is <no depends_list> */
12585 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12589 /* Here, did get the swash and its inversion list. If
12590 * the swash is from a user-defined property, then this
12591 * whole character class should be regarded as such */
12592 has_user_defined_property =
12594 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12596 /* Invert if asking for the complement */
12597 if (value == 'P') {
12598 _invlist_union_complement_2nd(properties,
12602 /* The swash can't be used as-is, because we've
12603 * inverted things; delay removing it to here after
12604 * have copied its invlist above */
12605 SvREFCNT_dec_NN(swash);
12609 _invlist_union(properties, invlist, &properties);
12614 RExC_parse = e + 1;
12615 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12618 /* \p means they want Unicode semantics */
12619 RExC_uni_semantics = 1;
12622 case 'n': value = '\n'; break;
12623 case 'r': value = '\r'; break;
12624 case 't': value = '\t'; break;
12625 case 'f': value = '\f'; break;
12626 case 'b': value = '\b'; break;
12627 case 'e': value = ASCII_TO_NATIVE('\033');break;
12628 case 'a': value = '\a'; break;
12630 RExC_parse--; /* function expects to be pointed at the 'o' */
12632 const char* error_msg;
12633 bool valid = grok_bslash_o(&RExC_parse,
12636 SIZE_ONLY, /* warnings in pass
12639 silence_non_portable,
12645 if (PL_encoding && value < 0x100) {
12646 goto recode_encoding;
12650 RExC_parse--; /* function expects to be pointed at the 'x' */
12652 const char* error_msg;
12653 bool valid = grok_bslash_x(&RExC_parse,
12656 TRUE, /* Output warnings */
12658 silence_non_portable,
12664 if (PL_encoding && value < 0x100)
12665 goto recode_encoding;
12668 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12670 case '0': case '1': case '2': case '3': case '4':
12671 case '5': case '6': case '7':
12673 /* Take 1-3 octal digits */
12674 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12675 numlen = (strict) ? 4 : 3;
12676 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12677 RExC_parse += numlen;
12680 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12681 vFAIL("Need exactly 3 octal digits");
12683 else if (! SIZE_ONLY /* like \08, \178 */
12685 && RExC_parse < RExC_end
12686 && isDIGIT(*RExC_parse)
12687 && ckWARN(WARN_REGEXP))
12689 SAVEFREESV(RExC_rx_sv);
12690 reg_warn_non_literal_string(
12692 form_short_octal_warning(RExC_parse, numlen));
12693 (void)ReREFCNT_inc(RExC_rx_sv);
12696 if (PL_encoding && value < 0x100)
12697 goto recode_encoding;
12701 if (! RExC_override_recoding) {
12702 SV* enc = PL_encoding;
12703 value = reg_recode((const char)(U8)value, &enc);
12706 vFAIL("Invalid escape in the specified encoding");
12708 else if (SIZE_ONLY) {
12709 ckWARNreg(RExC_parse,
12710 "Invalid escape in the specified encoding");
12716 /* Allow \_ to not give an error */
12717 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12719 vFAIL2("Unrecognized escape \\%c in character class",
12723 SAVEFREESV(RExC_rx_sv);
12724 ckWARN2reg(RExC_parse,
12725 "Unrecognized escape \\%c in character class passed through",
12727 (void)ReREFCNT_inc(RExC_rx_sv);
12731 } /* End of switch on char following backslash */
12732 } /* end of handling backslash escape sequences */
12735 literal_endpoint++;
12738 /* Here, we have the current token in 'value' */
12740 /* What matches in a locale is not known until runtime. This includes
12741 * what the Posix classes (like \w, [:space:]) match. Room must be
12742 * reserved (one time per class) to store such classes, either if Perl
12743 * is compiled so that locale nodes always should have this space, or
12744 * if there is such class info to be stored. The space will contain a
12745 * bit for each named class that is to be matched against. This isn't
12746 * needed for \p{} and pseudo-classes, as they are not affected by
12747 * locale, and hence are dealt with separately */
12750 && (ANYOF_LOCALE == ANYOF_CLASS
12751 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12755 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12758 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12759 ANYOF_CLASS_ZERO(ret);
12761 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12764 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12766 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12767 * literal, as is the character that began the false range, i.e.
12768 * the 'a' in the examples */
12771 const int w = (RExC_parse >= rangebegin)
12772 ? RExC_parse - rangebegin
12776 "False [] range \"%"UTF8f"\"",
12777 UTF8fARG(UTF, w, rangebegin));
12780 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12781 ckWARN2reg(RExC_parse,
12782 "False [] range \"%"UTF8f"\"",
12783 UTF8fARG(UTF, w, rangebegin));
12784 (void)ReREFCNT_inc(RExC_rx_sv);
12785 cp_list = add_cp_to_invlist(cp_list, '-');
12786 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12790 range = 0; /* this was not a true range */
12791 element_count += 2; /* So counts for three values */
12795 U8 classnum = namedclass_to_classnum(namedclass);
12796 if (namedclass >= ANYOF_MAX) { /* If a special class */
12797 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12799 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12800 * /l make a difference in what these match. There
12801 * would be problems if these characters had folds
12802 * other than themselves, as cp_list is subject to
12804 if (classnum != _CC_VERTSPACE) {
12805 assert( namedclass == ANYOF_HORIZWS
12806 || namedclass == ANYOF_NHORIZWS);
12808 /* It turns out that \h is just a synonym for
12810 classnum = _CC_BLANK;
12813 _invlist_union_maybe_complement_2nd(
12815 PL_XPosix_ptrs[classnum],
12816 cBOOL(namedclass % 2), /* Complement if odd
12817 (NHORIZWS, NVERTWS)
12822 else if (classnum == _CC_ASCII) {
12825 ANYOF_CLASS_SET(ret, namedclass);
12828 #endif /* Not isascii(); just use the hard-coded definition for it */
12829 _invlist_union_maybe_complement_2nd(
12832 cBOOL(namedclass % 2), /* Complement if odd
12836 else { /* Garden variety class */
12838 /* The ascii range inversion list */
12839 SV* ascii_source = PL_Posix_ptrs[classnum];
12841 /* The full Latin1 range inversion list */
12842 SV* l1_source = PL_L1Posix_ptrs[classnum];
12844 /* This code is structured into two major clauses. The
12845 * first is for classes whose complete definitions may not
12846 * already be known. It not, the Latin1 definition
12847 * (guaranteed to already known) is used plus code is
12848 * generated to load the rest at run-time (only if needed).
12849 * If the complete definition is known, it drops down to
12850 * the second clause, where the complete definition is
12853 if (classnum < _FIRST_NON_SWASH_CC) {
12855 /* Here, the class has a swash, which may or not
12856 * already be loaded */
12858 /* The name of the property to use to match the full
12859 * eXtended Unicode range swash for this character
12861 const char *Xname = swash_property_names[classnum];
12863 /* If returning the inversion list, we can't defer
12864 * getting this until runtime */
12865 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12866 PL_utf8_swash_ptrs[classnum] =
12867 _core_swash_init("utf8", Xname, &PL_sv_undef,
12870 NULL, /* No inversion list */
12871 NULL /* No flags */
12873 assert(PL_utf8_swash_ptrs[classnum]);
12875 if ( ! PL_utf8_swash_ptrs[classnum]) {
12876 if (namedclass % 2 == 0) { /* A non-complemented
12878 /* If not /a matching, there are code points we
12879 * don't know at compile time. Arrange for the
12880 * unknown matches to be loaded at run-time, if
12882 if (! AT_LEAST_ASCII_RESTRICTED) {
12883 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12886 if (LOC) { /* Under locale, set run-time
12888 ANYOF_CLASS_SET(ret, namedclass);
12891 /* Add the current class's code points to
12892 * the running total */
12893 _invlist_union(posixes,
12894 (AT_LEAST_ASCII_RESTRICTED)
12900 else { /* A complemented class */
12901 if (AT_LEAST_ASCII_RESTRICTED) {
12902 /* Under /a should match everything above
12903 * ASCII, plus the complement of the set's
12905 _invlist_union_complement_2nd(posixes,
12910 /* Arrange for the unknown matches to be
12911 * loaded at run-time, if needed */
12912 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12914 runtime_posix_matches_above_Unicode = TRUE;
12916 ANYOF_CLASS_SET(ret, namedclass);
12920 /* We want to match everything in
12921 * Latin1, except those things that
12922 * l1_source matches */
12923 SV* scratch_list = NULL;
12924 _invlist_subtract(PL_Latin1, l1_source,
12927 /* Add the list from this class to the
12930 posixes = scratch_list;
12933 _invlist_union(posixes,
12936 SvREFCNT_dec_NN(scratch_list);
12938 if (DEPENDS_SEMANTICS) {
12940 |= ANYOF_NON_UTF8_LATIN1_ALL;
12945 goto namedclass_done;
12948 /* Here, there is a swash loaded for the class. If no
12949 * inversion list for it yet, get it */
12950 if (! PL_XPosix_ptrs[classnum]) {
12951 PL_XPosix_ptrs[classnum]
12952 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12956 /* Here there is an inversion list already loaded for the
12959 if (namedclass % 2 == 0) { /* A non-complemented class,
12960 like ANYOF_PUNCT */
12962 /* For non-locale, just add it to any existing list
12964 _invlist_union(posixes,
12965 (AT_LEAST_ASCII_RESTRICTED)
12967 : PL_XPosix_ptrs[classnum],
12970 else { /* Locale */
12971 SV* scratch_list = NULL;
12973 /* For above Latin1 code points, we use the full
12975 _invlist_intersection(PL_AboveLatin1,
12976 PL_XPosix_ptrs[classnum],
12978 /* And set the output to it, adding instead if
12979 * there already is an output. Checking if
12980 * 'posixes' is NULL first saves an extra clone.
12981 * Its reference count will be decremented at the
12982 * next union, etc, or if this is the only
12983 * instance, at the end of the routine */
12985 posixes = scratch_list;
12988 _invlist_union(posixes, scratch_list, &posixes);
12989 SvREFCNT_dec_NN(scratch_list);
12992 #ifndef HAS_ISBLANK
12993 if (namedclass != ANYOF_BLANK) {
12995 /* Set this class in the node for runtime
12997 ANYOF_CLASS_SET(ret, namedclass);
12998 #ifndef HAS_ISBLANK
13001 /* No isblank(), use the hard-coded ASCII-range
13002 * blanks, adding them to the running total. */
13004 _invlist_union(posixes, ascii_source, &posixes);
13009 else { /* A complemented class, like ANYOF_NPUNCT */
13011 _invlist_union_complement_2nd(
13013 (AT_LEAST_ASCII_RESTRICTED)
13015 : PL_XPosix_ptrs[classnum],
13017 /* Under /d, everything in the upper half of the
13018 * Latin1 range matches this complement */
13019 if (DEPENDS_SEMANTICS) {
13020 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
13023 else { /* Locale */
13024 SV* scratch_list = NULL;
13025 _invlist_subtract(PL_AboveLatin1,
13026 PL_XPosix_ptrs[classnum],
13029 posixes = scratch_list;
13032 _invlist_union(posixes, scratch_list, &posixes);
13033 SvREFCNT_dec_NN(scratch_list);
13035 #ifndef HAS_ISBLANK
13036 if (namedclass != ANYOF_NBLANK) {
13038 ANYOF_CLASS_SET(ret, namedclass);
13039 #ifndef HAS_ISBLANK
13042 /* Get the list of all code points in Latin1
13043 * that are not ASCII blanks, and add them to
13044 * the running total */
13045 _invlist_subtract(PL_Latin1, ascii_source,
13047 _invlist_union(posixes, scratch_list, &posixes);
13048 SvREFCNT_dec_NN(scratch_list);
13055 continue; /* Go get next character */
13057 } /* end of namedclass \blah */
13059 /* Here, we have a single value. If 'range' is set, it is the ending
13060 * of a range--check its validity. Later, we will handle each
13061 * individual code point in the range. If 'range' isn't set, this
13062 * could be the beginning of a range, so check for that by looking
13063 * ahead to see if the next real character to be processed is the range
13064 * indicator--the minus sign */
13067 RExC_parse = regpatws(pRExC_state, RExC_parse,
13068 FALSE /* means don't recognize comments */);
13072 if (prevvalue > value) /* b-a */ {
13073 const int w = RExC_parse - rangebegin;
13075 "Invalid [] range \"%"UTF8f"\"",
13076 UTF8fARG(UTF, w, rangebegin));
13077 range = 0; /* not a valid range */
13081 prevvalue = value; /* save the beginning of the potential range */
13082 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13083 && *RExC_parse == '-')
13085 char* next_char_ptr = RExC_parse + 1;
13086 if (skip_white) { /* Get the next real char after the '-' */
13087 next_char_ptr = regpatws(pRExC_state,
13089 FALSE); /* means don't recognize
13093 /* If the '-' is at the end of the class (just before the ']',
13094 * it is a literal minus; otherwise it is a range */
13095 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13096 RExC_parse = next_char_ptr;
13098 /* a bad range like \w-, [:word:]- ? */
13099 if (namedclass > OOB_NAMEDCLASS) {
13100 if (strict || ckWARN(WARN_REGEXP)) {
13102 RExC_parse >= rangebegin ?
13103 RExC_parse - rangebegin : 0;
13105 vFAIL4("False [] range \"%*.*s\"",
13110 "False [] range \"%*.*s\"",
13115 cp_list = add_cp_to_invlist(cp_list, '-');
13119 range = 1; /* yeah, it's a range! */
13120 continue; /* but do it the next time */
13125 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13128 /* non-Latin1 code point implies unicode semantics. Must be set in
13129 * pass1 so is there for the whole of pass 2 */
13131 RExC_uni_semantics = 1;
13134 /* Ready to process either the single value, or the completed range.
13135 * For single-valued non-inverted ranges, we consider the possibility
13136 * of multi-char folds. (We made a conscious decision to not do this
13137 * for the other cases because it can often lead to non-intuitive
13138 * results. For example, you have the peculiar case that:
13139 * "s s" =~ /^[^\xDF]+$/i => Y
13140 * "ss" =~ /^[^\xDF]+$/i => N
13142 * See [perl #89750] */
13143 if (FOLD && allow_multi_folds && value == prevvalue) {
13144 if (value == LATIN_SMALL_LETTER_SHARP_S
13145 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13148 /* Here <value> is indeed a multi-char fold. Get what it is */
13150 U8 foldbuf[UTF8_MAXBYTES_CASE];
13153 UV folded = _to_uni_fold_flags(
13158 | ((LOC) ? FOLD_FLAGS_LOCALE
13159 : (ASCII_FOLD_RESTRICTED)
13160 ? FOLD_FLAGS_NOMIX_ASCII
13164 /* Here, <folded> should be the first character of the
13165 * multi-char fold of <value>, with <foldbuf> containing the
13166 * whole thing. But, if this fold is not allowed (because of
13167 * the flags), <fold> will be the same as <value>, and should
13168 * be processed like any other character, so skip the special
13170 if (folded != value) {
13172 /* Skip if we are recursed, currently parsing the class
13173 * again. Otherwise add this character to the list of
13174 * multi-char folds. */
13175 if (! RExC_in_multi_char_class) {
13176 AV** this_array_ptr;
13178 STRLEN cp_count = utf8_length(foldbuf,
13179 foldbuf + foldlen);
13180 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13182 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13185 if (! multi_char_matches) {
13186 multi_char_matches = newAV();
13189 /* <multi_char_matches> is actually an array of arrays.
13190 * There will be one or two top-level elements: [2],
13191 * and/or [3]. The [2] element is an array, each
13192 * element thereof is a character which folds to TWO
13193 * characters; [3] is for folds to THREE characters.
13194 * (Unicode guarantees a maximum of 3 characters in any
13195 * fold.) When we rewrite the character class below,
13196 * we will do so such that the longest folds are
13197 * written first, so that it prefers the longest
13198 * matching strings first. This is done even if it
13199 * turns out that any quantifier is non-greedy, out of
13200 * programmer laziness. Tom Christiansen has agreed
13201 * that this is ok. This makes the test for the
13202 * ligature 'ffi' come before the test for 'ff' */
13203 if (av_exists(multi_char_matches, cp_count)) {
13204 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13206 this_array = *this_array_ptr;
13209 this_array = newAV();
13210 av_store(multi_char_matches, cp_count,
13213 av_push(this_array, multi_fold);
13216 /* This element should not be processed further in this
13219 value = save_value;
13220 prevvalue = save_prevvalue;
13226 /* Deal with this element of the class */
13229 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13231 SV* this_range = _new_invlist(1);
13232 _append_range_to_invlist(this_range, prevvalue, value);
13234 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13235 * If this range was specified using something like 'i-j', we want
13236 * to include only the 'i' and the 'j', and not anything in
13237 * between, so exclude non-ASCII, non-alphabetics from it.
13238 * However, if the range was specified with something like
13239 * [\x89-\x91] or [\x89-j], all code points within it should be
13240 * included. literal_endpoint==2 means both ends of the range used
13241 * a literal character, not \x{foo} */
13242 if (literal_endpoint == 2
13243 && ((prevvalue >= 'a' && value <= 'z')
13244 || (prevvalue >= 'A' && value <= 'Z')))
13246 _invlist_intersection(this_range, PL_ASCII,
13248 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13251 _invlist_union(cp_list, this_range, &cp_list);
13252 literal_endpoint = 0;
13256 range = 0; /* this range (if it was one) is done now */
13257 } /* End of loop through all the text within the brackets */
13259 /* If anything in the class expands to more than one character, we have to
13260 * deal with them by building up a substitute parse string, and recursively
13261 * calling reg() on it, instead of proceeding */
13262 if (multi_char_matches) {
13263 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13266 char *save_end = RExC_end;
13267 char *save_parse = RExC_parse;
13268 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13273 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13274 because too confusing */
13276 sv_catpv(substitute_parse, "(?:");
13280 /* Look at the longest folds first */
13281 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13283 if (av_exists(multi_char_matches, cp_count)) {
13284 AV** this_array_ptr;
13287 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13289 while ((this_sequence = av_pop(*this_array_ptr)) !=
13292 if (! first_time) {
13293 sv_catpv(substitute_parse, "|");
13295 first_time = FALSE;
13297 sv_catpv(substitute_parse, SvPVX(this_sequence));
13302 /* If the character class contains anything else besides these
13303 * multi-character folds, have to include it in recursive parsing */
13304 if (element_count) {
13305 sv_catpv(substitute_parse, "|[");
13306 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13307 sv_catpv(substitute_parse, "]");
13310 sv_catpv(substitute_parse, ")");
13313 /* This is a way to get the parse to skip forward a whole named
13314 * sequence instead of matching the 2nd character when it fails the
13316 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13320 RExC_parse = SvPV(substitute_parse, len);
13321 RExC_end = RExC_parse + len;
13322 RExC_in_multi_char_class = 1;
13323 RExC_emit = (regnode *)orig_emit;
13325 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13327 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13329 RExC_parse = save_parse;
13330 RExC_end = save_end;
13331 RExC_in_multi_char_class = 0;
13332 SvREFCNT_dec_NN(multi_char_matches);
13336 /* If the character class contains only a single element, it may be
13337 * optimizable into another node type which is smaller and runs faster.
13338 * Check if this is the case for this class */
13339 if (element_count == 1 && ! ret_invlist) {
13343 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13344 [:digit:] or \p{foo} */
13346 /* All named classes are mapped into POSIXish nodes, with its FLAG
13347 * argument giving which class it is */
13348 switch ((I32)namedclass) {
13349 case ANYOF_UNIPROP:
13352 /* These don't depend on the charset modifiers. They always
13353 * match under /u rules */
13354 case ANYOF_NHORIZWS:
13355 case ANYOF_HORIZWS:
13356 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13359 case ANYOF_NVERTWS:
13364 /* The actual POSIXish node for all the rest depends on the
13365 * charset modifier. The ones in the first set depend only on
13366 * ASCII or, if available on this platform, locale */
13370 op = (LOC) ? POSIXL : POSIXA;
13381 /* under /a could be alpha */
13383 if (ASCII_RESTRICTED) {
13384 namedclass = ANYOF_ALPHA + (namedclass % 2);
13392 /* The rest have more possibilities depending on the charset.
13393 * We take advantage of the enum ordering of the charset
13394 * modifiers to get the exact node type, */
13396 op = POSIXD + get_regex_charset(RExC_flags);
13397 if (op > POSIXA) { /* /aa is same as /a */
13400 #ifndef HAS_ISBLANK
13402 && (namedclass == ANYOF_BLANK
13403 || namedclass == ANYOF_NBLANK))
13410 /* The odd numbered ones are the complements of the
13411 * next-lower even number one */
13412 if (namedclass % 2 == 1) {
13416 arg = namedclass_to_classnum(namedclass);
13420 else if (value == prevvalue) {
13422 /* Here, the class consists of just a single code point */
13425 if (! LOC && value == '\n') {
13426 op = REG_ANY; /* Optimize [^\n] */
13427 *flagp |= HASWIDTH|SIMPLE;
13431 else if (value < 256 || UTF) {
13433 /* Optimize a single value into an EXACTish node, but not if it
13434 * would require converting the pattern to UTF-8. */
13435 op = compute_EXACTish(pRExC_state);
13437 } /* Otherwise is a range */
13438 else if (! LOC) { /* locale could vary these */
13439 if (prevvalue == '0') {
13440 if (value == '9') {
13447 /* Here, we have changed <op> away from its initial value iff we found
13448 * an optimization */
13451 /* Throw away this ANYOF regnode, and emit the calculated one,
13452 * which should correspond to the beginning, not current, state of
13454 const char * cur_parse = RExC_parse;
13455 RExC_parse = (char *)orig_parse;
13459 /* To get locale nodes to not use the full ANYOF size would
13460 * require moving the code above that writes the portions
13461 * of it that aren't in other nodes to after this point.
13462 * e.g. ANYOF_CLASS_SET */
13463 RExC_size = orig_size;
13467 RExC_emit = (regnode *)orig_emit;
13468 if (PL_regkind[op] == POSIXD) {
13470 op += NPOSIXD - POSIXD;
13475 ret = reg_node(pRExC_state, op);
13477 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13481 *flagp |= HASWIDTH|SIMPLE;
13483 else if (PL_regkind[op] == EXACT) {
13484 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13487 RExC_parse = (char *) cur_parse;
13489 SvREFCNT_dec(posixes);
13490 SvREFCNT_dec(cp_list);
13497 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13499 /* If folding, we calculate all characters that could fold to or from the
13500 * ones already on the list */
13501 if (FOLD && cp_list) {
13502 UV start, end; /* End points of code point ranges */
13504 SV* fold_intersection = NULL;
13506 /* If the highest code point is within Latin1, we can use the
13507 * compiled-in Alphas list, and not have to go out to disk. This
13508 * yields two false positives, the masculine and feminine ordinal
13509 * indicators, which are weeded out below using the
13510 * IS_IN_SOME_FOLD_L1() macro */
13511 if (invlist_highest(cp_list) < 256) {
13512 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13513 &fold_intersection);
13517 /* Here, there are non-Latin1 code points, so we will have to go
13518 * fetch the list of all the characters that participate in folds
13520 if (! PL_utf8_foldable) {
13521 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13522 &PL_sv_undef, 1, 0);
13523 PL_utf8_foldable = _get_swash_invlist(swash);
13524 SvREFCNT_dec_NN(swash);
13527 /* This is a hash that for a particular fold gives all characters
13528 * that are involved in it */
13529 if (! PL_utf8_foldclosures) {
13531 /* If we were unable to find any folds, then we likely won't be
13532 * able to find the closures. So just create an empty list.
13533 * Folding will effectively be restricted to the non-Unicode
13534 * rules hard-coded into Perl. (This case happens legitimately
13535 * during compilation of Perl itself before the Unicode tables
13536 * are generated) */
13537 if (_invlist_len(PL_utf8_foldable) == 0) {
13538 PL_utf8_foldclosures = newHV();
13541 /* If the folds haven't been read in, call a fold function
13543 if (! PL_utf8_tofold) {
13544 U8 dummy[UTF8_MAXBYTES_CASE+1];
13546 /* This string is just a short named one above \xff */
13547 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13548 assert(PL_utf8_tofold); /* Verify that worked */
13550 PL_utf8_foldclosures =
13551 _swash_inversion_hash(PL_utf8_tofold);
13555 /* Only the characters in this class that participate in folds need
13556 * be checked. Get the intersection of this class and all the
13557 * possible characters that are foldable. This can quickly narrow
13558 * down a large class */
13559 _invlist_intersection(PL_utf8_foldable, cp_list,
13560 &fold_intersection);
13563 /* Now look at the foldable characters in this class individually */
13564 invlist_iterinit(fold_intersection);
13565 while (invlist_iternext(fold_intersection, &start, &end)) {
13568 /* Locale folding for Latin1 characters is deferred until runtime */
13569 if (LOC && start < 256) {
13573 /* Look at every character in the range */
13574 for (j = start; j <= end; j++) {
13576 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13582 /* We have the latin1 folding rules hard-coded here so that
13583 * an innocent-looking character class, like /[ks]/i won't
13584 * have to go out to disk to find the possible matches.
13585 * XXX It would be better to generate these via regen, in
13586 * case a new version of the Unicode standard adds new
13587 * mappings, though that is not really likely, and may be
13588 * caught by the default: case of the switch below. */
13590 if (IS_IN_SOME_FOLD_L1(j)) {
13592 /* ASCII is always matched; non-ASCII is matched only
13593 * under Unicode rules */
13594 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13596 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13600 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13604 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13605 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13607 /* Certain Latin1 characters have matches outside
13608 * Latin1. To get here, <j> is one of those
13609 * characters. None of these matches is valid for
13610 * ASCII characters under /aa, which is why the 'if'
13611 * just above excludes those. These matches only
13612 * happen when the target string is utf8. The code
13613 * below adds the single fold closures for <j> to the
13614 * inversion list. */
13619 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13623 cp_list = add_cp_to_invlist(cp_list,
13624 LATIN_SMALL_LETTER_LONG_S);
13627 cp_list = add_cp_to_invlist(cp_list,
13628 GREEK_CAPITAL_LETTER_MU);
13629 cp_list = add_cp_to_invlist(cp_list,
13630 GREEK_SMALL_LETTER_MU);
13632 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13633 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13635 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13637 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13638 cp_list = add_cp_to_invlist(cp_list,
13639 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13641 case LATIN_SMALL_LETTER_SHARP_S:
13642 cp_list = add_cp_to_invlist(cp_list,
13643 LATIN_CAPITAL_LETTER_SHARP_S);
13645 case 'F': case 'f':
13646 case 'I': case 'i':
13647 case 'L': case 'l':
13648 case 'T': case 't':
13649 case 'A': case 'a':
13650 case 'H': case 'h':
13651 case 'J': case 'j':
13652 case 'N': case 'n':
13653 case 'W': case 'w':
13654 case 'Y': case 'y':
13655 /* These all are targets of multi-character
13656 * folds from code points that require UTF8 to
13657 * express, so they can't match unless the
13658 * target string is in UTF-8, so no action here
13659 * is necessary, as regexec.c properly handles
13660 * the general case for UTF-8 matching and
13661 * multi-char folds */
13664 /* Use deprecated warning to increase the
13665 * chances of this being output */
13666 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13673 /* Here is an above Latin1 character. We don't have the rules
13674 * hard-coded for it. First, get its fold. This is the simple
13675 * fold, as the multi-character folds have been handled earlier
13676 * and separated out */
13677 _to_uni_fold_flags(j, foldbuf, &foldlen,
13679 ? FOLD_FLAGS_LOCALE
13680 : (ASCII_FOLD_RESTRICTED)
13681 ? FOLD_FLAGS_NOMIX_ASCII
13684 /* Single character fold of above Latin1. Add everything in
13685 * its fold closure to the list that this node should match.
13686 * The fold closures data structure is a hash with the keys
13687 * being the UTF-8 of every character that is folded to, like
13688 * 'k', and the values each an array of all code points that
13689 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13690 * Multi-character folds are not included */
13691 if ((listp = hv_fetch(PL_utf8_foldclosures,
13692 (char *) foldbuf, foldlen, FALSE)))
13694 AV* list = (AV*) *listp;
13696 for (k = 0; k <= av_len(list); k++) {
13697 SV** c_p = av_fetch(list, k, FALSE);
13700 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13704 /* /aa doesn't allow folds between ASCII and non-; /l
13705 * doesn't allow them between above and below 256 */
13706 if ((ASCII_FOLD_RESTRICTED
13707 && (isASCII(c) != isASCII(j)))
13708 || (LOC && c < 256)) {
13712 /* Folds involving non-ascii Latin1 characters
13713 * under /d are added to a separate list */
13714 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13716 cp_list = add_cp_to_invlist(cp_list, c);
13719 depends_list = add_cp_to_invlist(depends_list, c);
13725 SvREFCNT_dec_NN(fold_intersection);
13728 /* And combine the result (if any) with any inversion list from posix
13729 * classes. The lists are kept separate up to now because we don't want to
13730 * fold the classes (folding of those is automatically handled by the swash
13731 * fetching code) */
13733 if (! DEPENDS_SEMANTICS) {
13735 _invlist_union(cp_list, posixes, &cp_list);
13736 SvREFCNT_dec_NN(posixes);
13743 /* Under /d, we put into a separate list the Latin1 things that
13744 * match only when the target string is utf8 */
13745 SV* nonascii_but_latin1_properties = NULL;
13746 _invlist_intersection(posixes, PL_Latin1,
13747 &nonascii_but_latin1_properties);
13748 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13749 &nonascii_but_latin1_properties);
13750 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13753 _invlist_union(cp_list, posixes, &cp_list);
13754 SvREFCNT_dec_NN(posixes);
13760 if (depends_list) {
13761 _invlist_union(depends_list, nonascii_but_latin1_properties,
13763 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13766 depends_list = nonascii_but_latin1_properties;
13771 /* And combine the result (if any) with any inversion list from properties.
13772 * The lists are kept separate up to now so that we can distinguish the two
13773 * in regards to matching above-Unicode. A run-time warning is generated
13774 * if a Unicode property is matched against a non-Unicode code point. But,
13775 * we allow user-defined properties to match anything, without any warning,
13776 * and we also suppress the warning if there is a portion of the character
13777 * class that isn't a Unicode property, and which matches above Unicode, \W
13778 * or [\x{110000}] for example.
13779 * (Note that in this case, unlike the Posix one above, there is no
13780 * <depends_list>, because having a Unicode property forces Unicode
13783 bool warn_super = ! has_user_defined_property;
13786 /* If it matters to the final outcome, see if a non-property
13787 * component of the class matches above Unicode. If so, the
13788 * warning gets suppressed. This is true even if just a single
13789 * such code point is specified, as though not strictly correct if
13790 * another such code point is matched against, the fact that they
13791 * are using above-Unicode code points indicates they should know
13792 * the issues involved */
13794 bool non_prop_matches_above_Unicode =
13795 runtime_posix_matches_above_Unicode
13796 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13798 non_prop_matches_above_Unicode =
13799 ! non_prop_matches_above_Unicode;
13801 warn_super = ! non_prop_matches_above_Unicode;
13804 _invlist_union(properties, cp_list, &cp_list);
13805 SvREFCNT_dec_NN(properties);
13808 cp_list = properties;
13812 OP(ret) = ANYOF_WARN_SUPER;
13816 /* Here, we have calculated what code points should be in the character
13819 * Now we can see about various optimizations. Fold calculation (which we
13820 * did above) needs to take place before inversion. Otherwise /[^k]/i
13821 * would invert to include K, which under /i would match k, which it
13822 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13823 * folded until runtime */
13825 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13826 * at compile time. Besides not inverting folded locale now, we can't
13827 * invert if there are things such as \w, which aren't known until runtime
13830 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13832 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13834 _invlist_invert(cp_list);
13836 /* Any swash can't be used as-is, because we've inverted things */
13838 SvREFCNT_dec_NN(swash);
13842 /* Clear the invert flag since have just done it here */
13847 *ret_invlist = cp_list;
13848 SvREFCNT_dec(swash);
13850 /* Discard the generated node */
13852 RExC_size = orig_size;
13855 RExC_emit = orig_emit;
13860 /* If we didn't do folding, it's because some information isn't available
13861 * until runtime; set the run-time fold flag for these. (We don't have to
13862 * worry about properties folding, as that is taken care of by the swash
13866 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13869 /* Some character classes are equivalent to other nodes. Such nodes take
13870 * up less room and generally fewer operations to execute than ANYOF nodes.
13871 * Above, we checked for and optimized into some such equivalents for
13872 * certain common classes that are easy to test. Getting to this point in
13873 * the code means that the class didn't get optimized there. Since this
13874 * code is only executed in Pass 2, it is too late to save space--it has
13875 * been allocated in Pass 1, and currently isn't given back. But turning
13876 * things into an EXACTish node can allow the optimizer to join it to any
13877 * adjacent such nodes. And if the class is equivalent to things like /./,
13878 * expensive run-time swashes can be avoided. Now that we have more
13879 * complete information, we can find things necessarily missed by the
13880 * earlier code. I (khw) am not sure how much to look for here. It would
13881 * be easy, but perhaps too slow, to check any candidates against all the
13882 * node types they could possibly match using _invlistEQ(). */
13887 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13888 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13891 U8 op = END; /* The optimzation node-type */
13892 const char * cur_parse= RExC_parse;
13894 invlist_iterinit(cp_list);
13895 if (! invlist_iternext(cp_list, &start, &end)) {
13897 /* Here, the list is empty. This happens, for example, when a
13898 * Unicode property is the only thing in the character class, and
13899 * it doesn't match anything. (perluniprops.pod notes such
13902 *flagp |= HASWIDTH|SIMPLE;
13904 else if (start == end) { /* The range is a single code point */
13905 if (! invlist_iternext(cp_list, &start, &end)
13907 /* Don't do this optimization if it would require changing
13908 * the pattern to UTF-8 */
13909 && (start < 256 || UTF))
13911 /* Here, the list contains a single code point. Can optimize
13912 * into an EXACT node */
13921 /* A locale node under folding with one code point can be
13922 * an EXACTFL, as its fold won't be calculated until
13928 /* Here, we are generally folding, but there is only one
13929 * code point to match. If we have to, we use an EXACT
13930 * node, but it would be better for joining with adjacent
13931 * nodes in the optimization pass if we used the same
13932 * EXACTFish node that any such are likely to be. We can
13933 * do this iff the code point doesn't participate in any
13934 * folds. For example, an EXACTF of a colon is the same as
13935 * an EXACT one, since nothing folds to or from a colon. */
13937 if (IS_IN_SOME_FOLD_L1(value)) {
13942 if (! PL_utf8_foldable) {
13943 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13944 &PL_sv_undef, 1, 0);
13945 PL_utf8_foldable = _get_swash_invlist(swash);
13946 SvREFCNT_dec_NN(swash);
13948 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13953 /* If we haven't found the node type, above, it means we
13954 * can use the prevailing one */
13956 op = compute_EXACTish(pRExC_state);
13961 else if (start == 0) {
13962 if (end == UV_MAX) {
13964 *flagp |= HASWIDTH|SIMPLE;
13967 else if (end == '\n' - 1
13968 && invlist_iternext(cp_list, &start, &end)
13969 && start == '\n' + 1 && end == UV_MAX)
13972 *flagp |= HASWIDTH|SIMPLE;
13976 invlist_iterfinish(cp_list);
13979 RExC_parse = (char *)orig_parse;
13980 RExC_emit = (regnode *)orig_emit;
13982 ret = reg_node(pRExC_state, op);
13984 RExC_parse = (char *)cur_parse;
13986 if (PL_regkind[op] == EXACT) {
13987 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13990 SvREFCNT_dec_NN(cp_list);
13995 /* Here, <cp_list> contains all the code points we can determine at
13996 * compile time that match under all conditions. Go through it, and
13997 * for things that belong in the bitmap, put them there, and delete from
13998 * <cp_list>. While we are at it, see if everything above 255 is in the
13999 * list, and if so, set a flag to speed up execution */
14000 ANYOF_BITMAP_ZERO(ret);
14003 /* This gets set if we actually need to modify things */
14004 bool change_invlist = FALSE;
14008 /* Start looking through <cp_list> */
14009 invlist_iterinit(cp_list);
14010 while (invlist_iternext(cp_list, &start, &end)) {
14014 if (end == UV_MAX && start <= 256) {
14015 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
14018 /* Quit if are above what we should change */
14023 change_invlist = TRUE;
14025 /* Set all the bits in the range, up to the max that we are doing */
14026 high = (end < 255) ? end : 255;
14027 for (i = start; i <= (int) high; i++) {
14028 if (! ANYOF_BITMAP_TEST(ret, i)) {
14029 ANYOF_BITMAP_SET(ret, i);
14033 invlist_iterfinish(cp_list);
14035 /* Done with loop; remove any code points that are in the bitmap from
14037 if (change_invlist) {
14038 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
14041 /* If have completely emptied it, remove it completely */
14042 if (_invlist_len(cp_list) == 0) {
14043 SvREFCNT_dec_NN(cp_list);
14049 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14052 /* Here, the bitmap has been populated with all the Latin1 code points that
14053 * always match. Can now add to the overall list those that match only
14054 * when the target string is UTF-8 (<depends_list>). */
14055 if (depends_list) {
14057 _invlist_union(cp_list, depends_list, &cp_list);
14058 SvREFCNT_dec_NN(depends_list);
14061 cp_list = depends_list;
14065 /* If there is a swash and more than one element, we can't use the swash in
14066 * the optimization below. */
14067 if (swash && element_count > 1) {
14068 SvREFCNT_dec_NN(swash);
14073 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14075 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14078 /* av[0] stores the character class description in its textual form:
14079 * used later (regexec.c:Perl_regclass_swash()) to initialize the
14080 * appropriate swash, and is also useful for dumping the regnode.
14081 * av[1] if NULL, is a placeholder to later contain the swash computed
14082 * from av[0]. But if no further computation need be done, the
14083 * swash is stored there now.
14084 * av[2] stores the cp_list inversion list for use in addition or
14085 * instead of av[0]; used only if av[1] is NULL
14086 * av[3] is set if any component of the class is from a user-defined
14087 * property; used only if av[1] is NULL */
14088 AV * const av = newAV();
14091 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14092 ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14094 av_store(av, 1, swash);
14095 SvREFCNT_dec_NN(cp_list);
14098 av_store(av, 1, NULL);
14100 av_store(av, 2, cp_list);
14101 av_store(av, 3, newSVuv(has_user_defined_property));
14105 rv = newRV_noinc(MUTABLE_SV(av));
14106 n = add_data(pRExC_state, 1, "s");
14107 RExC_rxi->data->data[n] = (void*)rv;
14111 *flagp |= HASWIDTH|SIMPLE;
14114 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14117 /* reg_skipcomment()
14119 Absorbs an /x style # comments from the input stream.
14120 Returns true if there is more text remaining in the stream.
14121 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14122 terminates the pattern without including a newline.
14124 Note its the callers responsibility to ensure that we are
14125 actually in /x mode
14130 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14134 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14136 while (RExC_parse < RExC_end)
14137 if (*RExC_parse++ == '\n') {
14142 /* we ran off the end of the pattern without ending
14143 the comment, so we have to add an \n when wrapping */
14144 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14152 Advances the parse position, and optionally absorbs
14153 "whitespace" from the inputstream.
14155 Without /x "whitespace" means (?#...) style comments only,
14156 with /x this means (?#...) and # comments and whitespace proper.
14158 Returns the RExC_parse point from BEFORE the scan occurs.
14160 This is the /x friendly way of saying RExC_parse++.
14164 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14166 char* const retval = RExC_parse++;
14168 PERL_ARGS_ASSERT_NEXTCHAR;
14171 if (RExC_end - RExC_parse >= 3
14172 && *RExC_parse == '('
14173 && RExC_parse[1] == '?'
14174 && RExC_parse[2] == '#')
14176 while (*RExC_parse != ')') {
14177 if (RExC_parse == RExC_end)
14178 FAIL("Sequence (?#... not terminated");
14184 if (RExC_flags & RXf_PMf_EXTENDED) {
14185 if (isSPACE(*RExC_parse)) {
14189 else if (*RExC_parse == '#') {
14190 if ( reg_skipcomment( pRExC_state ) )
14199 - reg_node - emit a node
14201 STATIC regnode * /* Location. */
14202 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14206 regnode * const ret = RExC_emit;
14207 GET_RE_DEBUG_FLAGS_DECL;
14209 PERL_ARGS_ASSERT_REG_NODE;
14212 SIZE_ALIGN(RExC_size);
14216 if (RExC_emit >= RExC_emit_bound)
14217 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14218 op, RExC_emit, RExC_emit_bound);
14220 NODE_ALIGN_FILL(ret);
14222 FILL_ADVANCE_NODE(ptr, op);
14223 #ifdef RE_TRACK_PATTERN_OFFSETS
14224 if (RExC_offsets) { /* MJD */
14225 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14226 "reg_node", __LINE__,
14228 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14229 ? "Overwriting end of array!\n" : "OK",
14230 (UV)(RExC_emit - RExC_emit_start),
14231 (UV)(RExC_parse - RExC_start),
14232 (UV)RExC_offsets[0]));
14233 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14241 - reganode - emit a node with an argument
14243 STATIC regnode * /* Location. */
14244 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14248 regnode * const ret = RExC_emit;
14249 GET_RE_DEBUG_FLAGS_DECL;
14251 PERL_ARGS_ASSERT_REGANODE;
14254 SIZE_ALIGN(RExC_size);
14259 assert(2==regarglen[op]+1);
14261 Anything larger than this has to allocate the extra amount.
14262 If we changed this to be:
14264 RExC_size += (1 + regarglen[op]);
14266 then it wouldn't matter. Its not clear what side effect
14267 might come from that so its not done so far.
14272 if (RExC_emit >= RExC_emit_bound)
14273 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14274 op, RExC_emit, RExC_emit_bound);
14276 NODE_ALIGN_FILL(ret);
14278 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14279 #ifdef RE_TRACK_PATTERN_OFFSETS
14280 if (RExC_offsets) { /* MJD */
14281 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14285 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14286 "Overwriting end of array!\n" : "OK",
14287 (UV)(RExC_emit - RExC_emit_start),
14288 (UV)(RExC_parse - RExC_start),
14289 (UV)RExC_offsets[0]));
14290 Set_Cur_Node_Offset;
14298 - reguni - emit (if appropriate) a Unicode character
14301 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14305 PERL_ARGS_ASSERT_REGUNI;
14307 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14311 - reginsert - insert an operator in front of already-emitted operand
14313 * Means relocating the operand.
14316 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14322 const int offset = regarglen[(U8)op];
14323 const int size = NODE_STEP_REGNODE + offset;
14324 GET_RE_DEBUG_FLAGS_DECL;
14326 PERL_ARGS_ASSERT_REGINSERT;
14327 PERL_UNUSED_ARG(depth);
14328 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14329 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14338 if (RExC_open_parens) {
14340 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14341 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14342 if ( RExC_open_parens[paren] >= opnd ) {
14343 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14344 RExC_open_parens[paren] += size;
14346 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14348 if ( RExC_close_parens[paren] >= opnd ) {
14349 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14350 RExC_close_parens[paren] += size;
14352 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14357 while (src > opnd) {
14358 StructCopy(--src, --dst, regnode);
14359 #ifdef RE_TRACK_PATTERN_OFFSETS
14360 if (RExC_offsets) { /* MJD 20010112 */
14361 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14365 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14366 ? "Overwriting end of array!\n" : "OK",
14367 (UV)(src - RExC_emit_start),
14368 (UV)(dst - RExC_emit_start),
14369 (UV)RExC_offsets[0]));
14370 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14371 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14377 place = opnd; /* Op node, where operand used to be. */
14378 #ifdef RE_TRACK_PATTERN_OFFSETS
14379 if (RExC_offsets) { /* MJD */
14380 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14384 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14385 ? "Overwriting end of array!\n" : "OK",
14386 (UV)(place - RExC_emit_start),
14387 (UV)(RExC_parse - RExC_start),
14388 (UV)RExC_offsets[0]));
14389 Set_Node_Offset(place, RExC_parse);
14390 Set_Node_Length(place, 1);
14393 src = NEXTOPER(place);
14394 FILL_ADVANCE_NODE(place, op);
14395 Zero(src, offset, regnode);
14399 - regtail - set the next-pointer at the end of a node chain of p to val.
14400 - SEE ALSO: regtail_study
14402 /* TODO: All three parms should be const */
14404 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14408 GET_RE_DEBUG_FLAGS_DECL;
14410 PERL_ARGS_ASSERT_REGTAIL;
14412 PERL_UNUSED_ARG(depth);
14418 /* Find last node. */
14421 regnode * const temp = regnext(scan);
14423 SV * const mysv=sv_newmortal();
14424 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14425 regprop(RExC_rx, mysv, scan);
14426 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14427 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14428 (temp == NULL ? "->" : ""),
14429 (temp == NULL ? PL_reg_name[OP(val)] : "")
14437 if (reg_off_by_arg[OP(scan)]) {
14438 ARG_SET(scan, val - scan);
14441 NEXT_OFF(scan) = val - scan;
14447 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14448 - Look for optimizable sequences at the same time.
14449 - currently only looks for EXACT chains.
14451 This is experimental code. The idea is to use this routine to perform
14452 in place optimizations on branches and groups as they are constructed,
14453 with the long term intention of removing optimization from study_chunk so
14454 that it is purely analytical.
14456 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14457 to control which is which.
14460 /* TODO: All four parms should be const */
14463 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14468 #ifdef EXPERIMENTAL_INPLACESCAN
14471 GET_RE_DEBUG_FLAGS_DECL;
14473 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14479 /* Find last node. */
14483 regnode * const temp = regnext(scan);
14484 #ifdef EXPERIMENTAL_INPLACESCAN
14485 if (PL_regkind[OP(scan)] == EXACT) {
14486 bool has_exactf_sharp_s; /* Unexamined in this routine */
14487 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14492 switch (OP(scan)) {
14495 case EXACTFA_NO_TRIE:
14500 if( exact == PSEUDO )
14502 else if ( exact != OP(scan) )
14511 SV * const mysv=sv_newmortal();
14512 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14513 regprop(RExC_rx, mysv, scan);
14514 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14515 SvPV_nolen_const(mysv),
14516 REG_NODE_NUM(scan),
14517 PL_reg_name[exact]);
14524 SV * const mysv_val=sv_newmortal();
14525 DEBUG_PARSE_MSG("");
14526 regprop(RExC_rx, mysv_val, val);
14527 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14528 SvPV_nolen_const(mysv_val),
14529 (IV)REG_NODE_NUM(val),
14533 if (reg_off_by_arg[OP(scan)]) {
14534 ARG_SET(scan, val - scan);
14537 NEXT_OFF(scan) = val - scan;
14545 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14550 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14555 for (bit=0; bit<32; bit++) {
14556 if (flags & (1<<bit)) {
14557 if (!set++ && lead)
14558 PerlIO_printf(Perl_debug_log, "%s",lead);
14559 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14564 PerlIO_printf(Perl_debug_log, "\n");
14566 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14571 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14577 for (bit=0; bit<32; bit++) {
14578 if (flags & (1<<bit)) {
14579 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14582 if (!set++ && lead)
14583 PerlIO_printf(Perl_debug_log, "%s",lead);
14584 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14587 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14588 if (!set++ && lead) {
14589 PerlIO_printf(Perl_debug_log, "%s",lead);
14592 case REGEX_UNICODE_CHARSET:
14593 PerlIO_printf(Perl_debug_log, "UNICODE");
14595 case REGEX_LOCALE_CHARSET:
14596 PerlIO_printf(Perl_debug_log, "LOCALE");
14598 case REGEX_ASCII_RESTRICTED_CHARSET:
14599 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14601 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14602 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14605 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14611 PerlIO_printf(Perl_debug_log, "\n");
14613 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14619 Perl_regdump(pTHX_ const regexp *r)
14623 SV * const sv = sv_newmortal();
14624 SV *dsv= sv_newmortal();
14625 RXi_GET_DECL(r,ri);
14626 GET_RE_DEBUG_FLAGS_DECL;
14628 PERL_ARGS_ASSERT_REGDUMP;
14630 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14632 /* Header fields of interest. */
14633 if (r->anchored_substr) {
14634 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14635 RE_SV_DUMPLEN(r->anchored_substr), 30);
14636 PerlIO_printf(Perl_debug_log,
14637 "anchored %s%s at %"IVdf" ",
14638 s, RE_SV_TAIL(r->anchored_substr),
14639 (IV)r->anchored_offset);
14640 } else if (r->anchored_utf8) {
14641 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14642 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14643 PerlIO_printf(Perl_debug_log,
14644 "anchored utf8 %s%s at %"IVdf" ",
14645 s, RE_SV_TAIL(r->anchored_utf8),
14646 (IV)r->anchored_offset);
14648 if (r->float_substr) {
14649 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14650 RE_SV_DUMPLEN(r->float_substr), 30);
14651 PerlIO_printf(Perl_debug_log,
14652 "floating %s%s at %"IVdf"..%"UVuf" ",
14653 s, RE_SV_TAIL(r->float_substr),
14654 (IV)r->float_min_offset, (UV)r->float_max_offset);
14655 } else if (r->float_utf8) {
14656 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14657 RE_SV_DUMPLEN(r->float_utf8), 30);
14658 PerlIO_printf(Perl_debug_log,
14659 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14660 s, RE_SV_TAIL(r->float_utf8),
14661 (IV)r->float_min_offset, (UV)r->float_max_offset);
14663 if (r->check_substr || r->check_utf8)
14664 PerlIO_printf(Perl_debug_log,
14666 (r->check_substr == r->float_substr
14667 && r->check_utf8 == r->float_utf8
14668 ? "(checking floating" : "(checking anchored"));
14669 if (r->extflags & RXf_NOSCAN)
14670 PerlIO_printf(Perl_debug_log, " noscan");
14671 if (r->extflags & RXf_CHECK_ALL)
14672 PerlIO_printf(Perl_debug_log, " isall");
14673 if (r->check_substr || r->check_utf8)
14674 PerlIO_printf(Perl_debug_log, ") ");
14676 if (ri->regstclass) {
14677 regprop(r, sv, ri->regstclass);
14678 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14680 if (r->extflags & RXf_ANCH) {
14681 PerlIO_printf(Perl_debug_log, "anchored");
14682 if (r->extflags & RXf_ANCH_BOL)
14683 PerlIO_printf(Perl_debug_log, "(BOL)");
14684 if (r->extflags & RXf_ANCH_MBOL)
14685 PerlIO_printf(Perl_debug_log, "(MBOL)");
14686 if (r->extflags & RXf_ANCH_SBOL)
14687 PerlIO_printf(Perl_debug_log, "(SBOL)");
14688 if (r->extflags & RXf_ANCH_GPOS)
14689 PerlIO_printf(Perl_debug_log, "(GPOS)");
14690 PerlIO_putc(Perl_debug_log, ' ');
14692 if (r->extflags & RXf_GPOS_SEEN)
14693 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14694 if (r->intflags & PREGf_SKIP)
14695 PerlIO_printf(Perl_debug_log, "plus ");
14696 if (r->intflags & PREGf_IMPLICIT)
14697 PerlIO_printf(Perl_debug_log, "implicit ");
14698 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14699 if (r->extflags & RXf_EVAL_SEEN)
14700 PerlIO_printf(Perl_debug_log, "with eval ");
14701 PerlIO_printf(Perl_debug_log, "\n");
14703 regdump_extflags("r->extflags: ",r->extflags);
14704 regdump_intflags("r->intflags: ",r->intflags);
14707 PERL_ARGS_ASSERT_REGDUMP;
14708 PERL_UNUSED_CONTEXT;
14709 PERL_UNUSED_ARG(r);
14710 #endif /* DEBUGGING */
14714 - regprop - printable representation of opcode
14716 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14719 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14720 if (flags & ANYOF_INVERT) \
14721 /*make sure the invert info is in each */ \
14722 sv_catpvs(sv, "^"); \
14728 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14734 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14735 static const char * const anyofs[] = {
14736 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14737 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14738 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14739 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14740 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14741 || _CC_VERTSPACE != 16
14742 #error Need to adjust order of anyofs[]
14779 RXi_GET_DECL(prog,progi);
14780 GET_RE_DEBUG_FLAGS_DECL;
14782 PERL_ARGS_ASSERT_REGPROP;
14786 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14787 /* It would be nice to FAIL() here, but this may be called from
14788 regexec.c, and it would be hard to supply pRExC_state. */
14789 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14790 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14792 k = PL_regkind[OP(o)];
14795 sv_catpvs(sv, " ");
14796 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14797 * is a crude hack but it may be the best for now since
14798 * we have no flag "this EXACTish node was UTF-8"
14800 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14801 PERL_PV_ESCAPE_UNI_DETECT |
14802 PERL_PV_ESCAPE_NONASCII |
14803 PERL_PV_PRETTY_ELLIPSES |
14804 PERL_PV_PRETTY_LTGT |
14805 PERL_PV_PRETTY_NOCLEAR
14807 } else if (k == TRIE) {
14808 /* print the details of the trie in dumpuntil instead, as
14809 * progi->data isn't available here */
14810 const char op = OP(o);
14811 const U32 n = ARG(o);
14812 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14813 (reg_ac_data *)progi->data->data[n] :
14815 const reg_trie_data * const trie
14816 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14818 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14819 DEBUG_TRIE_COMPILE_r(
14820 Perl_sv_catpvf(aTHX_ sv,
14821 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14822 (UV)trie->startstate,
14823 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14824 (UV)trie->wordcount,
14827 (UV)TRIE_CHARCOUNT(trie),
14828 (UV)trie->uniquecharcount
14831 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14832 sv_catpvs(sv, "[");
14833 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14835 : TRIE_BITMAP(trie));
14836 sv_catpvs(sv, "]");
14839 } else if (k == CURLY) {
14840 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14841 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14842 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14844 else if (k == WHILEM && o->flags) /* Ordinal/of */
14845 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14846 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14847 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14848 if ( RXp_PAREN_NAMES(prog) ) {
14849 if ( k != REF || (OP(o) < NREF)) {
14850 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14851 SV **name= av_fetch(list, ARG(o), 0 );
14853 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14856 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14857 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14858 I32 *nums=(I32*)SvPVX(sv_dat);
14859 SV **name= av_fetch(list, nums[0], 0 );
14862 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14863 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14864 (n ? "," : ""), (IV)nums[n]);
14866 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14870 } else if (k == GOSUB)
14871 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14872 else if (k == VERB) {
14874 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14875 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14876 } else if (k == LOGICAL)
14877 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14878 else if (k == ANYOF) {
14879 const U8 flags = ANYOF_FLAGS(o);
14883 if (flags & ANYOF_LOCALE)
14884 sv_catpvs(sv, "{loc}");
14885 if (flags & ANYOF_LOC_FOLD)
14886 sv_catpvs(sv, "{i}");
14887 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14888 if (flags & ANYOF_INVERT)
14889 sv_catpvs(sv, "^");
14891 /* output what the standard cp 0-255 bitmap matches */
14892 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14894 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14895 /* output any special charclass tests (used entirely under use locale) */
14896 if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14898 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14899 if (ANYOF_CLASS_TEST(o,i)) {
14900 sv_catpv(sv, anyofs[i]);
14906 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14908 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14909 sv_catpvs(sv, "{non-utf8-latin1-all}");
14912 /* output information about the unicode matching */
14913 if (flags & ANYOF_UNICODE_ALL)
14914 sv_catpvs(sv, "{unicode_all}");
14915 else if (ANYOF_NONBITMAP(o)) {
14916 SV *lv; /* Set if there is something outside the bit map. */
14917 bool byte_output = FALSE; /* If something in the bitmap has been
14920 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14921 sv_catpvs(sv, "{outside bitmap}");
14924 sv_catpvs(sv, "{utf8}");
14927 /* Get the stuff that wasn't in the bitmap */
14928 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
14929 if (lv && lv != &PL_sv_undef) {
14930 char *s = savesvpv(lv);
14931 char * const origs = s;
14933 while (*s && *s != '\n')
14937 const char * const t = ++s;
14940 sv_catpvs(sv, " ");
14946 /* Truncate very long output */
14947 if (s - origs > 256) {
14948 Perl_sv_catpvf(aTHX_ sv,
14950 (int) (s - origs - 1),
14956 else if (*s == '\t') {
14970 SvREFCNT_dec_NN(lv);
14974 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14976 else if (k == POSIXD || k == NPOSIXD) {
14977 U8 index = FLAGS(o) * 2;
14978 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14979 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14982 sv_catpv(sv, anyofs[index]);
14985 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14986 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14988 PERL_UNUSED_CONTEXT;
14989 PERL_UNUSED_ARG(sv);
14990 PERL_UNUSED_ARG(o);
14991 PERL_UNUSED_ARG(prog);
14992 #endif /* DEBUGGING */
14996 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14997 { /* Assume that RE_INTUIT is set */
14999 struct regexp *const prog = ReANY(r);
15000 GET_RE_DEBUG_FLAGS_DECL;
15002 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15003 PERL_UNUSED_CONTEXT;
15007 const char * const s = SvPV_nolen_const(prog->check_substr
15008 ? prog->check_substr : prog->check_utf8);
15010 if (!PL_colorset) reginitcolors();
15011 PerlIO_printf(Perl_debug_log,
15012 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15014 prog->check_substr ? "" : "utf8 ",
15015 PL_colors[5],PL_colors[0],
15018 (strlen(s) > 60 ? "..." : ""));
15021 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15027 handles refcounting and freeing the perl core regexp structure. When
15028 it is necessary to actually free the structure the first thing it
15029 does is call the 'free' method of the regexp_engine associated to
15030 the regexp, allowing the handling of the void *pprivate; member
15031 first. (This routine is not overridable by extensions, which is why
15032 the extensions free is called first.)
15034 See regdupe and regdupe_internal if you change anything here.
15036 #ifndef PERL_IN_XSUB_RE
15038 Perl_pregfree(pTHX_ REGEXP *r)
15044 Perl_pregfree2(pTHX_ REGEXP *rx)
15047 struct regexp *const r = ReANY(rx);
15048 GET_RE_DEBUG_FLAGS_DECL;
15050 PERL_ARGS_ASSERT_PREGFREE2;
15052 if (r->mother_re) {
15053 ReREFCNT_dec(r->mother_re);
15055 CALLREGFREE_PVT(rx); /* free the private data */
15056 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15057 Safefree(r->xpv_len_u.xpvlenu_pv);
15060 SvREFCNT_dec(r->anchored_substr);
15061 SvREFCNT_dec(r->anchored_utf8);
15062 SvREFCNT_dec(r->float_substr);
15063 SvREFCNT_dec(r->float_utf8);
15064 Safefree(r->substrs);
15066 RX_MATCH_COPY_FREE(rx);
15067 #ifdef PERL_ANY_COW
15068 SvREFCNT_dec(r->saved_copy);
15071 SvREFCNT_dec(r->qr_anoncv);
15072 rx->sv_u.svu_rx = 0;
15077 This is a hacky workaround to the structural issue of match results
15078 being stored in the regexp structure which is in turn stored in
15079 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15080 could be PL_curpm in multiple contexts, and could require multiple
15081 result sets being associated with the pattern simultaneously, such
15082 as when doing a recursive match with (??{$qr})
15084 The solution is to make a lightweight copy of the regexp structure
15085 when a qr// is returned from the code executed by (??{$qr}) this
15086 lightweight copy doesn't actually own any of its data except for
15087 the starp/end and the actual regexp structure itself.
15093 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15095 struct regexp *ret;
15096 struct regexp *const r = ReANY(rx);
15097 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15099 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15102 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15104 SvOK_off((SV *)ret_x);
15106 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15107 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15108 made both spots point to the same regexp body.) */
15109 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15110 assert(!SvPVX(ret_x));
15111 ret_x->sv_u.svu_rx = temp->sv_any;
15112 temp->sv_any = NULL;
15113 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15114 SvREFCNT_dec_NN(temp);
15115 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15116 ing below will not set it. */
15117 SvCUR_set(ret_x, SvCUR(rx));
15120 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15121 sv_force_normal(sv) is called. */
15123 ret = ReANY(ret_x);
15125 SvFLAGS(ret_x) |= SvUTF8(rx);
15126 /* We share the same string buffer as the original regexp, on which we
15127 hold a reference count, incremented when mother_re is set below.
15128 The string pointer is copied here, being part of the regexp struct.
15130 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15131 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15133 const I32 npar = r->nparens+1;
15134 Newx(ret->offs, npar, regexp_paren_pair);
15135 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15138 Newx(ret->substrs, 1, struct reg_substr_data);
15139 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15141 SvREFCNT_inc_void(ret->anchored_substr);
15142 SvREFCNT_inc_void(ret->anchored_utf8);
15143 SvREFCNT_inc_void(ret->float_substr);
15144 SvREFCNT_inc_void(ret->float_utf8);
15146 /* check_substr and check_utf8, if non-NULL, point to either their
15147 anchored or float namesakes, and don't hold a second reference. */
15149 RX_MATCH_COPIED_off(ret_x);
15150 #ifdef PERL_ANY_COW
15151 ret->saved_copy = NULL;
15153 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15154 SvREFCNT_inc_void(ret->qr_anoncv);
15160 /* regfree_internal()
15162 Free the private data in a regexp. This is overloadable by
15163 extensions. Perl takes care of the regexp structure in pregfree(),
15164 this covers the *pprivate pointer which technically perl doesn't
15165 know about, however of course we have to handle the
15166 regexp_internal structure when no extension is in use.
15168 Note this is called before freeing anything in the regexp
15173 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15176 struct regexp *const r = ReANY(rx);
15177 RXi_GET_DECL(r,ri);
15178 GET_RE_DEBUG_FLAGS_DECL;
15180 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15186 SV *dsv= sv_newmortal();
15187 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15188 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15189 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15190 PL_colors[4],PL_colors[5],s);
15193 #ifdef RE_TRACK_PATTERN_OFFSETS
15195 Safefree(ri->u.offsets); /* 20010421 MJD */
15197 if (ri->code_blocks) {
15199 for (n = 0; n < ri->num_code_blocks; n++)
15200 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15201 Safefree(ri->code_blocks);
15205 int n = ri->data->count;
15208 /* If you add a ->what type here, update the comment in regcomp.h */
15209 switch (ri->data->what[n]) {
15215 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15218 Safefree(ri->data->data[n]);
15224 { /* Aho Corasick add-on structure for a trie node.
15225 Used in stclass optimization only */
15227 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15229 refcount = --aho->refcount;
15232 PerlMemShared_free(aho->states);
15233 PerlMemShared_free(aho->fail);
15234 /* do this last!!!! */
15235 PerlMemShared_free(ri->data->data[n]);
15236 PerlMemShared_free(ri->regstclass);
15242 /* trie structure. */
15244 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15246 refcount = --trie->refcount;
15249 PerlMemShared_free(trie->charmap);
15250 PerlMemShared_free(trie->states);
15251 PerlMemShared_free(trie->trans);
15253 PerlMemShared_free(trie->bitmap);
15255 PerlMemShared_free(trie->jump);
15256 PerlMemShared_free(trie->wordinfo);
15257 /* do this last!!!! */
15258 PerlMemShared_free(ri->data->data[n]);
15263 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15266 Safefree(ri->data->what);
15267 Safefree(ri->data);
15273 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15274 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15275 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15278 re_dup - duplicate a regexp.
15280 This routine is expected to clone a given regexp structure. It is only
15281 compiled under USE_ITHREADS.
15283 After all of the core data stored in struct regexp is duplicated
15284 the regexp_engine.dupe method is used to copy any private data
15285 stored in the *pprivate pointer. This allows extensions to handle
15286 any duplication it needs to do.
15288 See pregfree() and regfree_internal() if you change anything here.
15290 #if defined(USE_ITHREADS)
15291 #ifndef PERL_IN_XSUB_RE
15293 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15297 const struct regexp *r = ReANY(sstr);
15298 struct regexp *ret = ReANY(dstr);
15300 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15302 npar = r->nparens+1;
15303 Newx(ret->offs, npar, regexp_paren_pair);
15304 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15306 if (ret->substrs) {
15307 /* Do it this way to avoid reading from *r after the StructCopy().
15308 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15309 cache, it doesn't matter. */
15310 const bool anchored = r->check_substr
15311 ? r->check_substr == r->anchored_substr
15312 : r->check_utf8 == r->anchored_utf8;
15313 Newx(ret->substrs, 1, struct reg_substr_data);
15314 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15316 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15317 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15318 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15319 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15321 /* check_substr and check_utf8, if non-NULL, point to either their
15322 anchored or float namesakes, and don't hold a second reference. */
15324 if (ret->check_substr) {
15326 assert(r->check_utf8 == r->anchored_utf8);
15327 ret->check_substr = ret->anchored_substr;
15328 ret->check_utf8 = ret->anchored_utf8;
15330 assert(r->check_substr == r->float_substr);
15331 assert(r->check_utf8 == r->float_utf8);
15332 ret->check_substr = ret->float_substr;
15333 ret->check_utf8 = ret->float_utf8;
15335 } else if (ret->check_utf8) {
15337 ret->check_utf8 = ret->anchored_utf8;
15339 ret->check_utf8 = ret->float_utf8;
15344 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15345 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15348 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15350 if (RX_MATCH_COPIED(dstr))
15351 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15353 ret->subbeg = NULL;
15354 #ifdef PERL_ANY_COW
15355 ret->saved_copy = NULL;
15358 /* Whether mother_re be set or no, we need to copy the string. We
15359 cannot refrain from copying it when the storage points directly to
15360 our mother regexp, because that's
15361 1: a buffer in a different thread
15362 2: something we no longer hold a reference on
15363 so we need to copy it locally. */
15364 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15365 ret->mother_re = NULL;
15367 #endif /* PERL_IN_XSUB_RE */
15372 This is the internal complement to regdupe() which is used to copy
15373 the structure pointed to by the *pprivate pointer in the regexp.
15374 This is the core version of the extension overridable cloning hook.
15375 The regexp structure being duplicated will be copied by perl prior
15376 to this and will be provided as the regexp *r argument, however
15377 with the /old/ structures pprivate pointer value. Thus this routine
15378 may override any copying normally done by perl.
15380 It returns a pointer to the new regexp_internal structure.
15384 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15387 struct regexp *const r = ReANY(rx);
15388 regexp_internal *reti;
15390 RXi_GET_DECL(r,ri);
15392 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15396 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15397 Copy(ri->program, reti->program, len+1, regnode);
15399 reti->num_code_blocks = ri->num_code_blocks;
15400 if (ri->code_blocks) {
15402 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15403 struct reg_code_block);
15404 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15405 struct reg_code_block);
15406 for (n = 0; n < ri->num_code_blocks; n++)
15407 reti->code_blocks[n].src_regex = (REGEXP*)
15408 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15411 reti->code_blocks = NULL;
15413 reti->regstclass = NULL;
15416 struct reg_data *d;
15417 const int count = ri->data->count;
15420 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15421 char, struct reg_data);
15422 Newx(d->what, count, U8);
15425 for (i = 0; i < count; i++) {
15426 d->what[i] = ri->data->what[i];
15427 switch (d->what[i]) {
15428 /* see also regcomp.h and regfree_internal() */
15429 case 'a': /* actually an AV, but the dup function is identical. */
15433 case 'u': /* actually an HV, but the dup function is identical. */
15434 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15437 /* This is cheating. */
15438 Newx(d->data[i], 1, struct regnode_charclass_class);
15439 StructCopy(ri->data->data[i], d->data[i],
15440 struct regnode_charclass_class);
15441 reti->regstclass = (regnode*)d->data[i];
15444 /* Trie stclasses are readonly and can thus be shared
15445 * without duplication. We free the stclass in pregfree
15446 * when the corresponding reg_ac_data struct is freed.
15448 reti->regstclass= ri->regstclass;
15452 ((reg_trie_data*)ri->data->data[i])->refcount++;
15457 d->data[i] = ri->data->data[i];
15460 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15469 reti->name_list_idx = ri->name_list_idx;
15471 #ifdef RE_TRACK_PATTERN_OFFSETS
15472 if (ri->u.offsets) {
15473 Newx(reti->u.offsets, 2*len+1, U32);
15474 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15477 SetProgLen(reti,len);
15480 return (void*)reti;
15483 #endif /* USE_ITHREADS */
15485 #ifndef PERL_IN_XSUB_RE
15488 - regnext - dig the "next" pointer out of a node
15491 Perl_regnext(pTHX_ regnode *p)
15499 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15500 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15503 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15512 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
15515 STRLEN l1 = strlen(pat1);
15516 STRLEN l2 = strlen(pat2);
15519 const char *message;
15521 PERL_ARGS_ASSERT_RE_CROAK2;
15527 Copy(pat1, buf, l1 , char);
15528 Copy(pat2, buf + l1, l2 , char);
15529 buf[l1 + l2] = '\n';
15530 buf[l1 + l2 + 1] = '\0';
15532 /* ANSI variant takes additional second argument */
15533 va_start(args, pat2);
15537 msv = vmess(buf, &args);
15539 message = SvPV_const(msv,l1);
15542 Copy(message, buf, l1 , char);
15543 /* l1-1 to avoid \n */
15544 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
15547 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15549 #ifndef PERL_IN_XSUB_RE
15551 Perl_save_re_context(pTHX)
15555 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15557 const REGEXP * const rx = PM_GETRE(PL_curpm);
15560 for (i = 1; i <= RX_NPARENS(rx); i++) {
15561 char digits[TYPE_CHARS(long)];
15562 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15563 GV *const *const gvp
15564 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15567 GV * const gv = *gvp;
15568 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15580 S_put_byte(pTHX_ SV *sv, int c)
15582 PERL_ARGS_ASSERT_PUT_BYTE;
15584 /* Our definition of isPRINT() ignores locales, so only bytes that are
15585 not part of UTF-8 are considered printable. I assume that the same
15586 holds for UTF-EBCDIC.
15587 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15588 which Wikipedia says:
15590 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15591 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15592 identical, to the ASCII delete (DEL) or rubout control character. ...
15593 it is typically mapped to hexadecimal code 9F, in order to provide a
15594 unique character mapping in both directions)
15596 So the old condition can be simplified to !isPRINT(c) */
15599 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15600 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15601 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15602 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15603 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15606 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15611 const char string = c;
15612 if (c == '-' || c == ']' || c == '\\' || c == '^')
15613 sv_catpvs(sv, "\\");
15614 sv_catpvn(sv, &string, 1);
15619 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15621 /* Appends to 'sv' a displayable version of the innards of the bracketed
15622 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
15623 * output anything */
15626 int rangestart = -1;
15627 bool has_output_anything = FALSE;
15629 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15631 for (i = 0; i <= 256; i++) {
15632 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15633 if (rangestart == -1)
15635 } else if (rangestart != -1) {
15637 if (i <= rangestart + 3) { /* Individual chars in short ranges */
15638 for (; rangestart < i; rangestart++)
15639 put_byte(sv, rangestart);
15642 || ! isALPHANUMERIC(rangestart)
15643 || ! isALPHANUMERIC(j)
15644 || isDIGIT(rangestart) != isDIGIT(j)
15645 || isUPPER(rangestart) != isUPPER(j)
15646 || isLOWER(rangestart) != isLOWER(j)
15648 /* This final test should get optimized out except
15649 * on EBCDIC platforms, where it causes ranges that
15650 * cross discontinuities like i/j to be shown as hex
15651 * instead of the misleading, e.g. H-K (since that
15652 * range includes more than H, I, J, K). */
15653 || (j - rangestart)
15654 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15656 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15658 (j < 256) ? j : 255);
15660 else { /* Here, the ends of the range are both digits, or both
15661 uppercase, or both lowercase; and there's no
15662 discontinuity in the range (which could happen on EBCDIC
15664 put_byte(sv, rangestart);
15665 sv_catpvs(sv, "-");
15669 has_output_anything = TRUE;
15673 return has_output_anything;
15676 #define CLEAR_OPTSTART \
15677 if (optstart) STMT_START { \
15678 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15682 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15684 STATIC const regnode *
15685 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15686 const regnode *last, const regnode *plast,
15687 SV* sv, I32 indent, U32 depth)
15690 U8 op = PSEUDO; /* Arbitrary non-END op. */
15691 const regnode *next;
15692 const regnode *optstart= NULL;
15694 RXi_GET_DECL(r,ri);
15695 GET_RE_DEBUG_FLAGS_DECL;
15697 PERL_ARGS_ASSERT_DUMPUNTIL;
15699 #ifdef DEBUG_DUMPUNTIL
15700 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15701 last ? last-start : 0,plast ? plast-start : 0);
15704 if (plast && plast < last)
15707 while (PL_regkind[op] != END && (!last || node < last)) {
15708 /* While that wasn't END last time... */
15711 if (op == CLOSE || op == WHILEM)
15713 next = regnext((regnode *)node);
15716 if (OP(node) == OPTIMIZED) {
15717 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15724 regprop(r, sv, node);
15725 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15726 (int)(2*indent + 1), "", SvPVX_const(sv));
15728 if (OP(node) != OPTIMIZED) {
15729 if (next == NULL) /* Next ptr. */
15730 PerlIO_printf(Perl_debug_log, " (0)");
15731 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15732 PerlIO_printf(Perl_debug_log, " (FAIL)");
15734 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15735 (void)PerlIO_putc(Perl_debug_log, '\n');
15739 if (PL_regkind[(U8)op] == BRANCHJ) {
15742 const regnode *nnode = (OP(next) == LONGJMP
15743 ? regnext((regnode *)next)
15745 if (last && nnode > last)
15747 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15750 else if (PL_regkind[(U8)op] == BRANCH) {
15752 DUMPUNTIL(NEXTOPER(node), next);
15754 else if ( PL_regkind[(U8)op] == TRIE ) {
15755 const regnode *this_trie = node;
15756 const char op = OP(node);
15757 const U32 n = ARG(node);
15758 const reg_ac_data * const ac = op>=AHOCORASICK ?
15759 (reg_ac_data *)ri->data->data[n] :
15761 const reg_trie_data * const trie =
15762 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15764 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15766 const regnode *nextbranch= NULL;
15769 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15770 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15772 PerlIO_printf(Perl_debug_log, "%*s%s ",
15773 (int)(2*(indent+3)), "",
15774 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15775 PL_colors[0], PL_colors[1],
15776 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15777 PERL_PV_PRETTY_ELLIPSES |
15778 PERL_PV_PRETTY_LTGT
15783 U16 dist= trie->jump[word_idx+1];
15784 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15785 (UV)((dist ? this_trie + dist : next) - start));
15788 nextbranch= this_trie + trie->jump[0];
15789 DUMPUNTIL(this_trie + dist, nextbranch);
15791 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15792 nextbranch= regnext((regnode *)nextbranch);
15794 PerlIO_printf(Perl_debug_log, "\n");
15797 if (last && next > last)
15802 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15803 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15804 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15806 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15808 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15810 else if ( op == PLUS || op == STAR) {
15811 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15813 else if (PL_regkind[(U8)op] == ANYOF) {
15814 /* arglen 1 + class block */
15815 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15816 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15817 node = NEXTOPER(node);
15819 else if (PL_regkind[(U8)op] == EXACT) {
15820 /* Literal string, where present. */
15821 node += NODE_SZ_STR(node) - 1;
15822 node = NEXTOPER(node);
15825 node = NEXTOPER(node);
15826 node += regarglen[(U8)op];
15828 if (op == CURLYX || op == OPEN)
15832 #ifdef DEBUG_DUMPUNTIL
15833 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15838 #endif /* DEBUGGING */
15842 * c-indentation-style: bsd
15843 * c-basic-offset: 4
15844 * indent-tabs-mode: nil
15847 * ex: set ts=8 sts=4 sw=4 et: